Diffstat (limited to 'fs')
-rw-r--r-- fs/9p/vfs_inode.c | 7
-rw-r--r-- fs/9p/vfs_inode_dotl.c | 8
-rw-r--r-- fs/Kconfig | 1
-rw-r--r-- fs/Makefile | 1
-rw-r--r-- fs/affs/affs.h | 2
-rw-r--r-- fs/affs/namei.c | 8
-rw-r--r-- fs/afs/addr_list.c | 50
-rw-r--r-- fs/afs/cell.c | 437
-rw-r--r-- fs/afs/cmservice.c | 82
-rw-r--r-- fs/afs/dir.c | 17
-rw-r--r-- fs/afs/dynroot.c | 492
-rw-r--r-- fs/afs/fs_probe.c | 32
-rw-r--r-- fs/afs/fsclient.c | 4
-rw-r--r-- fs/afs/internal.h | 98
-rw-r--r-- fs/afs/main.c | 16
-rw-r--r-- fs/afs/mntpt.c | 5
-rw-r--r-- fs/afs/proc.c | 15
-rw-r--r-- fs/afs/rxrpc.c | 8
-rw-r--r-- fs/afs/server.c | 601
-rw-r--r-- fs/afs/server_list.c | 6
-rw-r--r-- fs/afs/super.c | 25
-rw-r--r-- fs/afs/vl_alias.c | 7
-rw-r--r-- fs/afs/vl_rotate.c | 2
-rw-r--r-- fs/afs/volume.c | 15
-rw-r--r-- fs/autofs/autofs_i.h | 2
-rw-r--r-- fs/autofs/dev-ioctl.c | 3
-rw-r--r-- fs/autofs/root.c | 14
-rw-r--r-- fs/bad_inode.c | 6
-rw-r--r-- fs/bcachefs/fs-ioctl.c | 4
-rw-r--r-- fs/bcachefs/fs.c | 8
-rw-r--r-- fs/binfmt_elf.c | 21
-rw-r--r-- fs/binfmt_elf_fdpic.c | 13
-rw-r--r-- fs/btrfs/inode.c | 8
-rw-r--r-- fs/buffer.c | 58
-rw-r--r-- fs/cachefiles/namei.c | 16
-rw-r--r-- fs/cachefiles/ondemand.c | 7
-rw-r--r-- fs/ceph/addr.c | 1259
-rw-r--r-- fs/ceph/dir.c | 45
-rw-r--r-- fs/ceph/inode.c | 31
-rw-r--r-- fs/ceph/mds_client.c | 2
-rw-r--r-- fs/ceph/mds_client.h | 3
-rw-r--r-- fs/ceph/super.c | 11
-rw-r--r-- fs/ceph/super.h | 2
-rw-r--r-- fs/coda/dir.c | 14
-rw-r--r-- fs/configfs/dir.c | 6
-rw-r--r-- fs/coredump.c | 38
-rw-r--r-- fs/crypto/crypto.c | 22
-rw-r--r-- fs/dax.c | 111
-rw-r--r-- fs/dcache.c | 49
-rw-r--r-- fs/devpts/inode.c | 251
-rw-r--r-- fs/ecryptfs/inode.c | 20
-rw-r--r-- fs/ecryptfs/super.c | 1
-rw-r--r-- fs/efivarfs/super.c | 52
-rw-r--r-- fs/eventfd.c | 5
-rw-r--r-- fs/eventpoll.c | 95
-rw-r--r-- fs/exec.c | 2
-rw-r--r-- fs/exfat/namei.c | 8
-rw-r--r-- fs/exportfs/expfs.c | 2
-rw-r--r-- fs/ext2/namei.c | 9
-rw-r--r-- fs/ext4/inode.c | 4
-rw-r--r-- fs/ext4/namei.c | 10
-rw-r--r-- fs/ext4/page-io.c | 2
-rw-r--r-- fs/f2fs/data.c | 2
-rw-r--r-- fs/f2fs/namei.c | 14
-rw-r--r-- fs/fat/namei_msdos.c | 8
-rw-r--r-- fs/fat/namei_vfat.c | 8
-rw-r--r-- fs/file.c | 133
-rw-r--r-- fs/file_table.c | 73
-rw-r--r-- fs/fsopen.c | 2
-rw-r--r-- fs/fuse/dev.c | 2
-rw-r--r-- fs/fuse/dev_uring.c | 4
-rw-r--r-- fs/fuse/dir.c | 50
-rw-r--r-- fs/gfs2/bmap.c | 3
-rw-r--r-- fs/gfs2/inode.c | 9
-rw-r--r-- fs/hfs/dir.c | 10
-rw-r--r-- fs/hfsplus/dir.c | 6
-rw-r--r-- fs/hostfs/hostfs_kern.c | 16
-rw-r--r-- fs/hpfs/namei.c | 10
-rw-r--r-- fs/hugetlbfs/inode.c | 6
-rw-r--r-- fs/init.c | 7
-rw-r--r-- fs/inode.c | 127
-rw-r--r-- fs/internal.h | 11
-rw-r--r-- fs/ioctl.c | 10
-rw-r--r-- fs/iomap/Makefile | 1
-rw-r--r-- fs/iomap/buffered-io.c | 356
-rw-r--r-- fs/iomap/direct-io.c | 279
-rw-r--r-- fs/iomap/fiemap.c | 21
-rw-r--r-- fs/iomap/internal.h | 10
-rw-r--r-- fs/iomap/ioend.c | 216
-rw-r--r-- fs/iomap/iter.c | 97
-rw-r--r-- fs/iomap/seek.c | 16
-rw-r--r-- fs/iomap/swapfile.c | 7
-rw-r--r-- fs/iomap/trace.h | 8
-rw-r--r-- fs/jffs2/dir.c | 18
-rw-r--r-- fs/jfs/namei.c | 8
-rw-r--r-- fs/kernfs/dir.c | 12
-rw-r--r-- fs/libfs.c | 6
-rw-r--r-- fs/minix/namei.c | 8
-rw-r--r-- fs/mnt_idmapping.c | 51
-rw-r--r-- fs/mount.h | 37
-rw-r--r-- fs/mpage.c | 49
-rw-r--r-- fs/namei.c | 149
-rw-r--r-- fs/namespace.c | 852
-rw-r--r-- fs/netfs/direct_read.c | 6
-rw-r--r-- fs/netfs/read_collect.c | 18
-rw-r--r-- fs/netfs/rolling_buffer.c | 4
-rw-r--r-- fs/netfs/write_collect.c | 3
-rw-r--r-- fs/nfs/dir.c | 20
-rw-r--r-- fs/nfs/internal.h | 4
-rw-r--r-- fs/nfs/nfs3proc.c | 29
-rw-r--r-- fs/nfs/nfs4proc.c | 47
-rw-r--r-- fs/nfs/proc.c | 12
-rw-r--r-- fs/nfsd/nfs4recover.c | 7
-rw-r--r-- fs/nfsd/vfs.c | 34
-rw-r--r-- fs/nilfs2/namei.c | 8
-rw-r--r-- fs/notify/fanotify/fanotify.c | 38
-rw-r--r-- fs/notify/fanotify/fanotify.h | 18
-rw-r--r-- fs/notify/fanotify/fanotify_user.c | 89
-rw-r--r-- fs/notify/fdinfo.c | 5
-rw-r--r-- fs/notify/fsnotify.c | 47
-rw-r--r-- fs/notify/fsnotify.h | 11
-rw-r--r-- fs/notify/mark.c | 14
-rw-r--r-- fs/nsfs.c | 32
-rw-r--r-- fs/ntfs3/namei.c | 8
-rw-r--r-- fs/ocfs2/dlmfs/dlmfs.c | 10
-rw-r--r-- fs/ocfs2/namei.c | 10
-rw-r--r-- fs/omfs/dir.c | 6
-rw-r--r-- fs/open.c | 44
-rw-r--r-- fs/orangefs/file.c | 4
-rw-r--r-- fs/orangefs/inode.c | 149
-rw-r--r-- fs/orangefs/namei.c | 8
-rw-r--r-- fs/orangefs/orangefs-debug.h | 43
-rw-r--r-- fs/orangefs/orangefs-debugfs.c | 43
-rw-r--r-- fs/overlayfs/dir.c | 46
-rw-r--r-- fs/overlayfs/overlayfs.h | 15
-rw-r--r-- fs/overlayfs/params.c | 25
-rw-r--r-- fs/overlayfs/super.c | 23
-rw-r--r-- fs/pidfs.c | 247
-rw-r--r-- fs/pipe.c | 181
-rw-r--r-- fs/pnode.c | 14
-rw-r--r-- fs/pnode.h | 2
-rw-r--r-- fs/proc/base.c | 55
-rw-r--r-- fs/proc/generic.c | 10
-rw-r--r-- fs/proc/inode.c | 6
-rw-r--r-- fs/proc/internal.h | 14
-rw-r--r-- fs/proc/kcore.c | 12
-rw-r--r-- fs/pstore/inode.c | 111
-rw-r--r-- fs/pstore/internal.h | 4
-rw-r--r-- fs/pstore/platform.c | 11
-rw-r--r-- fs/ramfs/inode.c | 6
-rw-r--r-- fs/read_write.c | 13
-rw-r--r-- fs/signalfd.c | 7
-rw-r--r-- fs/smb/client/cifsfs.h | 4
-rw-r--r-- fs/smb/client/cifssmb.c | 46
-rw-r--r-- fs/smb/client/file.c | 2
-rw-r--r-- fs/smb/client/inode.c | 10
-rw-r--r-- fs/smb/client/smb2pdu.c | 96
-rw-r--r-- fs/smb/server/vfs.c | 58
-rw-r--r-- fs/splice.c | 40
-rw-r--r-- fs/squashfs/cache.c | 2
-rw-r--r-- fs/super.c | 57
-rw-r--r-- fs/sysv/Kconfig | 38
-rw-r--r-- fs/sysv/Makefile | 9
-rw-r--r-- fs/sysv/balloc.c | 240
-rw-r--r-- fs/sysv/dir.c | 378
-rw-r--r-- fs/sysv/file.c | 59
-rw-r--r-- fs/sysv/ialloc.c | 235
-rw-r--r-- fs/sysv/inode.c | 354
-rw-r--r-- fs/sysv/itree.c | 511
-rw-r--r-- fs/sysv/namei.c | 280
-rw-r--r-- fs/sysv/super.c | 595
-rw-r--r-- fs/sysv/sysv.h | 245
-rw-r--r-- fs/timerfd.c | 11
-rw-r--r-- fs/tracefs/inode.c | 10
-rw-r--r-- fs/ubifs/dir.c | 10
-rw-r--r-- fs/ubifs/io.c | 3
-rw-r--r-- fs/udf/namei.c | 12
-rw-r--r-- fs/ufs/namei.c | 8
-rw-r--r-- fs/unicode/Kconfig | 5
-rw-r--r-- fs/unicode/Makefile | 2
-rw-r--r-- fs/unicode/tests/.kunitconfig | 3
-rw-r--r-- fs/unicode/tests/utf8_kunit.c (renamed from fs/unicode/utf8-selftest.c) | 153
-rw-r--r-- fs/unicode/utf8-norm.c | 2
-rw-r--r-- fs/vboxsf/dir.c | 8
-rw-r--r-- fs/xfs/scrub/orphanage.c | 9
-rw-r--r-- fs/xfs/xfs_aops.c | 25
-rw-r--r-- fs/xfs/xfs_file.c | 6
-rw-r--r-- fs/xfs/xfs_iomap.c | 8
-rw-r--r-- fs/xfs/xfs_iops.c | 4
-rw-r--r-- fs/xfs/xfs_super.c | 3
-rw-r--r-- fs/zonefs/file.c | 2
191 files changed, 5193 insertions, 6788 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 3e68521f4e2f..399d455d50d6 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -669,8 +669,8 @@ v9fs_vfs_create(struct mnt_idmap *idmap, struct inode *dir,
*
*/
-static int v9fs_vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *v9fs_vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int err;
u32 perm;
@@ -692,8 +692,7 @@ static int v9fs_vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
if (fid)
p9_fid_put(fid);
-
- return err;
+ return ERR_PTR(err);
}
/**
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 143ac03b7425..cc2007be2173 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -350,9 +350,9 @@ out:
*
*/
-static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
- struct inode *dir, struct dentry *dentry,
- umode_t omode)
+static struct dentry *v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
+ struct inode *dir, struct dentry *dentry,
+ umode_t omode)
{
int err;
struct v9fs_session_info *v9ses;
@@ -417,7 +417,7 @@ error:
p9_fid_put(fid);
v9fs_put_acl(dacl, pacl);
p9_fid_put(dfid);
- return err;
+ return ERR_PTR(err);
}
static int
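The 9p changes above follow the new VFS convention for ->mkdir, which now returns a struct dentry pointer instead of an int. A minimal sketch of the contract (do_backend_mkdir() is a hypothetical helper, not part of this patch):

	static int do_backend_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);

	static struct dentry *example_mkdir(struct mnt_idmap *idmap, struct inode *dir,
					    struct dentry *dentry, umode_t mode)
	{
		int err = do_backend_mkdir(dir, dentry, mode);	/* hypothetical backend call */

		if (err)
			return ERR_PTR(err);	/* failure: errno encoded in the pointer */
		return NULL;			/* success: keep using the dentry passed in */
	}

On success a filesystem may instead return a different, instantiated dentry for the VFS to use in place of the one passed in. Note that ERR_PTR(0) is NULL, which is why returning ERR_PTR(err) with err == 0 (as 9p and afs do) also signals plain success.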
diff --git a/fs/Kconfig b/fs/Kconfig
index 64d420e3c475..afe21866d6b4 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -336,7 +336,6 @@ source "fs/qnx4/Kconfig"
source "fs/qnx6/Kconfig"
source "fs/romfs/Kconfig"
source "fs/pstore/Kconfig"
-source "fs/sysv/Kconfig"
source "fs/ufs/Kconfig"
source "fs/erofs/Kconfig"
source "fs/vboxsf/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 15df0a923d3a..77fd7f7b5d02 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -87,7 +87,6 @@ obj-$(CONFIG_NFSD) += nfsd/
obj-$(CONFIG_LOCKD) += lockd/
obj-$(CONFIG_NLS) += nls/
obj-y += unicode/
-obj-$(CONFIG_SYSV_FS) += sysv/
obj-$(CONFIG_SMBFS) += smb/
obj-$(CONFIG_HPFS_FS) += hpfs/
obj-$(CONFIG_NTFS3_FS) += ntfs3/
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index e8c2c4535cb3..ac4e9a02910b 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -168,7 +168,7 @@ extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, unsi
extern int affs_unlink(struct inode *dir, struct dentry *dentry);
extern int affs_create(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, bool);
-extern int affs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+extern struct dentry *affs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode);
extern int affs_rmdir(struct inode *dir, struct dentry *dentry);
extern int affs_link(struct dentry *olddentry, struct inode *dir,
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 8c154490a2d6..f883be50db12 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -273,7 +273,7 @@ affs_create(struct mnt_idmap *idmap, struct inode *dir,
return 0;
}
-int
+struct dentry *
affs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode)
{
@@ -285,7 +285,7 @@ affs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
inode = affs_new_inode(dir);
if (!inode)
- return -ENOSPC;
+ return ERR_PTR(-ENOSPC);
inode->i_mode = S_IFDIR | mode;
affs_mode_to_prot(inode);
@@ -298,9 +298,9 @@ affs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
clear_nlink(inode);
mark_inode_dirty(inode);
iput(inode);
- return error;
+ return ERR_PTR(error);
}
- return 0;
+ return NULL;
}
int
diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c
index 6d42f85c6be5..e941da5b6dd9 100644
--- a/fs/afs/addr_list.c
+++ b/fs/afs/addr_list.c
@@ -362,3 +362,53 @@ int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *alist,
alist->nr_addrs++;
return 0;
}
+
+/*
+ * Set the app data on the rxrpc peers an address list points to
+ */
+void afs_set_peer_appdata(struct afs_server *server,
+ struct afs_addr_list *old_alist,
+ struct afs_addr_list *new_alist)
+{
+ unsigned long data = (unsigned long)server;
+ int n = 0, o = 0;
+
+ if (!old_alist) {
+ /* New server. Just set all. */
+ for (; n < new_alist->nr_addrs; n++)
+ rxrpc_kernel_set_peer_data(new_alist->addrs[n].peer, data);
+ return;
+ }
+ if (!new_alist) {
+ /* Dead server. Just remove all. */
+ for (; o < old_alist->nr_addrs; o++)
+ rxrpc_kernel_set_peer_data(old_alist->addrs[o].peer, 0);
+ return;
+ }
+
+ /* Walk through the two lists simultaneously, setting new peers and
+ * clearing old ones. The two lists are ordered by pointer to peer
+ * record.
+ */
+ while (n < new_alist->nr_addrs && o < old_alist->nr_addrs) {
+ struct rxrpc_peer *pn = new_alist->addrs[n].peer;
+ struct rxrpc_peer *po = old_alist->addrs[o].peer;
+
+		if (pn == po) {
+			/* Present in both lists: step both cursors past it */
+			n++;
+			o++;
+			continue;
+		}
+ if (pn < po) {
+ rxrpc_kernel_set_peer_data(pn, data);
+ n++;
+ } else {
+ rxrpc_kernel_set_peer_data(po, 0);
+ o++;
+ }
+ }
+
+ if (n < new_alist->nr_addrs)
+ for (; n < new_alist->nr_addrs; n++)
+ rxrpc_kernel_set_peer_data(new_alist->addrs[n].peer, data);
+ if (o < old_alist->nr_addrs)
+ for (; o < old_alist->nr_addrs; o++)
+ rxrpc_kernel_set_peer_data(old_alist->addrs[o].peer, 0);
+}
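Because both address lists are kept sorted by peer pointer, the function above can reconcile them in one linear pass. A standalone sketch of the same two-cursor walk, with plain ints standing in for the rxrpc_peer pointers and set_peer()/clear_peer() standing in for rxrpc_kernel_set_peer_data() with and without the server pointer:

	#include <stdio.h>

	static void set_peer(int p)   { printf("set %d\n", p); }	/* stand-in: tag peer */
	static void clear_peer(int p) { printf("clear %d\n", p); }	/* stand-in: untag peer */

	static void reconcile(const int *new, int n_new, const int *old, int n_old)
	{
		int n = 0, o = 0;

		while (n < n_new && o < n_old) {
			if (new[n] == old[o]) {
				n++;			/* in both lists: nothing to change */
				o++;
			} else if (new[n] < old[o]) {
				set_peer(new[n++]);	/* only in the new list */
			} else {
				clear_peer(old[o++]);	/* only in the old list */
			}
		}
		while (n < n_new)
			set_peer(new[n++]);		/* remaining new entries */
		while (o < n_old)
			clear_peer(old[o++]);		/* remaining old entries */
	}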
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 96a6781f3653..0168bbf53fe0 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -20,8 +20,9 @@ static unsigned __read_mostly afs_cell_min_ttl = 10 * 60;
static unsigned __read_mostly afs_cell_max_ttl = 24 * 60 * 60;
static atomic_t cell_debug_id;
-static void afs_queue_cell_manager(struct afs_net *);
-static void afs_manage_cell_work(struct work_struct *);
+static void afs_cell_timer(struct timer_list *timer);
+static void afs_destroy_cell_work(struct work_struct *work);
+static void afs_manage_cell_work(struct work_struct *work);
static void afs_dec_cells_outstanding(struct afs_net *net)
{
@@ -29,19 +30,11 @@ static void afs_dec_cells_outstanding(struct afs_net *net)
wake_up_var(&net->cells_outstanding);
}
-/*
- * Set the cell timer to fire after a given delay, assuming it's not already
- * set for an earlier time.
- */
-static void afs_set_cell_timer(struct afs_net *net, time64_t delay)
+static void afs_set_cell_state(struct afs_cell *cell, enum afs_cell_state state)
{
- if (net->live) {
- atomic_inc(&net->cells_outstanding);
- if (timer_reduce(&net->cells_timer, jiffies + delay * HZ))
- afs_dec_cells_outstanding(net);
- } else {
- afs_queue_cell_manager(net);
- }
+ smp_store_release(&cell->state, state); /* Commit cell changes before state */
+ smp_wmb(); /* Set cell state before task state */
+ wake_up_var(&cell->state);
}
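afs_set_cell_state() pairs smp_store_release() with the smp_load_acquire() used by waiters, so any cell fields written before the state change are visible to a task that observes the new state. The waiting side of the same pattern, as used later in afs_lookup_cell():

	/* Writer: publish the state, then wake anybody sleeping on it. */
	smp_store_release(&cell->state, AFS_CELL_ACTIVE);
	wake_up_var(&cell->state);

	/* Reader: sleep until the published state is terminal. */
	wait_var_event(&cell->state,
		       ({
			       state = smp_load_acquire(&cell->state); /* vs error */
			       state == AFS_CELL_ACTIVE || state == AFS_CELL_DEAD;
		       }));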
/*
@@ -116,7 +109,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
const char *name, unsigned int namelen,
const char *addresses)
{
- struct afs_vlserver_list *vllist;
+ struct afs_vlserver_list *vllist = NULL;
struct afs_cell *cell;
int i, ret;
@@ -163,13 +156,15 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
cell->net = net;
refcount_set(&cell->ref, 1);
atomic_set(&cell->active, 0);
+ INIT_WORK(&cell->destroyer, afs_destroy_cell_work);
INIT_WORK(&cell->manager, afs_manage_cell_work);
+ timer_setup(&cell->management_timer, afs_cell_timer, 0);
init_rwsem(&cell->vs_lock);
cell->volumes = RB_ROOT;
INIT_HLIST_HEAD(&cell->proc_volumes);
seqlock_init(&cell->volume_lock);
cell->fs_servers = RB_ROOT;
- seqlock_init(&cell->fs_lock);
+ init_rwsem(&cell->fs_lock);
rwlock_init(&cell->vl_servers_lock);
cell->flags = (1 << AFS_CELL_FL_CHECK_ALIAS);
@@ -204,7 +199,13 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
cell->dns_status = vllist->status;
smp_store_release(&cell->dns_lookup_count, 1); /* vs source/status */
atomic_inc(&net->cells_outstanding);
+ ret = idr_alloc_cyclic(&net->cells_dyn_ino, cell,
+ 2, INT_MAX / 2, GFP_KERNEL);
+ if (ret < 0)
+ goto error;
+ cell->dynroot_ino = ret;
cell->debug_id = atomic_inc_return(&cell_debug_id);
+
trace_afs_cell(cell->debug_id, 1, 0, afs_cell_trace_alloc);
_leave(" = %p", cell);
@@ -214,6 +215,7 @@ parse_failed:
if (ret == -EINVAL)
printk(KERN_ERR "kAFS: bad VL server IP address\n");
error:
+ afs_put_vlserverlist(cell->net, vllist);
kfree(cell->name - 1);
kfree(cell);
_leave(" = %d", ret);
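The cyclic IDR allocation above hands each cell a stable identifier that the dynamic root later doubles into an even/odd inode pair (cell->dynroot_ino * 2 + dotted, see fs/afs/dynroot.c below), which is why the range is capped at INT_MAX / 2. In outline (the idr_init() call is assumed to live in the per-netns setup, which this section does not show):

	/* One-time setup (assumed): */
	idr_init(&net->cells_dyn_ino);

	/* Per-cell allocation, as in afs_alloc_cell() above: */
	ret = idr_alloc_cyclic(&net->cells_dyn_ino, cell, 2, INT_MAX / 2, GFP_KERNEL);
	if (ret < 0)
		goto error;
	cell->dynroot_ino = ret;

	/* Matching release, as in afs_cell_destroy(): */
	idr_remove(&net->cells_dyn_ino, cell->dynroot_ino);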
@@ -227,6 +229,7 @@ error:
* @namesz: The strlen of the cell name.
* @vllist: A colon/comma separated list of numeric IP addresses or NULL.
* @excl: T if an error should be given if the cell name already exists.
+ * @trace: The reason to be logged if the lookup is successful.
*
* Look up a cell record by name and query the DNS for VL server addresses if
* needed. Note that the actual DNS query is punted off to the manager thread
@@ -235,7 +238,8 @@ error:
*/
struct afs_cell *afs_lookup_cell(struct afs_net *net,
const char *name, unsigned int namesz,
- const char *vllist, bool excl)
+ const char *vllist, bool excl,
+ enum afs_cell_trace trace)
{
struct afs_cell *cell, *candidate, *cursor;
struct rb_node *parent, **pp;
@@ -245,7 +249,7 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
_enter("%s,%s", name, vllist);
if (!excl) {
- cell = afs_find_cell(net, name, namesz, afs_cell_trace_use_lookup);
+ cell = afs_find_cell(net, name, namesz, trace);
if (!IS_ERR(cell))
goto wait_for_cell;
}
@@ -288,26 +292,28 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
cell = candidate;
candidate = NULL;
- atomic_set(&cell->active, 2);
- trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), 2, afs_cell_trace_insert);
+ afs_use_cell(cell, trace);
rb_link_node_rcu(&cell->net_node, parent, pp);
rb_insert_color(&cell->net_node, &net->cells);
up_write(&net->cells_lock);
- afs_queue_cell(cell, afs_cell_trace_get_queue_new);
+ afs_queue_cell(cell, afs_cell_trace_queue_new);
wait_for_cell:
- trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), atomic_read(&cell->active),
- afs_cell_trace_wait);
_debug("wait_for_cell");
- wait_var_event(&cell->state,
- ({
- state = smp_load_acquire(&cell->state); /* vs error */
- state == AFS_CELL_ACTIVE || state == AFS_CELL_REMOVED;
- }));
+ state = smp_load_acquire(&cell->state); /* vs error */
+ if (state != AFS_CELL_ACTIVE &&
+ state != AFS_CELL_DEAD) {
+ afs_see_cell(cell, afs_cell_trace_wait);
+ wait_var_event(&cell->state,
+ ({
+ state = smp_load_acquire(&cell->state); /* vs error */
+ state == AFS_CELL_ACTIVE || state == AFS_CELL_DEAD;
+ }));
+ }
/* Check the state obtained from the wait check. */
- if (state == AFS_CELL_REMOVED) {
+ if (state == AFS_CELL_DEAD) {
ret = cell->error;
goto error;
}
@@ -321,7 +327,7 @@ cell_already_exists:
if (excl) {
ret = -EEXIST;
} else {
- afs_use_cell(cursor, afs_cell_trace_use_lookup);
+ afs_use_cell(cursor, trace);
ret = 0;
}
up_write(&net->cells_lock);
@@ -331,7 +337,7 @@ cell_already_exists:
goto wait_for_cell;
goto error_noput;
error:
- afs_unuse_cell(net, cell, afs_cell_trace_unuse_lookup);
+ afs_unuse_cell(cell, afs_cell_trace_unuse_lookup_error);
error_noput:
_leave(" = %d [error]", ret);
return ERR_PTR(ret);
@@ -376,8 +382,9 @@ int afs_cell_init(struct afs_net *net, const char *rootcell)
if (cp && cp < rootcell + len)
return -EINVAL;
- /* allocate a cell record for the root cell */
- new_root = afs_lookup_cell(net, rootcell, len, vllist, false);
+ /* allocate a cell record for the root/workstation cell */
+ new_root = afs_lookup_cell(net, rootcell, len, vllist, false,
+ afs_cell_trace_use_lookup_ws);
if (IS_ERR(new_root)) {
_leave(" = %ld", PTR_ERR(new_root));
return PTR_ERR(new_root);
@@ -388,12 +395,11 @@ int afs_cell_init(struct afs_net *net, const char *rootcell)
/* install the new cell */
down_write(&net->cells_lock);
- afs_see_cell(new_root, afs_cell_trace_see_ws);
old_root = rcu_replace_pointer(net->ws_cell, new_root,
lockdep_is_held(&net->cells_lock));
up_write(&net->cells_lock);
- afs_unuse_cell(net, old_root, afs_cell_trace_unuse_ws);
+ afs_unuse_cell(old_root, afs_cell_trace_unuse_ws);
_leave(" = 0");
return 0;
}
@@ -511,8 +517,9 @@ static void afs_cell_destroy(struct rcu_head *rcu)
trace_afs_cell(cell->debug_id, r, atomic_read(&cell->active), afs_cell_trace_free);
afs_put_vlserverlist(net, rcu_access_pointer(cell->vl_servers));
- afs_unuse_cell(net, cell->alias_of, afs_cell_trace_unuse_alias);
+ afs_unuse_cell(cell->alias_of, afs_cell_trace_unuse_alias);
key_put(cell->anonymous_key);
+ idr_remove(&net->cells_dyn_ino, cell->dynroot_ino);
kfree(cell->name - 1);
kfree(cell);
@@ -520,30 +527,14 @@ static void afs_cell_destroy(struct rcu_head *rcu)
_leave(" [destroyed]");
}
-/*
- * Queue the cell manager.
- */
-static void afs_queue_cell_manager(struct afs_net *net)
-{
- int outstanding = atomic_inc_return(&net->cells_outstanding);
-
- _enter("%d", outstanding);
-
- if (!queue_work(afs_wq, &net->cells_manager))
- afs_dec_cells_outstanding(net);
-}
-
-/*
- * Cell management timer. We have an increment on cells_outstanding that we
- * need to pass along to the work item.
- */
-void afs_cells_timer(struct timer_list *timer)
+static void afs_destroy_cell_work(struct work_struct *work)
{
- struct afs_net *net = container_of(timer, struct afs_net, cells_timer);
+ struct afs_cell *cell = container_of(work, struct afs_cell, destroyer);
- _enter("");
- if (!queue_work(afs_wq, &net->cells_manager))
- afs_dec_cells_outstanding(net);
+ afs_see_cell(cell, afs_cell_trace_destroy);
+ timer_delete_sync(&cell->management_timer);
+ cancel_work_sync(&cell->manager);
+ call_rcu(&cell->rcu, afs_cell_destroy);
}
/*
@@ -575,7 +566,7 @@ void afs_put_cell(struct afs_cell *cell, enum afs_cell_trace reason)
if (zero) {
a = atomic_read(&cell->active);
WARN(a != 0, "Cell active count %u > 0\n", a);
- call_rcu(&cell->rcu, afs_cell_destroy);
+ WARN_ON(!queue_work(afs_wq, &cell->destroyer));
}
}
}
@@ -587,10 +578,9 @@ struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason)
{
int r, a;
- r = refcount_read(&cell->ref);
- WARN_ON(r == 0);
+ __refcount_inc(&cell->ref, &r);
a = atomic_inc_return(&cell->active);
- trace_afs_cell(cell->debug_id, r, a, reason);
+ trace_afs_cell(cell->debug_id, r + 1, a, reason);
return cell;
}
@@ -598,10 +588,11 @@ struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason)
* Record a cell becoming less active. When the active counter reaches 1, it
* is scheduled for destruction, but may get reactivated.
*/
-void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_trace reason)
+void afs_unuse_cell(struct afs_cell *cell, enum afs_cell_trace reason)
{
unsigned int debug_id;
time64_t now, expire_delay;
+ bool zero;
int r, a;
if (!cell)
@@ -616,13 +607,15 @@ void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_tr
expire_delay = afs_cell_gc_delay;
debug_id = cell->debug_id;
- r = refcount_read(&cell->ref);
a = atomic_dec_return(&cell->active);
- trace_afs_cell(debug_id, r, a, reason);
- WARN_ON(a == 0);
- if (a == 1)
+ if (!a)
/* 'cell' may now be garbage collected. */
- afs_set_cell_timer(net, expire_delay);
+ afs_set_cell_timer(cell, expire_delay);
+
+ zero = __refcount_dec_and_test(&cell->ref, &r);
+ trace_afs_cell(debug_id, r - 1, a, reason);
+ if (zero)
+ WARN_ON(!queue_work(afs_wq, &cell->destroyer));
}
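The switch to __refcount_inc()/__refcount_dec_and_test() above lets the tracepoints log the counter value the operation itself saw, rather than a separately (and racily) read one:

	int r;
	bool zero;

	__refcount_inc(&cell->ref, &r);			/* r = value before the increment */
	trace_afs_cell(cell->debug_id, r + 1, a, reason);

	zero = __refcount_dec_and_test(&cell->ref, &r);	/* r = value before the decrement */
	trace_afs_cell(debug_id, r - 1, a, reason);
	if (zero)
		queue_work(afs_wq, &cell->destroyer);	/* last ref: defer destruction */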
/*
@@ -642,9 +635,27 @@ void afs_see_cell(struct afs_cell *cell, enum afs_cell_trace reason)
*/
void afs_queue_cell(struct afs_cell *cell, enum afs_cell_trace reason)
{
- afs_get_cell(cell, reason);
- if (!queue_work(afs_wq, &cell->manager))
- afs_put_cell(cell, afs_cell_trace_put_queue_fail);
+ queue_work(afs_wq, &cell->manager);
+}
+
+/*
+ * Cell-specific management timer.
+ */
+static void afs_cell_timer(struct timer_list *timer)
+{
+ struct afs_cell *cell = container_of(timer, struct afs_cell, management_timer);
+
+ afs_see_cell(cell, afs_cell_trace_see_mgmt_timer);
+ if (refcount_read(&cell->ref) > 0 && cell->net->live)
+ queue_work(afs_wq, &cell->manager);
+}
+
+/*
+ * Set/reduce the cell timer.
+ */
+void afs_set_cell_timer(struct afs_cell *cell, unsigned int delay_secs)
+{
+ timer_reduce(&cell->management_timer, jiffies + delay_secs * HZ);
}
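afs_set_cell_timer() relies on timer_reduce(), which only ever moves an armed timer's expiry earlier, so concurrent callers need no locking: whoever asks for the soonest expiry wins. For example:

	timer_setup(&cell->management_timer, afs_cell_timer, 0);

	timer_reduce(&cell->management_timer, jiffies + 30 * HZ);	/* arms for 30s */
	timer_reduce(&cell->management_timer, jiffies + 5 * HZ);	/* pulls it in to 5s */
	timer_reduce(&cell->management_timer, jiffies + 60 * HZ);	/* no effect: later than 5s */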
/*
@@ -706,7 +717,6 @@ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell)
if (cell->proc_link.next)
cell->proc_link.next->pprev = &cell->proc_link.next;
- afs_dynroot_mkdir(net, cell);
mutex_unlock(&net->proc_cells_lock);
return 0;
}
@@ -723,217 +733,130 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell)
mutex_lock(&net->proc_cells_lock);
if (!hlist_unhashed(&cell->proc_link))
hlist_del_rcu(&cell->proc_link);
- afs_dynroot_rmdir(net, cell);
mutex_unlock(&net->proc_cells_lock);
_leave("");
}
+static bool afs_has_cell_expired(struct afs_cell *cell, time64_t *_next_manage)
+{
+ const struct afs_vlserver_list *vllist;
+ time64_t expire_at = cell->last_inactive;
+ time64_t now = ktime_get_real_seconds();
+
+ if (atomic_read(&cell->active))
+ return false;
+ if (!cell->net->live)
+ return true;
+
+ vllist = rcu_dereference_protected(cell->vl_servers, true);
+ if (vllist && vllist->nr_servers > 0)
+ expire_at += afs_cell_gc_delay;
+
+ if (expire_at <= now)
+ return true;
+ if (expire_at < *_next_manage)
+ *_next_manage = expire_at;
+ return false;
+}
+
/*
* Manage a cell record, initialising and destroying it, maintaining its DNS
* records.
*/
-static void afs_manage_cell(struct afs_cell *cell)
+static bool afs_manage_cell(struct afs_cell *cell)
{
struct afs_net *net = cell->net;
- int ret, active;
+ time64_t next_manage = TIME64_MAX;
+ int ret;
_enter("%s", cell->name);
-again:
_debug("state %u", cell->state);
switch (cell->state) {
- case AFS_CELL_INACTIVE:
- case AFS_CELL_FAILED:
- down_write(&net->cells_lock);
- active = 1;
- if (atomic_try_cmpxchg_relaxed(&cell->active, &active, 0)) {
- rb_erase(&cell->net_node, &net->cells);
- trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), 0,
- afs_cell_trace_unuse_delete);
- smp_store_release(&cell->state, AFS_CELL_REMOVED);
- }
- up_write(&net->cells_lock);
- if (cell->state == AFS_CELL_REMOVED) {
- wake_up_var(&cell->state);
- goto final_destruction;
- }
- if (cell->state == AFS_CELL_FAILED)
- goto done;
- smp_store_release(&cell->state, AFS_CELL_UNSET);
- wake_up_var(&cell->state);
- goto again;
-
- case AFS_CELL_UNSET:
- smp_store_release(&cell->state, AFS_CELL_ACTIVATING);
- wake_up_var(&cell->state);
- goto again;
-
- case AFS_CELL_ACTIVATING:
- ret = afs_activate_cell(net, cell);
- if (ret < 0)
- goto activation_failed;
+ case AFS_CELL_SETTING_UP:
+ goto set_up_cell;
+ case AFS_CELL_ACTIVE:
+ goto cell_is_active;
+ case AFS_CELL_REMOVING:
+ WARN_ON_ONCE(1);
+ return false;
+ case AFS_CELL_DEAD:
+ return false;
+ default:
+ _debug("bad state %u", cell->state);
+ WARN_ON_ONCE(1); /* Unhandled state */
+ return false;
+ }
- smp_store_release(&cell->state, AFS_CELL_ACTIVE);
- wake_up_var(&cell->state);
- goto again;
+set_up_cell:
+ ret = afs_activate_cell(net, cell);
+ if (ret < 0) {
+ cell->error = ret;
+ goto remove_cell;
+ }
- case AFS_CELL_ACTIVE:
- if (atomic_read(&cell->active) > 1) {
- if (test_and_clear_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags)) {
- ret = afs_update_cell(cell);
- if (ret < 0)
- cell->error = ret;
- }
- goto done;
- }
- smp_store_release(&cell->state, AFS_CELL_DEACTIVATING);
- wake_up_var(&cell->state);
- goto again;
+ afs_set_cell_state(cell, AFS_CELL_ACTIVE);
- case AFS_CELL_DEACTIVATING:
- if (atomic_read(&cell->active) > 1)
- goto reverse_deactivation;
- afs_deactivate_cell(net, cell);
- smp_store_release(&cell->state, AFS_CELL_INACTIVE);
- wake_up_var(&cell->state);
- goto again;
+cell_is_active:
+ if (afs_has_cell_expired(cell, &next_manage))
+ goto remove_cell;
- case AFS_CELL_REMOVED:
- goto done;
+ if (test_and_clear_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags)) {
+ ret = afs_update_cell(cell);
+ if (ret < 0)
+ cell->error = ret;
+ }
- default:
- break;
+ if (next_manage < TIME64_MAX && cell->net->live) {
+ time64_t now = ktime_get_real_seconds();
+
+ if (next_manage - now <= 0)
+ afs_queue_cell(cell, afs_cell_trace_queue_again);
+ else
+ afs_set_cell_timer(cell, next_manage - now);
}
- _debug("bad state %u", cell->state);
- BUG(); /* Unhandled state */
+ _leave(" [done %u]", cell->state);
+ return false;
-activation_failed:
- cell->error = ret;
- afs_deactivate_cell(net, cell);
+remove_cell:
+ down_write(&net->cells_lock);
- smp_store_release(&cell->state, AFS_CELL_FAILED); /* vs error */
- wake_up_var(&cell->state);
- goto again;
+ if (atomic_read(&cell->active)) {
+ up_write(&net->cells_lock);
+ goto cell_is_active;
+ }
-reverse_deactivation:
- smp_store_release(&cell->state, AFS_CELL_ACTIVE);
- wake_up_var(&cell->state);
- _leave(" [deact->act]");
- return;
+ /* Make sure that the expiring server records are going to see the fact
+ * that the cell is caput.
+ */
+ afs_set_cell_state(cell, AFS_CELL_REMOVING);
-done:
- _leave(" [done %u]", cell->state);
- return;
+ afs_deactivate_cell(net, cell);
+ afs_purge_servers(cell);
+
+ rb_erase(&cell->net_node, &net->cells);
+ afs_see_cell(cell, afs_cell_trace_unuse_delete);
+ up_write(&net->cells_lock);
-final_destruction:
/* The root volume is pinning the cell */
afs_put_volume(cell->root_volume, afs_volume_trace_put_cell_root);
cell->root_volume = NULL;
- afs_put_cell(cell, afs_cell_trace_put_destroy);
+
+ afs_set_cell_state(cell, AFS_CELL_DEAD);
+ return true;
}
static void afs_manage_cell_work(struct work_struct *work)
{
struct afs_cell *cell = container_of(work, struct afs_cell, manager);
+ bool final_put;
- afs_manage_cell(cell);
- afs_put_cell(cell, afs_cell_trace_put_queue_work);
-}
-
-/*
- * Manage the records of cells known to a network namespace. This includes
- * updating the DNS records and garbage collecting unused cells that were
- * automatically added.
- *
- * Note that constructed cell records may only be removed from net->cells by
- * this work item, so it is safe for this work item to stash a cursor pointing
- * into the tree and then return to caller (provided it skips cells that are
- * still under construction).
- *
- * Note also that we were given an increment on net->cells_outstanding by
- * whoever queued us that we need to deal with before returning.
- */
-void afs_manage_cells(struct work_struct *work)
-{
- struct afs_net *net = container_of(work, struct afs_net, cells_manager);
- struct rb_node *cursor;
- time64_t now = ktime_get_real_seconds(), next_manage = TIME64_MAX;
- bool purging = !net->live;
-
- _enter("");
-
- /* Trawl the cell database looking for cells that have expired from
- * lack of use and cells whose DNS results have expired and dispatch
- * their managers.
- */
- down_read(&net->cells_lock);
-
- for (cursor = rb_first(&net->cells); cursor; cursor = rb_next(cursor)) {
- struct afs_cell *cell =
- rb_entry(cursor, struct afs_cell, net_node);
- unsigned active;
- bool sched_cell = false;
-
- active = atomic_read(&cell->active);
- trace_afs_cell(cell->debug_id, refcount_read(&cell->ref),
- active, afs_cell_trace_manage);
-
- ASSERTCMP(active, >=, 1);
-
- if (purging) {
- if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags)) {
- active = atomic_dec_return(&cell->active);
- trace_afs_cell(cell->debug_id, refcount_read(&cell->ref),
- active, afs_cell_trace_unuse_pin);
- }
- }
-
- if (active == 1) {
- struct afs_vlserver_list *vllist;
- time64_t expire_at = cell->last_inactive;
-
- read_lock(&cell->vl_servers_lock);
- vllist = rcu_dereference_protected(
- cell->vl_servers,
- lockdep_is_held(&cell->vl_servers_lock));
- if (vllist->nr_servers > 0)
- expire_at += afs_cell_gc_delay;
- read_unlock(&cell->vl_servers_lock);
- if (purging || expire_at <= now)
- sched_cell = true;
- else if (expire_at < next_manage)
- next_manage = expire_at;
- }
-
- if (!purging) {
- if (test_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags))
- sched_cell = true;
- }
-
- if (sched_cell)
- afs_queue_cell(cell, afs_cell_trace_get_queue_manage);
- }
-
- up_read(&net->cells_lock);
-
- /* Update the timer on the way out. We have to pass an increment on
- * cells_outstanding in the namespace that we are in to the timer or
- * the work scheduler.
- */
- if (!purging && next_manage < TIME64_MAX) {
- now = ktime_get_real_seconds();
-
- if (next_manage - now <= 0) {
- if (queue_work(afs_wq, &net->cells_manager))
- atomic_inc(&net->cells_outstanding);
- } else {
- afs_set_cell_timer(net, next_manage - now);
- }
- }
-
- afs_dec_cells_outstanding(net);
- _leave(" [%d]", atomic_read(&net->cells_outstanding));
+ afs_see_cell(cell, afs_cell_trace_manage);
+ final_put = afs_manage_cell(cell);
+ afs_see_cell(cell, afs_cell_trace_managed);
+ if (final_put)
+ afs_put_cell(cell, afs_cell_trace_put_final);
}
/*
@@ -942,6 +865,7 @@ void afs_manage_cells(struct work_struct *work)
void afs_cell_purge(struct afs_net *net)
{
struct afs_cell *ws;
+ struct rb_node *cursor;
_enter("");
@@ -949,14 +873,21 @@ void afs_cell_purge(struct afs_net *net)
ws = rcu_replace_pointer(net->ws_cell, NULL,
lockdep_is_held(&net->cells_lock));
up_write(&net->cells_lock);
- afs_unuse_cell(net, ws, afs_cell_trace_unuse_ws);
+ afs_unuse_cell(ws, afs_cell_trace_unuse_ws);
- _debug("del timer");
- if (del_timer_sync(&net->cells_timer))
- atomic_dec(&net->cells_outstanding);
+ _debug("kick cells");
+ down_read(&net->cells_lock);
+ for (cursor = rb_first(&net->cells); cursor; cursor = rb_next(cursor)) {
+ struct afs_cell *cell = rb_entry(cursor, struct afs_cell, net_node);
+
+ afs_see_cell(cell, afs_cell_trace_purge);
- _debug("kick mgr");
- afs_queue_cell_manager(net);
+ if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags))
+ afs_unuse_cell(cell, afs_cell_trace_unuse_pin);
+
+ afs_queue_cell(cell, afs_cell_trace_queue_purge);
+ }
+ up_read(&net->cells_lock);
_debug("wait");
wait_var_event(&net->cells_outstanding,
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 99a3f20bc786..1a906805a9e3 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -139,49 +139,6 @@ bool afs_cm_incoming_call(struct afs_call *call)
}
/*
- * Find the server record by peer address and record a probe to the cache
- * manager from a server.
- */
-static int afs_find_cm_server_by_peer(struct afs_call *call)
-{
- struct sockaddr_rxrpc srx;
- struct afs_server *server;
- struct rxrpc_peer *peer;
-
- peer = rxrpc_kernel_get_call_peer(call->net->socket, call->rxcall);
-
- server = afs_find_server(call->net, peer);
- if (!server) {
- trace_afs_cm_no_server(call, &srx);
- return 0;
- }
-
- call->server = server;
- return 0;
-}
-
-/*
- * Find the server record by server UUID and record a probe to the cache
- * manager from a server.
- */
-static int afs_find_cm_server_by_uuid(struct afs_call *call,
- struct afs_uuid *uuid)
-{
- struct afs_server *server;
-
- rcu_read_lock();
- server = afs_find_server_by_uuid(call->net, call->request);
- rcu_read_unlock();
- if (!server) {
- trace_afs_cm_no_server_u(call, call->request);
- return 0;
- }
-
- call->server = server;
- return 0;
-}
-
-/*
* Clean up a cache manager call.
*/
static void afs_cm_destructor(struct afs_call *call)
@@ -322,10 +279,7 @@ static int afs_deliver_cb_callback(struct afs_call *call)
if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
return afs_io_error(call, afs_io_error_cm_reply);
-
- /* we'll need the file server record as that tells us which set of
- * vnodes to operate upon */
- return afs_find_cm_server_by_peer(call);
+ return 0;
}
/*
@@ -349,18 +303,10 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work)
*/
static int afs_deliver_cb_init_call_back_state(struct afs_call *call)
{
- int ret;
-
_enter("");
afs_extract_discard(call, 0);
- ret = afs_extract_data(call, false);
- if (ret < 0)
- return ret;
-
- /* we'll need the file server record as that tells us which set of
- * vnodes to operate upon */
- return afs_find_cm_server_by_peer(call);
+ return afs_extract_data(call, false);
}
/*
@@ -373,8 +319,6 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
__be32 *b;
int ret;
- _enter("");
-
_enter("{%u}", call->unmarshall);
switch (call->unmarshall) {
@@ -421,9 +365,13 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
return afs_io_error(call, afs_io_error_cm_reply);
- /* we'll need the file server record as that tells us which set of
- * vnodes to operate upon */
- return afs_find_cm_server_by_uuid(call, call->request);
+ if (memcmp(call->request, &call->server->_uuid, sizeof(call->server->_uuid)) != 0) {
+ pr_notice("Callback UUID does not match fileserver UUID\n");
+ trace_afs_cm_no_server_u(call, call->request);
+ return 0;
+ }
+
+ return 0;
}
/*
@@ -455,7 +403,7 @@ static int afs_deliver_cb_probe(struct afs_call *call)
if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
return afs_io_error(call, afs_io_error_cm_reply);
- return afs_find_cm_server_by_peer(call);
+ return 0;
}
/*
@@ -533,7 +481,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call)
if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
return afs_io_error(call, afs_io_error_cm_reply);
- return afs_find_cm_server_by_peer(call);
+ return 0;
}
/*
@@ -593,7 +541,7 @@ static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call)
if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
return afs_io_error(call, afs_io_error_cm_reply);
- return afs_find_cm_server_by_peer(call);
+ return 0;
}
/*
@@ -667,9 +615,5 @@ static int afs_deliver_yfs_cb_callback(struct afs_call *call)
if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
return afs_io_error(call, afs_io_error_cm_reply);
-
- /* We'll need the file server record as that tells us which set of
- * vnodes to operate upon.
- */
- return afs_find_cm_server_by_peer(call);
+ return 0;
}
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 02cbf38e1a77..9e7b1fe82c27 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -33,8 +33,8 @@ static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, int nl
loff_t fpos, u64 ino, unsigned dtype);
static int afs_create(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, bool excl);
-static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode);
+static struct dentry *afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode);
static int afs_rmdir(struct inode *dir, struct dentry *dentry);
static int afs_unlink(struct inode *dir, struct dentry *dentry);
static int afs_link(struct dentry *from, struct inode *dir,
@@ -1004,9 +1004,8 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
afs_stat_v(dvnode, n_lookup);
inode = afs_do_lookup(dir, dentry);
if (inode == ERR_PTR(-ENOENT))
- inode = afs_try_auto_mntpt(dentry, dir);
-
- if (!IS_ERR_OR_NULL(inode))
+ inode = NULL;
+ else if (!IS_ERR_OR_NULL(inode))
fid = AFS_FS_I(inode)->fid;
_debug("splice %p", dentry->d_inode);
@@ -1315,8 +1314,8 @@ static const struct afs_operation_ops afs_mkdir_operation = {
/*
* create a directory on an AFS filesystem
*/
-static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct afs_operation *op;
struct afs_vnode *dvnode = AFS_FS_I(dir);
@@ -1328,7 +1327,7 @@ static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
op = afs_alloc_operation(NULL, dvnode->volume);
if (IS_ERR(op)) {
d_drop(dentry);
- return PTR_ERR(op);
+ return ERR_CAST(op);
}
fscache_use_cookie(afs_vnode_cache(dvnode), true);
@@ -1344,7 +1343,7 @@ static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
op->ops = &afs_mkdir_operation;
ret = afs_do_sync_operation(op);
afs_dir_unuse_cookie(dvnode, ret);
- return ret;
+ return ERR_PTR(ret);
}
/*
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index 008698d706ca..691e0ae607a1 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -10,16 +10,19 @@
#include <linux/dns_resolver.h>
#include "internal.h"
-static atomic_t afs_autocell_ino;
+#define AFS_MIN_DYNROOT_CELL_INO 4 /* Allow for ., .., @cell, .@cell */
+#define AFS_MAX_DYNROOT_CELL_INO ((unsigned int)INT_MAX)
+
+static struct dentry *afs_lookup_atcell(struct inode *dir, struct dentry *dentry, ino_t ino);
/*
* iget5() comparator for inode created by autocell operations
- *
- * These pseudo inodes don't match anything.
*/
static int afs_iget5_pseudo_test(struct inode *inode, void *opaque)
{
- return 0;
+ struct afs_fid *fid = opaque;
+
+ return inode->i_ino == fid->vnode;
}
/*
@@ -39,28 +42,16 @@ static int afs_iget5_pseudo_set(struct inode *inode, void *opaque)
}
/*
- * Create an inode for a dynamic root directory or an autocell dynamic
- * automount dir.
+ * Create an inode for an autocell dynamic automount dir.
*/
-struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
+static struct inode *afs_iget_pseudo_dir(struct super_block *sb, ino_t ino)
{
- struct afs_super_info *as = AFS_FS_S(sb);
struct afs_vnode *vnode;
struct inode *inode;
- struct afs_fid fid = {};
+ struct afs_fid fid = { .vnode = ino, .unique = 1, };
_enter("");
- if (as->volume)
- fid.vid = as->volume->vid;
- if (root) {
- fid.vnode = 1;
- fid.unique = 1;
- } else {
- fid.vnode = atomic_inc_return(&afs_autocell_ino);
- fid.unique = 0;
- }
-
inode = iget5_locked(sb, fid.vnode,
afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid);
if (!inode) {
@@ -73,115 +64,71 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
vnode = AFS_FS_I(inode);
- /* there shouldn't be an existing inode */
- BUG_ON(!(inode->i_state & I_NEW));
-
- netfs_inode_init(&vnode->netfs, NULL, false);
- inode->i_size = 0;
- inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
- if (root) {
- inode->i_op = &afs_dynroot_inode_operations;
- inode->i_fop = &simple_dir_operations;
- } else {
- inode->i_op = &afs_autocell_inode_operations;
- }
- set_nlink(inode, 2);
- inode->i_uid = GLOBAL_ROOT_UID;
- inode->i_gid = GLOBAL_ROOT_GID;
- simple_inode_init_ts(inode);
- inode->i_blocks = 0;
- inode->i_generation = 0;
-
- set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
- if (!root) {
+ if (inode->i_state & I_NEW) {
+ netfs_inode_init(&vnode->netfs, NULL, false);
+ simple_inode_init_ts(inode);
+ set_nlink(inode, 2);
+ inode->i_size = 0;
+ inode->i_mode = S_IFDIR | 0555;
+ inode->i_op = &afs_autocell_inode_operations;
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
+ inode->i_blocks = 0;
+ inode->i_generation = 0;
+ inode->i_flags |= S_AUTOMOUNT | S_NOATIME;
+
+ set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
- inode->i_flags |= S_AUTOMOUNT;
- }
- inode->i_flags |= S_NOATIME;
- unlock_new_inode(inode);
+ unlock_new_inode(inode);
+ }
_leave(" = %p", inode);
return inode;
}
/*
- * Probe to see if a cell may exist. This prevents positive dentries from
- * being created unnecessarily.
+ * Try to automount the mountpoint with a pseudo directory, if the autocell
+ * option is set.
*/
-static int afs_probe_cell_name(struct dentry *dentry)
+static struct dentry *afs_dynroot_lookup_cell(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
{
- struct afs_cell *cell;
+ struct afs_cell *cell = NULL;
struct afs_net *net = afs_d2net(dentry);
+ struct inode *inode = NULL;
const char *name = dentry->d_name.name;
size_t len = dentry->d_name.len;
- char *result = NULL;
- int ret;
+ bool dotted = false;
+ int ret = -ENOENT;
/* Names prefixed with a dot are R/W mounts. */
if (name[0] == '.') {
- if (len == 1)
- return -EINVAL;
name++;
len--;
+ dotted = true;
}
- cell = afs_find_cell(net, name, len, afs_cell_trace_use_probe);
- if (!IS_ERR(cell)) {
- afs_unuse_cell(net, cell, afs_cell_trace_unuse_probe);
- return 0;
+ cell = afs_lookup_cell(net, name, len, NULL, false,
+ afs_cell_trace_use_lookup_dynroot);
+ if (IS_ERR(cell)) {
+ ret = PTR_ERR(cell);
+ goto out_no_cell;
}
- ret = dns_query(net->net, "afsdb", name, len, "srv=1",
- &result, NULL, false);
- if (ret == -ENODATA || ret == -ENOKEY || ret == 0)
- ret = -ENOENT;
- if (ret > 0 && ret >= sizeof(struct dns_server_list_v1_header)) {
- struct dns_server_list_v1_header *v1 = (void *)result;
-
- if (v1->hdr.zero == 0 &&
- v1->hdr.content == DNS_PAYLOAD_IS_SERVER_LIST &&
- v1->hdr.version == 1 &&
- (v1->status != DNS_LOOKUP_GOOD &&
- v1->status != DNS_LOOKUP_GOOD_WITH_BAD))
- return -ENOENT;
-
- }
-
- kfree(result);
- return ret;
-}
-
-/*
- * Try to auto mount the mountpoint with pseudo directory, if the autocell
- * operation is setted.
- */
-struct inode *afs_try_auto_mntpt(struct dentry *dentry, struct inode *dir)
-{
- struct afs_vnode *vnode = AFS_FS_I(dir);
- struct inode *inode;
- int ret = -ENOENT;
-
- _enter("%p{%pd}, {%llx:%llu}",
- dentry, dentry, vnode->fid.vid, vnode->fid.vnode);
-
- if (!test_bit(AFS_VNODE_AUTOCELL, &vnode->flags))
- goto out;
-
- ret = afs_probe_cell_name(dentry);
- if (ret < 0)
- goto out;
-
- inode = afs_iget_pseudo_dir(dir->i_sb, false);
+ inode = afs_iget_pseudo_dir(dir->i_sb, cell->dynroot_ino * 2 + dotted);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
goto out;
}
- _leave("= %p", inode);
- return inode;
+ dentry->d_fsdata = cell;
+ return d_splice_alias(inode, dentry);
out:
- _leave("= %d", ret);
+ afs_unuse_cell(cell, afs_cell_trace_unuse_lookup_dynroot);
+out_no_cell:
+ if (!inode)
+ return d_splice_alias(inode, dentry);
return ret == -ENOENT ? NULL : ERR_PTR(ret);
}
@@ -193,8 +140,6 @@ static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentr
{
_enter("%pd", dentry);
- ASSERTCMP(d_inode(dentry), ==, NULL);
-
if (flags & LOOKUP_CREATE)
return ERR_PTR(-EOPNOTSUPP);
@@ -203,98 +148,49 @@ static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentr
return ERR_PTR(-ENAMETOOLONG);
}
- return d_splice_alias(afs_try_auto_mntpt(dentry, dir), dentry);
+ if (dentry->d_name.len == 5 &&
+ memcmp(dentry->d_name.name, "@cell", 5) == 0)
+ return afs_lookup_atcell(dir, dentry, 2);
+
+ if (dentry->d_name.len == 6 &&
+ memcmp(dentry->d_name.name, ".@cell", 6) == 0)
+ return afs_lookup_atcell(dir, dentry, 3);
+
+ return afs_dynroot_lookup_cell(dir, dentry, flags);
}
const struct inode_operations afs_dynroot_inode_operations = {
.lookup = afs_dynroot_lookup,
};
-const struct dentry_operations afs_dynroot_dentry_operations = {
- .d_delete = always_delete_dentry,
- .d_release = afs_d_release,
- .d_automount = afs_d_automount,
-};
-
-/*
- * Create a manually added cell mount directory.
- * - The caller must hold net->proc_cells_lock
- */
-int afs_dynroot_mkdir(struct afs_net *net, struct afs_cell *cell)
-{
- struct super_block *sb = net->dynroot_sb;
- struct dentry *root, *subdir, *dsubdir;
- char *dotname = cell->name - 1;
- int ret;
-
- if (!sb || atomic_read(&sb->s_active) == 0)
- return 0;
-
- /* Let the ->lookup op do the creation */
- root = sb->s_root;
- inode_lock(root->d_inode);
- subdir = lookup_one_len(cell->name, root, cell->name_len);
- if (IS_ERR(subdir)) {
- ret = PTR_ERR(subdir);
- goto unlock;
- }
-
- dsubdir = lookup_one_len(dotname, root, cell->name_len + 1);
- if (IS_ERR(dsubdir)) {
- ret = PTR_ERR(dsubdir);
- dput(subdir);
- goto unlock;
- }
-
- /* Note that we're retaining extra refs on the dentries. */
- subdir->d_fsdata = (void *)1UL;
- dsubdir->d_fsdata = (void *)1UL;
- ret = 0;
-unlock:
- inode_unlock(root->d_inode);
- return ret;
-}
-
-static void afs_dynroot_rm_one_dir(struct dentry *root, const char *name, size_t name_len)
+static void afs_dynroot_d_release(struct dentry *dentry)
{
- struct dentry *subdir;
-
- /* Don't want to trigger a lookup call, which will re-add the cell */
- subdir = try_lookup_one_len(name, root, name_len);
- if (IS_ERR_OR_NULL(subdir)) {
- _debug("lookup %ld", PTR_ERR(subdir));
- return;
- }
-
- _debug("rmdir %pd %u", subdir, d_count(subdir));
+ struct afs_cell *cell = dentry->d_fsdata;
- if (subdir->d_fsdata) {
- _debug("unpin %u", d_count(subdir));
- subdir->d_fsdata = NULL;
- dput(subdir);
- }
- dput(subdir);
+ afs_unuse_cell(cell, afs_cell_trace_unuse_dynroot_mntpt);
}
/*
- * Remove a manually added cell mount directory.
- * - The caller must hold net->proc_cells_lock
+ * Keep @cell symlink dentries around, but only keep cell autodirs when they're
+ * being used.
*/
-void afs_dynroot_rmdir(struct afs_net *net, struct afs_cell *cell)
+static int afs_dynroot_delete_dentry(const struct dentry *dentry)
{
- struct super_block *sb = net->dynroot_sb;
- char *dotname = cell->name - 1;
-
- if (!sb || atomic_read(&sb->s_active) == 0)
- return;
+ const struct qstr *name = &dentry->d_name;
- inode_lock(sb->s_root->d_inode);
- afs_dynroot_rm_one_dir(sb->s_root, cell->name, cell->name_len);
- afs_dynroot_rm_one_dir(sb->s_root, dotname, cell->name_len + 1);
- inode_unlock(sb->s_root->d_inode);
- _leave("");
+ if (name->len == 5 && memcmp(name->name, "@cell", 5) == 0)
+ return 0;
+ if (name->len == 6 && memcmp(name->name, ".@cell", 6) == 0)
+ return 0;
+ return 1;
}
+const struct dentry_operations afs_dynroot_dentry_operations = {
+ .d_delete = afs_dynroot_delete_dentry,
+ .d_release = afs_dynroot_d_release,
+ .d_automount = afs_d_automount,
+};
+
static void afs_atcell_delayed_put_cell(void *arg)
{
struct afs_cell *cell = arg;
@@ -314,6 +210,9 @@ static const char *afs_atcell_get_link(struct dentry *dentry, struct inode *inod
const char *name;
bool dotted = vnode->fid.vnode == 3;
+ if (!rcu_access_pointer(net->ws_cell))
+ return ERR_PTR(-ENOENT);
+
if (!dentry) {
/* We're in RCU-pathwalk. */
cell = rcu_dereference(net->ws_cell);
@@ -325,9 +224,6 @@ static const char *afs_atcell_get_link(struct dentry *dentry, struct inode *inod
return name;
}
- if (!rcu_access_pointer(net->ws_cell))
- return ERR_PTR(-ENOENT);
-
down_read(&net->cells_lock);
cell = rcu_dereference_protected(net->ws_cell, lockdep_is_held(&net->cells_lock));
@@ -347,149 +243,163 @@ static const struct inode_operations afs_atcell_inode_operations = {
};
/*
- * Look up @cell or .@cell in a dynroot directory. This is a substitution for
- * the local cell name for the net namespace.
+ * Create an inode for the @cell or .@cell symlinks.
*/
-static struct dentry *afs_dynroot_create_symlink(struct dentry *root, const char *name)
+static struct dentry *afs_lookup_atcell(struct inode *dir, struct dentry *dentry, ino_t ino)
{
struct afs_vnode *vnode;
- struct afs_fid fid = { .vnode = 2, .unique = 1, };
- struct dentry *dentry;
struct inode *inode;
+ struct afs_fid fid = { .vnode = ino, .unique = 1, };
- if (name[0] == '.')
- fid.vnode = 3;
-
- dentry = d_alloc_name(root, name);
- if (!dentry)
- return ERR_PTR(-ENOMEM);
-
- inode = iget5_locked(dentry->d_sb, fid.vnode,
+ inode = iget5_locked(dir->i_sb, fid.vnode,
afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid);
- if (!inode) {
- dput(dentry);
+ if (!inode)
return ERR_PTR(-ENOMEM);
- }
vnode = AFS_FS_I(inode);
- /* there shouldn't be an existing inode */
- if (WARN_ON_ONCE(!(inode->i_state & I_NEW))) {
- iput(inode);
- dput(dentry);
- return ERR_PTR(-EIO);
+ if (inode->i_state & I_NEW) {
+ netfs_inode_init(&vnode->netfs, NULL, false);
+ simple_inode_init_ts(inode);
+ set_nlink(inode, 1);
+ inode->i_size = 0;
+ inode->i_mode = S_IFLNK | 0555;
+ inode->i_op = &afs_atcell_inode_operations;
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
+ inode->i_blocks = 0;
+ inode->i_generation = 0;
+ inode->i_flags |= S_NOATIME;
+
+ unlock_new_inode(inode);
}
-
- netfs_inode_init(&vnode->netfs, NULL, false);
- simple_inode_init_ts(inode);
- set_nlink(inode, 1);
- inode->i_size = 0;
- inode->i_mode = S_IFLNK | 0555;
- inode->i_op = &afs_atcell_inode_operations;
- inode->i_uid = GLOBAL_ROOT_UID;
- inode->i_gid = GLOBAL_ROOT_GID;
- inode->i_blocks = 0;
- inode->i_generation = 0;
- inode->i_flags |= S_NOATIME;
-
- unlock_new_inode(inode);
- d_splice_alias(inode, dentry);
- return dentry;
+ return d_splice_alias(inode, dentry);
}
/*
- * Create @cell and .@cell symlinks.
+ * Transcribe the cell database into readdir content under the RCU read lock.
+ * Each cell produces two entries, one prefixed with a dot and one not.
*/
-static int afs_dynroot_symlink(struct afs_net *net)
+static int afs_dynroot_readdir_cells(struct afs_net *net, struct dir_context *ctx)
{
- struct super_block *sb = net->dynroot_sb;
- struct dentry *root, *symlink, *dsymlink;
- int ret;
-
- /* Let the ->lookup op do the creation */
- root = sb->s_root;
- inode_lock(root->d_inode);
- symlink = afs_dynroot_create_symlink(root, "@cell");
- if (IS_ERR(symlink)) {
- ret = PTR_ERR(symlink);
- goto unlock;
- }
+ const struct afs_cell *cell;
+ loff_t newpos;
+
+ _enter("%llu", ctx->pos);
+
+ for (;;) {
+ unsigned int ix = ctx->pos >> 1;
+
+ cell = idr_get_next(&net->cells_dyn_ino, &ix);
+ if (!cell)
+ return 0;
+ if (READ_ONCE(cell->state) == AFS_CELL_REMOVING ||
+ READ_ONCE(cell->state) == AFS_CELL_DEAD) {
+ ctx->pos += 2;
+ ctx->pos &= ~1;
+ continue;
+ }
- dsymlink = afs_dynroot_create_symlink(root, ".@cell");
- if (IS_ERR(dsymlink)) {
- ret = PTR_ERR(dsymlink);
- dput(symlink);
- goto unlock;
- }
+ newpos = ix << 1;
+ if (newpos > ctx->pos)
+ ctx->pos = newpos;
- /* Note that we're retaining extra refs on the dentries. */
- symlink->d_fsdata = (void *)1UL;
- dsymlink->d_fsdata = (void *)1UL;
- ret = 0;
-unlock:
- inode_unlock(root->d_inode);
- return ret;
+ _debug("pos %llu -> cell %u", ctx->pos, cell->dynroot_ino);
+
+ if ((ctx->pos & 1) == 0) {
+ if (!dir_emit(ctx, cell->name, cell->name_len,
+ cell->dynroot_ino, DT_DIR))
+ return 0;
+ ctx->pos++;
+ }
+ if ((ctx->pos & 1) == 1) {
+ if (!dir_emit(ctx, cell->name - 1, cell->name_len + 1,
+ cell->dynroot_ino + 1, DT_DIR))
+ return 0;
+ ctx->pos++;
+ }
+ }
+ return 0;
}
/*
- * Populate a newly created dynamic root with cell names.
+ * Read the AFS dynamic root directory. This produces a list of cellnames,
+ * dotted and undotted, along with @cell and .@cell links if configured.
*/
-int afs_dynroot_populate(struct super_block *sb)
+static int afs_dynroot_readdir(struct file *file, struct dir_context *ctx)
{
- struct afs_cell *cell;
- struct afs_net *net = afs_sb2net(sb);
- int ret;
+ struct afs_net *net = afs_d2net(file->f_path.dentry);
+ int ret = 0;
- mutex_lock(&net->proc_cells_lock);
-
- net->dynroot_sb = sb;
- ret = afs_dynroot_symlink(net);
- if (ret < 0)
- goto error;
+ if (!dir_emit_dots(file, ctx))
+ return 0;
- hlist_for_each_entry(cell, &net->proc_cells, proc_link) {
- ret = afs_dynroot_mkdir(net, cell);
- if (ret < 0)
- goto error;
+ if (ctx->pos == 2) {
+ if (rcu_access_pointer(net->ws_cell) &&
+ !dir_emit(ctx, "@cell", 5, 2, DT_LNK))
+ return 0;
+ ctx->pos = 3;
+ }
+ if (ctx->pos == 3) {
+ if (rcu_access_pointer(net->ws_cell) &&
+ !dir_emit(ctx, ".@cell", 6, 3, DT_LNK))
+ return 0;
+ ctx->pos = 4;
}
- ret = 0;
-out:
- mutex_unlock(&net->proc_cells_lock);
+ if ((unsigned long long)ctx->pos <= AFS_MAX_DYNROOT_CELL_INO) {
+ rcu_read_lock();
+ ret = afs_dynroot_readdir_cells(net, ctx);
+ rcu_read_unlock();
+ }
return ret;
-
-error:
- net->dynroot_sb = NULL;
- goto out;
}
+static const struct file_operations afs_dynroot_file_operations = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+ .iterate_shared = afs_dynroot_readdir,
+ .fsync = noop_fsync,
+};
+
/*
- * When a dynamic root that's in the process of being destroyed, depopulate it
- * of pinned directories.
+ * Create an inode for a dynamic root directory.
*/
-void afs_dynroot_depopulate(struct super_block *sb)
+struct inode *afs_dynroot_iget_root(struct super_block *sb)
{
- struct afs_net *net = afs_sb2net(sb);
- struct dentry *root = sb->s_root, *subdir;
-
- /* Prevent more subdirs from being created */
- mutex_lock(&net->proc_cells_lock);
- if (net->dynroot_sb == sb)
- net->dynroot_sb = NULL;
- mutex_unlock(&net->proc_cells_lock);
-
- if (root) {
- struct hlist_node *n;
- inode_lock(root->d_inode);
-
- /* Remove all the pins for dirs created for manually added cells */
- hlist_for_each_entry_safe(subdir, n, &root->d_children, d_sib) {
- if (subdir->d_fsdata) {
- subdir->d_fsdata = NULL;
- dput(subdir);
- }
- }
+ struct afs_super_info *as = AFS_FS_S(sb);
+ struct afs_vnode *vnode;
+ struct inode *inode;
+ struct afs_fid fid = { .vid = 0, .vnode = 1, .unique = 1,};
+
+ if (as->volume)
+ fid.vid = as->volume->vid;
+
+ inode = iget5_locked(sb, fid.vnode,
+ afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ vnode = AFS_FS_I(inode);
- inode_unlock(root->d_inode);
+ /* there shouldn't be an existing inode */
+ if (inode->i_state & I_NEW) {
+ netfs_inode_init(&vnode->netfs, NULL, false);
+ simple_inode_init_ts(inode);
+ set_nlink(inode, 2);
+ inode->i_size = 0;
+ inode->i_mode = S_IFDIR | 0555;
+ inode->i_op = &afs_dynroot_inode_operations;
+ inode->i_fop = &afs_dynroot_file_operations;
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
+ inode->i_blocks = 0;
+ inode->i_generation = 0;
+ inode->i_flags |= S_NOATIME;
+
+ set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
+ unlock_new_inode(inode);
}
+ _leave(" = %p", inode);
+ return inode;
}
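The readdir side above encodes a directory position as (cell IDR index << 1) | dotted, after reserving positions 0-3 for ".", "..", "@cell" and ".@cell". Two hypothetical helpers (not in the patch) make the encoding explicit:

	static inline unsigned int dynroot_pos_to_cell_ix(loff_t pos)
	{
		return pos >> 1;	/* which cell in net->cells_dyn_ino */
	}

	static inline bool dynroot_pos_is_dotted(loff_t pos)
	{
		return pos & 1;		/* odd positions emit the dot-prefixed name */
	}

Because idr_get_next() can skip holes left by removed cells, afs_dynroot_readdir_cells() resnaps ctx->pos forward to the index actually found before emitting entries.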
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index b516d05b0fef..07a8bfbdd9b9 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -235,20 +235,20 @@ out:
* Probe all of a fileserver's addresses to find out the best route and to
* query its capabilities.
*/
-void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
- struct afs_addr_list *new_alist, struct key *key)
+int afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
+ struct afs_addr_list *new_alist, struct key *key)
{
struct afs_endpoint_state *estate, *old;
- struct afs_addr_list *alist;
+ struct afs_addr_list *old_alist = NULL, *alist;
unsigned long unprobed;
_enter("%pU", &server->uuid);
estate = kzalloc(sizeof(*estate), GFP_KERNEL);
if (!estate)
- return;
+ return -ENOMEM;
- refcount_set(&estate->ref, 1);
+ refcount_set(&estate->ref, 2);
estate->server_id = server->debug_id;
estate->rtt = UINT_MAX;
@@ -256,21 +256,31 @@ void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
old = rcu_dereference_protected(server->endpoint_state,
lockdep_is_held(&server->fs_lock));
- estate->responsive_set = old->responsive_set;
- estate->addresses = afs_get_addrlist(new_alist ?: old->addresses,
- afs_alist_trace_get_estate);
+ if (old) {
+ estate->responsive_set = old->responsive_set;
+ if (!new_alist)
+ new_alist = old->addresses;
+ }
+
+ if (old_alist != new_alist)
+ afs_set_peer_appdata(server, old_alist, new_alist);
+
+ estate->addresses = afs_get_addrlist(new_alist, afs_alist_trace_get_estate);
alist = estate->addresses;
estate->probe_seq = ++server->probe_counter;
atomic_set(&estate->nr_probing, alist->nr_addrs);
+ if (new_alist)
+ server->addr_version = new_alist->version;
rcu_assign_pointer(server->endpoint_state, estate);
- set_bit(AFS_ESTATE_SUPERSEDED, &old->flags);
write_unlock(&server->fs_lock);
+ if (old)
+ set_bit(AFS_ESTATE_SUPERSEDED, &old->flags);
trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref),
afs_estate_trace_alloc_probe);
- afs_get_address_preferences(net, alist);
+ afs_get_address_preferences(net, new_alist);
server->probed_at = jiffies;
unprobed = (1UL << alist->nr_addrs) - 1;
@@ -293,6 +303,8 @@ void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
}
afs_put_endpoint_state(old, afs_estate_trace_put_probe);
+ afs_put_endpoint_state(estate, afs_estate_trace_put_probe);
+ return 0;
}
/*
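
afs_fs_probe_fileserver() now allocates its endpoint state with a refcount of
two: one reference backs the rcu_assign_pointer() publication into
server->endpoint_state, the other is the function's own working reference,
dropped by the afs_put_endpoint_state() added at the end so that success and
error paths share one exit. A generic sketch of the idiom; struct state and
state_put() are hypothetical:

	static void publish_state(struct state __rcu **slot, struct state *s)
	{
		refcount_set(&s->ref, 2);	/* slot's ref + our working ref */
		rcu_assign_pointer(*slot, s);
		/* ... use s to dispatch probes ... */
		state_put(s);			/* drop the working reference */
	}
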
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 1d9ecd5418d8..bc9556991d7c 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -1653,7 +1653,7 @@ int afs_fs_give_up_all_callbacks(struct afs_net *net, struct afs_server *server,
bp = call->request;
*bp++ = htonl(FSGIVEUPALLCALLBACKS);
- call->server = afs_use_server(server, afs_server_trace_give_up_cb);
+ call->server = afs_use_server(server, false, afs_server_trace_use_give_up_cb);
afs_make_call(call, GFP_NOFS);
afs_wait_for_call_to_complete(call);
ret = call->error;
@@ -1760,7 +1760,7 @@ bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server,
return false;
call->key = key;
- call->server = afs_use_server(server, afs_server_trace_get_caps);
+ call->server = afs_use_server(server, false, afs_server_trace_use_get_caps);
call->peer = rxrpc_kernel_get_peer(estate->addresses->addrs[addr_index].peer);
call->probe = afs_get_endpoint_state(estate, afs_estate_trace_get_getcaps);
call->probe_index = addr_index;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index df30bd62da79..440b0e731093 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -287,9 +287,8 @@ struct afs_net {
/* Cell database */
struct rb_root cells;
+ struct idr cells_dyn_ino; /* cell->dynroot_ino mapping */
struct afs_cell __rcu *ws_cell;
- struct work_struct cells_manager;
- struct timer_list cells_timer;
atomic_t cells_outstanding;
struct rw_semaphore cells_lock;
struct mutex cells_alias_lock;
@@ -301,18 +300,11 @@ struct afs_net {
* cell, but in practice, people create aliases and subsets and there's
* no easy way to distinguish them.
*/
- seqlock_t fs_lock; /* For fs_servers, fs_probe_*, fs_proc */
- struct rb_root fs_servers; /* afs_server (by server UUID or address) */
+ seqlock_t fs_lock; /* For fs_probe_*, fs_proc */
struct list_head fs_probe_fast; /* List of afs_server to probe at 30s intervals */
struct list_head fs_probe_slow; /* List of afs_server to probe at 5m intervals */
struct hlist_head fs_proc; /* procfs servers list */
- struct hlist_head fs_addresses; /* afs_server (by lowest IPv6 addr) */
- seqlock_t fs_addr_lock; /* For fs_addresses[46] */
-
- struct work_struct fs_manager;
- struct timer_list fs_timer;
-
struct work_struct fs_prober;
struct timer_list fs_probe_timer;
atomic_t servers_outstanding;
@@ -345,13 +337,10 @@ struct afs_net {
extern const char afs_init_sysname[];
enum afs_cell_state {
- AFS_CELL_UNSET,
- AFS_CELL_ACTIVATING,
+ AFS_CELL_SETTING_UP,
AFS_CELL_ACTIVE,
- AFS_CELL_DEACTIVATING,
- AFS_CELL_INACTIVE,
- AFS_CELL_FAILED,
- AFS_CELL_REMOVED,
+ AFS_CELL_REMOVING,
+ AFS_CELL_DEAD,
};
/*
@@ -382,7 +371,9 @@ struct afs_cell {
struct afs_cell *alias_of; /* The cell this is an alias of */
struct afs_volume *root_volume; /* The root.cell volume if there is one */
struct key *anonymous_key; /* anonymous user key for this cell */
+ struct work_struct destroyer; /* Destroyer for cell */
struct work_struct manager; /* Manager for init/deinit/dns */
+ struct timer_list management_timer; /* General management timer */
struct hlist_node proc_link; /* /proc cell list link */
time64_t dns_expiry; /* Time AFSDB/SRV record expires */
time64_t last_inactive; /* Time of last drop of usage count */
@@ -398,6 +389,7 @@ struct afs_cell {
enum dns_lookup_status dns_status:8; /* Latest status of data from lookup */
unsigned int dns_lookup_count; /* Counter of DNS lookups */
unsigned int debug_id;
+ unsigned int dynroot_ino; /* Inode numbers for dynroot (a pair) */
/* The volumes belonging to this cell */
struct rw_semaphore vs_lock; /* Lock for server->volumes */
@@ -407,7 +399,7 @@ struct afs_cell {
/* Active fileserver interaction state. */
struct rb_root fs_servers; /* afs_server (by server UUID) */
- seqlock_t fs_lock; /* For fs_servers */
+ struct rw_semaphore fs_lock; /* For fs_servers */
/* VL server list. */
rwlock_t vl_servers_lock; /* Lock on vl_servers */
@@ -542,22 +534,22 @@ struct afs_server {
};
struct afs_cell *cell; /* Cell to which belongs (pins ref) */
- struct rb_node uuid_rb; /* Link in net->fs_servers */
- struct afs_server __rcu *uuid_next; /* Next server with same UUID */
- struct afs_server *uuid_prev; /* Previous server with same UUID */
- struct list_head probe_link; /* Link in net->fs_probe_list */
- struct hlist_node addr_link; /* Link in net->fs_addresses6 */
+ struct rb_node uuid_rb; /* Link in cell->fs_servers */
+ struct list_head probe_link; /* Link in net->fs_probe_* */
struct hlist_node proc_link; /* Link in net->fs_proc */
struct list_head volumes; /* RCU list of afs_server_entry objects */
- struct afs_server *gc_next; /* Next server in manager's list */
+ struct work_struct destroyer; /* Work item to try and destroy a server */
+ struct timer_list timer; /* Management timer */
time64_t unuse_time; /* Time at which last unused */
unsigned long flags;
#define AFS_SERVER_FL_RESPONDING 0 /* The server is responding */
#define AFS_SERVER_FL_UPDATING 1
#define AFS_SERVER_FL_NEEDS_UPDATE 2 /* Fileserver address list is out of date */
-#define AFS_SERVER_FL_NOT_READY 4 /* The record is not ready for use */
-#define AFS_SERVER_FL_NOT_FOUND 5 /* VL server says no such server */
-#define AFS_SERVER_FL_VL_FAIL 6 /* Failed to access VL server */
+#define AFS_SERVER_FL_UNCREATED 3 /* The record needs creating */
+#define AFS_SERVER_FL_CREATING 4 /* The record is being created */
+#define AFS_SERVER_FL_EXPIRED 5 /* The record has expired */
+#define AFS_SERVER_FL_NOT_FOUND 6 /* VL server says no such server */
+#define AFS_SERVER_FL_VL_FAIL 7 /* Failed to access VL server */
#define AFS_SERVER_FL_MAY_HAVE_CB 8 /* May have callbacks on this fileserver */
#define AFS_SERVER_FL_IS_YFS 16 /* Server is YFS not AFS */
#define AFS_SERVER_FL_NO_IBULK 17 /* Fileserver doesn't support FS.InlineBulkStatus */
@@ -567,6 +559,7 @@ struct afs_server {
atomic_t active; /* Active user count */
u32 addr_version; /* Address list version */
u16 service_id; /* Service ID we're using. */
+ short create_error; /* Creation error */
unsigned int rtt; /* Server's current RTT in uS */
unsigned int debug_id; /* Debugging ID for traces */
@@ -621,6 +614,7 @@ struct afs_volume {
afs_volid_t vid; /* The volume ID of this volume */
afs_volid_t vids[AFS_MAXTYPES]; /* All associated volume IDs */
refcount_t ref;
+ unsigned int debug_id; /* Debugging ID for traces */
time64_t update_at; /* Time at which to next update */
struct afs_cell *cell; /* Cell to which belongs (pins ref) */
struct rb_node cell_node; /* Link in cell->volumes */
@@ -700,7 +694,6 @@ struct afs_vnode {
#define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */
#define AFS_VNODE_DELETED 4 /* set if vnode deleted on server */
#define AFS_VNODE_MOUNTPOINT 5 /* set if vnode is a mountpoint symlink */
-#define AFS_VNODE_AUTOCELL 6 /* set if Vnode is an auto mount point */
#define AFS_VNODE_PSEUDODIR 7 /* set if Vnode is a pseudo directory */
#define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */
#define AFS_VNODE_SILLY_DELETED 9 /* Set if file has been silly-deleted */
@@ -1008,6 +1001,9 @@ extern int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *addr,
__be32 xdr, u16 port);
extern int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *addr,
__be32 *xdr, u16 port);
+void afs_set_peer_appdata(struct afs_server *server,
+ struct afs_addr_list *old_alist,
+ struct afs_addr_list *new_alist);
/*
* addr_prefs.c
@@ -1044,16 +1040,17 @@ static inline bool afs_cb_is_broken(unsigned int cb_break,
extern int afs_cell_init(struct afs_net *, const char *);
extern struct afs_cell *afs_find_cell(struct afs_net *, const char *, unsigned,
enum afs_cell_trace);
-extern struct afs_cell *afs_lookup_cell(struct afs_net *, const char *, unsigned,
- const char *, bool);
+struct afs_cell *afs_lookup_cell(struct afs_net *net,
+ const char *name, unsigned int namesz,
+ const char *vllist, bool excl,
+ enum afs_cell_trace trace);
extern struct afs_cell *afs_use_cell(struct afs_cell *, enum afs_cell_trace);
-extern void afs_unuse_cell(struct afs_net *, struct afs_cell *, enum afs_cell_trace);
+void afs_unuse_cell(struct afs_cell *cell, enum afs_cell_trace reason);
extern struct afs_cell *afs_get_cell(struct afs_cell *, enum afs_cell_trace);
extern void afs_see_cell(struct afs_cell *, enum afs_cell_trace);
extern void afs_put_cell(struct afs_cell *, enum afs_cell_trace);
extern void afs_queue_cell(struct afs_cell *, enum afs_cell_trace);
-extern void afs_manage_cells(struct work_struct *);
-extern void afs_cells_timer(struct timer_list *);
+void afs_set_cell_timer(struct afs_cell *cell, unsigned int delay_secs);
extern void __net_exit afs_cell_purge(struct afs_net *);
/*
@@ -1111,11 +1108,7 @@ extern int afs_silly_iput(struct dentry *, struct inode *);
extern const struct inode_operations afs_dynroot_inode_operations;
extern const struct dentry_operations afs_dynroot_dentry_operations;
-extern struct inode *afs_try_auto_mntpt(struct dentry *, struct inode *);
-extern int afs_dynroot_mkdir(struct afs_net *, struct afs_cell *);
-extern void afs_dynroot_rmdir(struct afs_net *, struct afs_cell *);
-extern int afs_dynroot_populate(struct super_block *);
-extern void afs_dynroot_depopulate(struct super_block *);
+struct inode *afs_dynroot_iget_root(struct super_block *sb);
/*
* file.c
@@ -1207,8 +1200,8 @@ struct afs_endpoint_state *afs_get_endpoint_state(struct afs_endpoint_state *est
enum afs_estate_trace where);
void afs_put_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where);
extern void afs_fileserver_probe_result(struct afs_call *);
-void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
- struct afs_addr_list *new_addrs, struct key *key);
+int afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
+ struct afs_addr_list *new_alist, struct key *key);
int afs_wait_for_fs_probes(struct afs_operation *op, struct afs_server_state *states, bool intr);
extern void afs_probe_fileserver(struct afs_net *, struct afs_server *);
extern void afs_fs_probe_dispatcher(struct work_struct *);
@@ -1228,7 +1221,6 @@ int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen);
extern void afs_vnode_commit_status(struct afs_operation *, struct afs_vnode_param *);
extern int afs_fetch_status(struct afs_vnode *, struct key *, bool, afs_access_t *);
extern int afs_ilookup5_test_by_fid(struct inode *, void *);
-extern struct inode *afs_iget_pseudo_dir(struct super_block *, bool);
extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *);
extern struct inode *afs_root_iget(struct super_block *, struct key *);
extern int afs_getattr(struct mnt_idmap *idmap, const struct path *,
@@ -1510,20 +1502,30 @@ extern void __exit afs_clean_up_permit_cache(void);
*/
extern spinlock_t afs_server_peer_lock;
-extern struct afs_server *afs_find_server(struct afs_net *, const struct rxrpc_peer *);
-extern struct afs_server *afs_find_server_by_uuid(struct afs_net *, const uuid_t *);
+struct afs_server *afs_find_server(const struct rxrpc_peer *peer);
extern struct afs_server *afs_lookup_server(struct afs_cell *, struct key *, const uuid_t *, u32);
extern struct afs_server *afs_get_server(struct afs_server *, enum afs_server_trace);
-extern struct afs_server *afs_use_server(struct afs_server *, enum afs_server_trace);
-extern void afs_unuse_server(struct afs_net *, struct afs_server *, enum afs_server_trace);
-extern void afs_unuse_server_notime(struct afs_net *, struct afs_server *, enum afs_server_trace);
+struct afs_server *afs_use_server(struct afs_server *server, bool activate,
+ enum afs_server_trace reason);
+void afs_unuse_server(struct afs_net *net, struct afs_server *server,
+ enum afs_server_trace reason);
+void afs_unuse_server_notime(struct afs_net *net, struct afs_server *server,
+ enum afs_server_trace reason);
extern void afs_put_server(struct afs_net *, struct afs_server *, enum afs_server_trace);
-extern void afs_manage_servers(struct work_struct *);
-extern void afs_servers_timer(struct timer_list *);
+void afs_purge_servers(struct afs_cell *cell);
extern void afs_fs_probe_timer(struct timer_list *);
-extern void __net_exit afs_purge_servers(struct afs_net *);
+void __net_exit afs_wait_for_servers(struct afs_net *net);
bool afs_check_server_record(struct afs_operation *op, struct afs_server *server, struct key *key);
+static inline void afs_see_server(struct afs_server *server, enum afs_server_trace trace)
+{
+ int r = refcount_read(&server->ref);
+ int a = atomic_read(&server->active);
+
+ trace_afs_server(server->debug_id, r, a, trace);
+}
+
static inline void afs_inc_servers_outstanding(struct afs_net *net)
{
atomic_inc(&net->servers_outstanding);
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 1ae0067f772d..c845c5daaeba 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -76,25 +76,17 @@ static int __net_init afs_net_init(struct net *net_ns)
mutex_init(&net->socket_mutex);
net->cells = RB_ROOT;
+ idr_init(&net->cells_dyn_ino);
init_rwsem(&net->cells_lock);
- INIT_WORK(&net->cells_manager, afs_manage_cells);
- timer_setup(&net->cells_timer, afs_cells_timer, 0);
-
mutex_init(&net->cells_alias_lock);
mutex_init(&net->proc_cells_lock);
INIT_HLIST_HEAD(&net->proc_cells);
seqlock_init(&net->fs_lock);
- net->fs_servers = RB_ROOT;
INIT_LIST_HEAD(&net->fs_probe_fast);
INIT_LIST_HEAD(&net->fs_probe_slow);
INIT_HLIST_HEAD(&net->fs_proc);
- INIT_HLIST_HEAD(&net->fs_addresses);
- seqlock_init(&net->fs_addr_lock);
-
- INIT_WORK(&net->fs_manager, afs_manage_servers);
- timer_setup(&net->fs_timer, afs_servers_timer, 0);
INIT_WORK(&net->fs_prober, afs_fs_probe_dispatcher);
timer_setup(&net->fs_probe_timer, afs_fs_probe_timer, 0);
atomic_set(&net->servers_outstanding, 1);
@@ -130,13 +122,14 @@ error_open_socket:
net->live = false;
afs_fs_probe_cleanup(net);
afs_cell_purge(net);
- afs_purge_servers(net);
+ afs_wait_for_servers(net);
error_cell_init:
net->live = false;
afs_proc_cleanup(net);
error_proc:
afs_put_sysnames(net->sysnames);
error_sysnames:
+ idr_destroy(&net->cells_dyn_ino);
net->live = false;
return ret;
}
@@ -151,10 +144,11 @@ static void __net_exit afs_net_exit(struct net *net_ns)
net->live = false;
afs_fs_probe_cleanup(net);
afs_cell_purge(net);
- afs_purge_servers(net);
+ afs_wait_for_servers(net);
afs_close_socket(net);
afs_proc_cleanup(net);
afs_put_sysnames(net->sysnames);
+ idr_destroy(&net->cells_dyn_ino);
kfree_rcu(rcu_access_pointer(net->address_prefs), rcu);
}
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 507c25a5b2cb..45cee6534122 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -87,7 +87,7 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
ctx->force = true;
}
if (ctx->cell) {
- afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_mntpt);
+ afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_mntpt);
ctx->cell = NULL;
}
if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) {
@@ -107,7 +107,8 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
if (size > AFS_MAXCELLNAME)
return -ENAMETOOLONG;
- cell = afs_lookup_cell(ctx->net, p, size, NULL, false);
+ cell = afs_lookup_cell(ctx->net, p, size, NULL, false,
+ afs_cell_trace_use_lookup_mntpt);
if (IS_ERR(cell)) {
pr_err("kAFS: unable to lookup cell '%pd'\n", mntpt);
return PTR_ERR(cell);
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 12c88d8be3fe..40e879c8ca77 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -122,14 +122,15 @@ static int afs_proc_cells_write(struct file *file, char *buf, size_t size)
if (strcmp(buf, "add") == 0) {
struct afs_cell *cell;
- cell = afs_lookup_cell(net, name, strlen(name), args, true);
+ cell = afs_lookup_cell(net, name, strlen(name), args, true,
+ afs_cell_trace_use_lookup_add);
if (IS_ERR(cell)) {
ret = PTR_ERR(cell);
goto done;
}
if (test_and_set_bit(AFS_CELL_FL_NO_GC, &cell->flags))
- afs_unuse_cell(net, cell, afs_cell_trace_unuse_no_pin);
+ afs_unuse_cell(cell, afs_cell_trace_unuse_no_pin);
} else {
goto inval;
}
@@ -443,8 +444,6 @@ static int afs_proc_servers_show(struct seq_file *m, void *v)
}
server = list_entry(v, struct afs_server, proc_link);
- estate = rcu_dereference(server->endpoint_state);
- alist = estate->addresses;
seq_printf(m, "%pU %3d %3d %s\n",
&server->uuid,
refcount_read(&server->ref),
@@ -454,10 +453,16 @@ static int afs_proc_servers_show(struct seq_file *m, void *v)
server->flags, server->rtt);
seq_printf(m, " - probe: last=%d\n",
(int)(jiffies - server->probed_at) / HZ);
+
+ estate = rcu_dereference(server->endpoint_state);
+ if (!estate)
+ goto out;
failed = estate->failed_set;
seq_printf(m, " - ESTATE pq=%x np=%u rsp=%lx f=%lx\n",
estate->probe_seq, atomic_read(&estate->nr_probing),
estate->responsive_set, estate->failed_set);
+
+ alist = estate->addresses;
seq_printf(m, " - ALIST v=%u ap=%u\n",
alist->version, alist->addr_pref_version);
for (i = 0; i < alist->nr_addrs; i++) {
@@ -470,6 +475,8 @@ static int afs_proc_servers_show(struct seq_file *m, void *v)
rxrpc_kernel_get_srtt(addr->peer),
addr->last_error, addr->prio);
}
+
+out:
return 0;
}
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 886416ea1d96..d5e480a33859 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -179,7 +179,7 @@ static void afs_free_call(struct afs_call *call)
if (call->type->destructor)
call->type->destructor(call);
- afs_unuse_server_notime(call->net, call->server, afs_server_trace_put_call);
+ afs_unuse_server_notime(call->net, call->server, afs_server_trace_unuse_call);
kfree(call->request);
o = atomic_read(&net->nr_outstanding_calls);
@@ -766,8 +766,14 @@ static void afs_rx_discard_new_call(struct rxrpc_call *rxcall,
static void afs_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall,
unsigned long user_call_ID)
{
+ struct afs_call *call = (struct afs_call *)user_call_ID;
struct afs_net *net = afs_sock2net(sk);
+ call->peer = rxrpc_kernel_get_call_peer(sk->sk_socket, call->rxcall);
+ call->server = afs_find_server(call->peer);
+ if (!call->server)
+ trace_afs_cm_no_server(call, rxrpc_kernel_remote_srx(call->peer));
+
queue_work(afs_wq, &net->charge_preallocation_work);
}
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 4504e16b458c..c530d1ca15df 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -14,190 +14,104 @@
static unsigned afs_server_gc_delay = 10; /* Server record timeout in seconds */
static atomic_t afs_server_debug_id;
-static struct afs_server *afs_maybe_use_server(struct afs_server *,
- enum afs_server_trace);
static void __afs_put_server(struct afs_net *, struct afs_server *);
+static void afs_server_timer(struct timer_list *timer);
+static void afs_server_destroyer(struct work_struct *work);
/*
* Find a server by one of its addresses.
*/
-struct afs_server *afs_find_server(struct afs_net *net, const struct rxrpc_peer *peer)
+struct afs_server *afs_find_server(const struct rxrpc_peer *peer)
{
- const struct afs_endpoint_state *estate;
- const struct afs_addr_list *alist;
- struct afs_server *server = NULL;
- unsigned int i;
- int seq = 1;
+ struct afs_server *server = (struct afs_server *)rxrpc_kernel_get_peer_data(peer);

- rcu_read_lock();
-
- do {
- if (server)
- afs_unuse_server_notime(net, server, afs_server_trace_put_find_rsq);
- server = NULL;
- seq++; /* 2 on the 1st/lockless path, otherwise odd */
- read_seqbegin_or_lock(&net->fs_addr_lock, &seq);
-
- hlist_for_each_entry_rcu(server, &net->fs_addresses, addr_link) {
- estate = rcu_dereference(server->endpoint_state);
- alist = estate->addresses;
- for (i = 0; i < alist->nr_addrs; i++)
- if (alist->addrs[i].peer == peer)
- goto found;
- }
-
- server = NULL;
- continue;
- found:
- server = afs_maybe_use_server(server, afs_server_trace_get_by_addr);
-
- } while (need_seqretry(&net->fs_addr_lock, seq));
-
- done_seqretry(&net->fs_addr_lock, seq);
-
- rcu_read_unlock();
- return server;
+ if (!server)
+ return NULL;
+ return afs_use_server(server, false, afs_server_trace_use_cm_call);
}
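
afs_find_server() no longer scans net->fs_addresses under a seqlock; the
server pointer is read straight back off the rxrpc peer, having been bound by
afs_set_peer_appdata() elsewhere in the series. A sketch of the round-trip;
rxrpc_kernel_set_peer_data() is an assumption about the rxrpc half of the
series, which this hunk does not show:

	static void example_bind(struct rxrpc_peer *peer, struct afs_server *server)
	{
		/* Assumed API from the rxrpc side of the series. */
		rxrpc_kernel_set_peer_data(peer, (unsigned long)server);
	}

	static struct afs_server *example_lookup(struct rxrpc_peer *peer)
	{
		return (struct afs_server *)rxrpc_kernel_get_peer_data(peer);
	}
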
/*
- * Look up a server by its UUID and mark it active.
+ * Look up a server by its UUID and mark it active. The caller must hold
+ * cell->fs_lock.
*/
-struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uuid)
+static struct afs_server *afs_find_server_by_uuid(struct afs_cell *cell, const uuid_t *uuid)
{
- struct afs_server *server = NULL;
+ struct afs_server *server;
struct rb_node *p;
- int diff, seq = 1;
+ int diff;
_enter("%pU", uuid);
- do {
- /* Unfortunately, rbtree walking doesn't give reliable results
- * under just the RCU read lock, so we have to check for
- * changes.
- */
- if (server)
- afs_unuse_server(net, server, afs_server_trace_put_uuid_rsq);
- server = NULL;
- seq++; /* 2 on the 1st/lockless path, otherwise odd */
- read_seqbegin_or_lock(&net->fs_lock, &seq);
-
- p = net->fs_servers.rb_node;
- while (p) {
- server = rb_entry(p, struct afs_server, uuid_rb);
-
- diff = memcmp(uuid, &server->uuid, sizeof(*uuid));
- if (diff < 0) {
- p = p->rb_left;
- } else if (diff > 0) {
- p = p->rb_right;
- } else {
- afs_use_server(server, afs_server_trace_get_by_uuid);
- break;
- }
-
- server = NULL;
- }
- } while (need_seqretry(&net->fs_lock, seq));
+ p = cell->fs_servers.rb_node;
+ while (p) {
+ server = rb_entry(p, struct afs_server, uuid_rb);

- done_seqretry(&net->fs_lock, seq);
+ diff = memcmp(uuid, &server->uuid, sizeof(*uuid));
+ if (diff < 0) {
+ p = p->rb_left;
+ } else if (diff > 0) {
+ p = p->rb_right;
+ } else {
+ if (test_bit(AFS_SERVER_FL_UNCREATED, &server->flags))
+ return NULL; /* Need a write lock */
+ afs_use_server(server, true, afs_server_trace_use_by_uuid);
+ return server;
+ }
+ }

- _leave(" = %p", server);
- return server;
+ return NULL;
}
/*
- * Install a server record in the namespace tree. If there's a clash, we stick
- * it into a list anchored on whichever afs_server struct is actually in the
- * tree.
+ * Install a server record in the cell tree. The caller must hold an exclusive
+ * lock on cell->fs_lock.
*/
static struct afs_server *afs_install_server(struct afs_cell *cell,
- struct afs_server *candidate)
+ struct afs_server **candidate)
{
- const struct afs_endpoint_state *estate;
- const struct afs_addr_list *alist;
- struct afs_server *server, *next;
+ struct afs_server *server;
struct afs_net *net = cell->net;
struct rb_node **pp, *p;
int diff;
_enter("%p", candidate);
- write_seqlock(&net->fs_lock);
-
/* Firstly install the server in the UUID lookup tree */
- pp = &net->fs_servers.rb_node;
+ pp = &cell->fs_servers.rb_node;
p = NULL;
while (*pp) {
p = *pp;
_debug("- consider %p", p);
server = rb_entry(p, struct afs_server, uuid_rb);
- diff = memcmp(&candidate->uuid, &server->uuid, sizeof(uuid_t));
- if (diff < 0) {
+ diff = memcmp(&(*candidate)->uuid, &server->uuid, sizeof(uuid_t));
+ if (diff < 0)
pp = &(*pp)->rb_left;
- } else if (diff > 0) {
+ else if (diff > 0)
pp = &(*pp)->rb_right;
- } else {
- if (server->cell == cell)
- goto exists;
-
- /* We have the same UUID representing servers in
- * different cells. Append the new server to the list.
- */
- for (;;) {
- next = rcu_dereference_protected(
- server->uuid_next,
- lockdep_is_held(&net->fs_lock.lock));
- if (!next)
- break;
- server = next;
- }
- rcu_assign_pointer(server->uuid_next, candidate);
- candidate->uuid_prev = server;
- server = candidate;
- goto added_dup;
- }
+ else
+ goto exists;
}
- server = candidate;
+ server = *candidate;
+ *candidate = NULL;
rb_link_node(&server->uuid_rb, p, pp);
- rb_insert_color(&server->uuid_rb, &net->fs_servers);
+ rb_insert_color(&server->uuid_rb, &cell->fs_servers);
+ write_seqlock(&net->fs_lock);
hlist_add_head_rcu(&server->proc_link, &net->fs_proc);
+ write_sequnlock(&net->fs_lock);
afs_get_cell(cell, afs_cell_trace_get_server);
-added_dup:
- write_seqlock(&net->fs_addr_lock);
- estate = rcu_dereference_protected(server->endpoint_state,
- lockdep_is_held(&net->fs_addr_lock.lock));
- alist = estate->addresses;
-
- /* Secondly, if the server has any IPv4 and/or IPv6 addresses, install
- * it in the IPv4 and/or IPv6 reverse-map lists.
- *
- * TODO: For speed we want to use something other than a flat list
- * here; even sorting the list in terms of lowest address would help a
- * bit, but anything we might want to do gets messy and memory
- * intensive.
- */
- if (alist->nr_addrs > 0)
- hlist_add_head_rcu(&server->addr_link, &net->fs_addresses);
-
- write_sequnlock(&net->fs_addr_lock);
-
exists:
- afs_get_server(server, afs_server_trace_get_install);
- write_sequnlock(&net->fs_lock);
+ afs_use_server(server, true, afs_server_trace_use_install);
return server;
}
/*
- * Allocate a new server record and mark it active.
+ * Allocate a new server record and mark it as active but uncreated.
*/
-static struct afs_server *afs_alloc_server(struct afs_cell *cell,
- const uuid_t *uuid,
- struct afs_addr_list *alist)
+static struct afs_server *afs_alloc_server(struct afs_cell *cell, const uuid_t *uuid)
{
- struct afs_endpoint_state *estate;
struct afs_server *server;
struct afs_net *net = cell->net;
@@ -205,65 +119,49 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
server = kzalloc(sizeof(struct afs_server), GFP_KERNEL);
if (!server)
- goto enomem;
-
- estate = kzalloc(sizeof(struct afs_endpoint_state), GFP_KERNEL);
- if (!estate)
- goto enomem_server;
+ return NULL;
refcount_set(&server->ref, 1);
- atomic_set(&server->active, 1);
+ atomic_set(&server->active, 0);
+ __set_bit(AFS_SERVER_FL_UNCREATED, &server->flags);
server->debug_id = atomic_inc_return(&afs_server_debug_id);
- server->addr_version = alist->version;
server->uuid = *uuid;
rwlock_init(&server->fs_lock);
+ INIT_WORK(&server->destroyer, &afs_server_destroyer);
+ timer_setup(&server->timer, afs_server_timer, 0);
INIT_LIST_HEAD(&server->volumes);
init_waitqueue_head(&server->probe_wq);
INIT_LIST_HEAD(&server->probe_link);
+ INIT_HLIST_NODE(&server->proc_link);
spin_lock_init(&server->probe_lock);
server->cell = cell;
server->rtt = UINT_MAX;
server->service_id = FS_SERVICE;
-
server->probe_counter = 1;
server->probed_at = jiffies - LONG_MAX / 2;
- refcount_set(&estate->ref, 1);
- estate->addresses = alist;
- estate->server_id = server->debug_id;
- estate->probe_seq = 1;
- rcu_assign_pointer(server->endpoint_state, estate);
afs_inc_servers_outstanding(net);
- trace_afs_server(server->debug_id, 1, 1, afs_server_trace_alloc);
- trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref),
- afs_estate_trace_alloc_server);
_leave(" = %p", server);
return server;
-
-enomem_server:
- kfree(server);
-enomem:
- _leave(" = NULL [nomem]");
- return NULL;
}
/*
* Look up an address record for a server
*/
-static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_cell *cell,
- struct key *key, const uuid_t *uuid)
+static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_server *server,
+ struct key *key)
{
struct afs_vl_cursor vc;
struct afs_addr_list *alist = NULL;
int ret;
ret = -ERESTARTSYS;
- if (afs_begin_vlserver_operation(&vc, cell, key)) {
+ if (afs_begin_vlserver_operation(&vc, server->cell, key)) {
while (afs_select_vlserver(&vc)) {
if (test_bit(AFS_VLSERVER_FL_IS_YFS, &vc.server->flags))
- alist = afs_yfsvl_get_endpoints(&vc, uuid);
+ alist = afs_yfsvl_get_endpoints(&vc, &server->uuid);
else
- alist = afs_vl_get_addrs_u(&vc, uuid);
+ alist = afs_vl_get_addrs_u(&vc, &server->uuid);
}
ret = afs_end_vlserver_operation(&vc);
@@ -273,72 +171,122 @@ static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_cell *cell,
}
/*
- * Get or create a fileserver record.
+ * Get or create a fileserver record and return it with an active-use count on
+ * it.
*/
struct afs_server *afs_lookup_server(struct afs_cell *cell, struct key *key,
const uuid_t *uuid, u32 addr_version)
{
- struct afs_addr_list *alist;
- struct afs_server *server, *candidate;
+ struct afs_addr_list *alist = NULL;
+ struct afs_server *server, *candidate = NULL;
+ bool creating = false;
+ int ret;
_enter("%p,%pU", cell->net, uuid);
- server = afs_find_server_by_uuid(cell->net, uuid);
+ down_read(&cell->fs_lock);
+ server = afs_find_server_by_uuid(cell, uuid);
+ /* Won't see servers marked uncreated. */
+ up_read(&cell->fs_lock);
+
if (server) {
+ timer_delete_sync(&server->timer);
+ if (test_bit(AFS_SERVER_FL_CREATING, &server->flags))
+ goto wait_for_creation;
if (server->addr_version != addr_version)
set_bit(AFS_SERVER_FL_NEEDS_UPDATE, &server->flags);
return server;
}
- alist = afs_vl_lookup_addrs(cell, key, uuid);
- if (IS_ERR(alist))
- return ERR_CAST(alist);
-
- candidate = afs_alloc_server(cell, uuid, alist);
+ candidate = afs_alloc_server(cell, uuid);
if (!candidate) {
afs_put_addrlist(alist, afs_alist_trace_put_server_oom);
return ERR_PTR(-ENOMEM);
}
- server = afs_install_server(cell, candidate);
- if (server != candidate) {
- afs_put_addrlist(alist, afs_alist_trace_put_server_dup);
+ down_write(&cell->fs_lock);
+ server = afs_install_server(cell, &candidate);
+ if (test_bit(AFS_SERVER_FL_CREATING, &server->flags)) {
+ /* We need to wait for creation to complete. */
+ up_write(&cell->fs_lock);
+ goto wait_for_creation;
+ }
+ if (test_bit(AFS_SERVER_FL_UNCREATED, &server->flags)) {
+ set_bit(AFS_SERVER_FL_CREATING, &server->flags);
+ clear_bit(AFS_SERVER_FL_UNCREATED, &server->flags);
+ creating = true;
+ }
+ up_write(&cell->fs_lock);
+ timer_delete_sync(&server->timer);
+
+ /* If we get to create the server, we look up the addresses and then
+ * immediately dispatch an asynchronous probe to each interface on the
+ * fileserver. This will make sure the repeat-probing service is
+ * started.
+ */
+ if (creating) {
+ alist = afs_vl_lookup_addrs(server, key);
+ if (IS_ERR(alist)) {
+ ret = PTR_ERR(alist);
+ goto create_failed;
+ }
+
+ ret = afs_fs_probe_fileserver(cell->net, server, alist, key);
+ if (ret)
+ goto create_failed;
+
+ clear_and_wake_up_bit(AFS_SERVER_FL_CREATING, &server->flags);
+ }
+
+out:
+ afs_put_addrlist(alist, afs_alist_trace_put_server_create);
+ if (candidate) {
+ kfree(rcu_access_pointer(candidate->endpoint_state));
kfree(candidate);
- } else {
- /* Immediately dispatch an asynchronous probe to each interface
- * on the fileserver. This will make sure the repeat-probing
- * service is started.
- */
- afs_fs_probe_fileserver(cell->net, server, alist, key);
+ afs_dec_servers_outstanding(cell->net);
+ }
+ return server ?: ERR_PTR(ret);
+
+wait_for_creation:
+ afs_see_server(server, afs_server_trace_wait_create);
+ wait_on_bit(&server->flags, AFS_SERVER_FL_CREATING, TASK_UNINTERRUPTIBLE);
+ if (test_bit_acquire(AFS_SERVER_FL_UNCREATED, &server->flags)) {
+ /* Barrier: read flag before error */
+ ret = READ_ONCE(server->create_error);
+ afs_put_server(cell->net, server, afs_server_trace_unuse_create_fail);
+ server = NULL;
+ goto out;
}
- return server;
-}
+ ret = 0;
+ goto out;

-/*
- * Set the server timer to fire after a given delay, assuming it's not already
- * set for an earlier time.
- */
-static void afs_set_server_timer(struct afs_net *net, time64_t delay)
-{
- if (net->live) {
- afs_inc_servers_outstanding(net);
- if (timer_reduce(&net->fs_timer, jiffies + delay * HZ))
- afs_dec_servers_outstanding(net);
+create_failed:
+ down_write(&cell->fs_lock);
+
+ WRITE_ONCE(server->create_error, ret);
+ smp_wmb(); /* Barrier: set error before flag. */
+ set_bit(AFS_SERVER_FL_UNCREATED, &server->flags);
+
+ clear_and_wake_up_bit(AFS_SERVER_FL_CREATING, &server->flags);
+
+ if (test_bit(AFS_SERVER_FL_UNCREATED, &server->flags)) {
+ clear_bit(AFS_SERVER_FL_UNCREATED, &server->flags);
+ creating = true;
}
+ afs_unuse_server(cell->net, server, afs_server_trace_unuse_create_fail);
+ server = NULL;
+
+ up_write(&cell->fs_lock);
+ goto out;
}
/*
- * Server management timer. We have an increment on fs_outstanding that we
- * need to pass along to the work item.
+ * Set/reduce a server's timer.
*/
-void afs_servers_timer(struct timer_list *timer)
+static void afs_set_server_timer(struct afs_server *server, unsigned int delay_secs)
{
- struct afs_net *net = container_of(timer, struct afs_net, fs_timer);
-
- _enter("");
- if (!queue_work(afs_wq, &net->fs_manager))
- afs_dec_servers_outstanding(net);
+ mod_timer(&server->timer, jiffies + delay_secs * HZ);
}
/*
@@ -357,32 +305,20 @@ struct afs_server *afs_get_server(struct afs_server *server,
}
/*
- * Try to get a reference on a server object.
+ * Get an active count on a server object and maybe remove from the inactive
+ * list.
*/
-static struct afs_server *afs_maybe_use_server(struct afs_server *server,
- enum afs_server_trace reason)
-{
- unsigned int a;
- int r;
-
- if (!__refcount_inc_not_zero(&server->ref, &r))
- return NULL;
-
- a = atomic_inc_return(&server->active);
- trace_afs_server(server->debug_id, r + 1, a, reason);
- return server;
-}
-
-/*
- * Get an active count on a server object.
- */
-struct afs_server *afs_use_server(struct afs_server *server, enum afs_server_trace reason)
+struct afs_server *afs_use_server(struct afs_server *server, bool activate,
+ enum afs_server_trace reason)
{
unsigned int a;
int r;
__refcount_inc(&server->ref, &r);
a = atomic_inc_return(&server->active);
+ if (a == 1 && activate &&
+ !test_bit(AFS_SERVER_FL_EXPIRED, &server->flags))
+ del_timer(&server->timer);
trace_afs_server(server->debug_id, r + 1, a, reason);
return server;
@@ -415,13 +351,16 @@ void afs_put_server(struct afs_net *net, struct afs_server *server,
void afs_unuse_server_notime(struct afs_net *net, struct afs_server *server,
enum afs_server_trace reason)
{
- if (server) {
- unsigned int active = atomic_dec_return(&server->active);
+ if (!server)
+ return;

- if (active == 0)
- afs_set_server_timer(net, afs_server_gc_delay);
- afs_put_server(net, server, reason);
+ if (atomic_dec_and_test(&server->active)) {
+ if (test_bit(AFS_SERVER_FL_EXPIRED, &server->flags) ||
+ READ_ONCE(server->cell->state) >= AFS_CELL_REMOVING)
+ schedule_work(&server->destroyer);
}
+
+ afs_put_server(net, server, reason);
}
/*
@@ -430,10 +369,22 @@ void afs_unuse_server_notime(struct afs_net *net, struct afs_server *server,
void afs_unuse_server(struct afs_net *net, struct afs_server *server,
enum afs_server_trace reason)
{
- if (server) {
- server->unuse_time = ktime_get_real_seconds();
- afs_unuse_server_notime(net, server, reason);
+ if (!server)
+ return;
+
+ if (atomic_dec_and_test(&server->active)) {
+ if (!test_bit(AFS_SERVER_FL_EXPIRED, &server->flags) &&
+ READ_ONCE(server->cell->state) < AFS_CELL_REMOVING) {
+ time64_t unuse_time = ktime_get_real_seconds();
+
+ server->unuse_time = unuse_time;
+ afs_set_server_timer(server, afs_server_gc_delay);
+ } else {
+ schedule_work(&server->destroyer);
+ }
}
+
+ afs_put_server(net, server, reason);
}
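
The reworked put paths keep two counters per server: ->ref pins the memory
while ->active counts users of the record. The last active drop either arms
the per-server expiry timer (the normal case) or kicks the destroyer directly
(record expired or cell going away), and only then releases the memory
reference. A condensed sketch with hypothetical names:

	static void example_unuse(struct example_obj *obj, bool dying)
	{
		if (atomic_dec_and_test(&obj->active)) {
			if (dying)
				schedule_work(&obj->destroyer);
			else
				mod_timer(&obj->timer, jiffies + GC_DELAY * HZ);
		}
		example_put(obj);	/* drop the memory reference last */
	}
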
static void afs_server_rcu(struct rcu_head *rcu)
@@ -463,159 +414,119 @@ static void afs_give_up_callbacks(struct afs_net *net, struct afs_server *server
}
/*
- * destroy a dead server
+ * Check to see if the server record has expired.
*/
-static void afs_destroy_server(struct afs_net *net, struct afs_server *server)
+static bool afs_has_server_expired(const struct afs_server *server)
{
- if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags))
- afs_give_up_callbacks(net, server);
+ time64_t expires_at;

- afs_put_server(net, server, afs_server_trace_destroy);
+ if (atomic_read(&server->active))
+ return false;
+
+ if (!server->cell->net->live ||
+ server->cell->state >= AFS_CELL_REMOVING) {
+ trace_afs_server(server->debug_id, refcount_read(&server->ref),
+ 0, afs_server_trace_purging);
+ return true;
+ }
+
+ expires_at = server->unuse_time;
+ if (!test_bit(AFS_SERVER_FL_VL_FAIL, &server->flags) &&
+ !test_bit(AFS_SERVER_FL_NOT_FOUND, &server->flags))
+ expires_at += afs_server_gc_delay;
+
+ return ktime_get_real_seconds() > expires_at;
}
/*
- * Garbage collect any expired servers.
+ * Remove a server record from its parent cell's database.
*/
-static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list)
+static bool afs_remove_server_from_cell(struct afs_server *server)
{
- struct afs_server *server, *next, *prev;
- int active;
-
- while ((server = gc_list)) {
- gc_list = server->gc_next;
-
- write_seqlock(&net->fs_lock);
-
- active = atomic_read(&server->active);
- if (active == 0) {
- trace_afs_server(server->debug_id, refcount_read(&server->ref),
- active, afs_server_trace_gc);
- next = rcu_dereference_protected(
- server->uuid_next, lockdep_is_held(&net->fs_lock.lock));
- prev = server->uuid_prev;
- if (!prev) {
- /* The one at the front is in the tree */
- if (!next) {
- rb_erase(&server->uuid_rb, &net->fs_servers);
- } else {
- rb_replace_node_rcu(&server->uuid_rb,
- &next->uuid_rb,
- &net->fs_servers);
- next->uuid_prev = NULL;
- }
- } else {
- /* This server is not at the front */
- rcu_assign_pointer(prev->uuid_next, next);
- if (next)
- next->uuid_prev = prev;
- }
-
- list_del(&server->probe_link);
- hlist_del_rcu(&server->proc_link);
- if (!hlist_unhashed(&server->addr_link))
- hlist_del_rcu(&server->addr_link);
- }
- write_sequnlock(&net->fs_lock);
+ struct afs_cell *cell = server->cell;
+
+ down_write(&cell->fs_lock);

- if (active == 0)
- afs_destroy_server(net, server);
+ if (!afs_has_server_expired(server)) {
+ up_write(&cell->fs_lock);
+ return false;
}
+
+ set_bit(AFS_SERVER_FL_EXPIRED, &server->flags);
+ _debug("expire %pU %u", &server->uuid, atomic_read(&server->active));
+ afs_see_server(server, afs_server_trace_see_expired);
+ rb_erase(&server->uuid_rb, &cell->fs_servers);
+ up_write(&cell->fs_lock);
+ return true;
}
-/*
- * Manage the records of servers known to be within a network namespace. This
- * includes garbage collecting unused servers.
- *
- * Note also that we were given an increment on net->servers_outstanding by
- * whoever queued us that we need to deal with before returning.
- */
-void afs_manage_servers(struct work_struct *work)
+static void afs_server_destroyer(struct work_struct *work)
{
- struct afs_net *net = container_of(work, struct afs_net, fs_manager);
- struct afs_server *gc_list = NULL;
- struct rb_node *cursor;
- time64_t now = ktime_get_real_seconds(), next_manage = TIME64_MAX;
- bool purging = !net->live;
-
- _enter("");
+ struct afs_endpoint_state *estate;
+ struct afs_server *server = container_of(work, struct afs_server, destroyer);
+ struct afs_net *net = server->cell->net;
- /* Trawl the server list looking for servers that have expired from
- * lack of use.
- */
- read_seqlock_excl(&net->fs_lock);
+ afs_see_server(server, afs_server_trace_see_destroyer);

- for (cursor = rb_first(&net->fs_servers); cursor; cursor = rb_next(cursor)) {
- struct afs_server *server =
- rb_entry(cursor, struct afs_server, uuid_rb);
- int active = atomic_read(&server->active);
+ if (test_bit(AFS_SERVER_FL_EXPIRED, &server->flags))
+ return;

- _debug("manage %pU %u", &server->uuid, active);

+ if (!afs_remove_server_from_cell(server))
+ return;
- if (purging) {
- trace_afs_server(server->debug_id, refcount_read(&server->ref),
- active, afs_server_trace_purging);
- if (active != 0)
- pr_notice("Can't purge s=%08x\n", server->debug_id);
- }
+ timer_shutdown_sync(&server->timer);
+ cancel_work(&server->destroyer);

- if (active == 0) {
- time64_t expire_at = server->unuse_time;
-
- if (!test_bit(AFS_SERVER_FL_VL_FAIL, &server->flags) &&
- !test_bit(AFS_SERVER_FL_NOT_FOUND, &server->flags))
- expire_at += afs_server_gc_delay;
- if (purging || expire_at <= now) {
- server->gc_next = gc_list;
- gc_list = server;
- } else if (expire_at < next_manage) {
- next_manage = expire_at;
- }
- }
- }
+ if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags))
+ afs_give_up_callbacks(net, server);

- read_sequnlock_excl(&net->fs_lock);
+ /* Unbind the rxrpc_peer records from the server. */
+ estate = rcu_access_pointer(server->endpoint_state);
+ if (estate)
+ afs_set_peer_appdata(server, estate->addresses, NULL);

- /* Update the timer on the way out. We have to pass an increment on
- * servers_outstanding in the namespace that we are in to the timer or
- * the work scheduler.
- */
- if (!purging && next_manage < TIME64_MAX) {
- now = ktime_get_real_seconds();
+ write_seqlock(&net->fs_lock);
+ list_del_init(&server->probe_link);
+ if (!hlist_unhashed(&server->proc_link))
+ hlist_del_rcu(&server->proc_link);
+ write_sequnlock(&net->fs_lock);

- if (next_manage - now <= 0) {
- if (queue_work(afs_wq, &net->fs_manager))
- afs_inc_servers_outstanding(net);
- } else {
- afs_set_server_timer(net, next_manage - now);
- }
- }
+ afs_put_server(net, server, afs_server_trace_destroy);
+}

- afs_gc_servers(net, gc_list);
+static void afs_server_timer(struct timer_list *timer)
+{
+ struct afs_server *server = container_of(timer, struct afs_server, timer);

- afs_dec_servers_outstanding(net);
- _leave(" [%d]", atomic_read(&net->servers_outstanding));
+ afs_see_server(server, afs_server_trace_see_timer);
+ if (!test_bit(AFS_SERVER_FL_EXPIRED, &server->flags))
+ schedule_work(&server->destroyer);
}
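
afs_server_timer() only queues work because timer callbacks run in atomic
context, whereas the destroyer takes cell->fs_lock (an rwsem) and may issue
give-up-callbacks RPCs, both of which can sleep. The deferral pattern, with
illustrative names:

	static void example_timer(struct timer_list *t)
	{
		struct example_obj *obj = container_of(t, struct example_obj, timer);

		/* No sleeping here; punt teardown to process context. */
		schedule_work(&obj->destroyer);
	}
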
-static void afs_queue_server_manager(struct afs_net *net)
+/*
+ * Wake up all the servers in a cell so that they can purge themselves.
+ */
+void afs_purge_servers(struct afs_cell *cell)
{
- afs_inc_servers_outstanding(net);
- if (!queue_work(afs_wq, &net->fs_manager))
- afs_dec_servers_outstanding(net);
+ struct afs_server *server;
+ struct rb_node *rb;
+
+ down_read(&cell->fs_lock);
+ for (rb = rb_first(&cell->fs_servers); rb; rb = rb_next(rb)) {
+ server = rb_entry(rb, struct afs_server, uuid_rb);
+ afs_see_server(server, afs_server_trace_see_purge);
+ schedule_work(&server->destroyer);
+ }
+ up_read(&cell->fs_lock);
}
/*
- * Purge list of servers.
+ * Wait for outstanding servers.
*/
-void afs_purge_servers(struct afs_net *net)
+void afs_wait_for_servers(struct afs_net *net)
{
_enter("");
- if (del_timer_sync(&net->fs_timer))
- afs_dec_servers_outstanding(net);
-
- afs_queue_server_manager(net);
-
- _debug("wait");
atomic_dec(&net->servers_outstanding);
wait_var_event(&net->servers_outstanding,
!atomic_read(&net->servers_outstanding));
@@ -639,7 +550,7 @@ static noinline bool afs_update_server_record(struct afs_operation *op,
atomic_read(&server->active),
afs_server_trace_update);
- alist = afs_vl_lookup_addrs(op->volume->cell, op->key, &server->uuid);
+ alist = afs_vl_lookup_addrs(server, op->key);
if (IS_ERR(alist)) {
rcu_read_lock();
estate = rcu_dereference(server->endpoint_state);
diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c
index d20cd902ef94..20d5474837df 100644
--- a/fs/afs/server_list.c
+++ b/fs/afs/server_list.c
@@ -16,7 +16,7 @@ void afs_put_serverlist(struct afs_net *net, struct afs_server_list *slist)
if (slist && refcount_dec_and_test(&slist->usage)) {
for (i = 0; i < slist->nr_servers; i++)
afs_unuse_server(net, slist->servers[i].server,
- afs_server_trace_put_slist);
+ afs_server_trace_unuse_slist);
kfree_rcu(slist, rcu);
}
}
@@ -97,8 +97,8 @@ struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume,
break;
if (j < slist->nr_servers) {
if (slist->servers[j].server == server) {
- afs_unuse_server(volume->cell->net, server,
- afs_server_trace_put_slist_isort);
+ afs_unuse_server_notime(volume->cell->net, server,
+ afs_server_trace_unuse_slist_isort);
continue;
}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index a9bee610674e..25b306db6992 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -194,8 +194,6 @@ static int afs_show_options(struct seq_file *m, struct dentry *root)
if (as->dyn_root)
seq_puts(m, ",dyn");
- if (test_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(d_inode(root))->flags))
- seq_puts(m, ",autocell");
switch (as->flock_mode) {
case afs_flock_mode_unset: break;
case afs_flock_mode_local: p = "local"; break;
@@ -292,13 +290,14 @@ static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param)
/* lookup the cell record */
if (cellname) {
cell = afs_lookup_cell(ctx->net, cellname, cellnamesz,
- NULL, false);
+ NULL, false,
+ afs_cell_trace_use_lookup_mount);
if (IS_ERR(cell)) {
pr_err("kAFS: unable to lookup cell '%*.*s'\n",
cellnamesz, cellnamesz, cellname ?: "");
return PTR_ERR(cell);
}
- afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_parse);
+ afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_parse);
afs_see_cell(cell, afs_cell_trace_see_source);
ctx->cell = cell;
}
@@ -395,7 +394,7 @@ static int afs_validate_fc(struct fs_context *fc)
ctx->key = NULL;
cell = afs_use_cell(ctx->cell->alias_of,
afs_cell_trace_use_fc_alias);
- afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_fc);
+ afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_fc);
ctx->cell = cell;
goto reget_key;
}
@@ -468,7 +467,7 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
/* allocate the root inode and dentry */
if (as->dyn_root) {
- inode = afs_iget_pseudo_dir(sb, true);
+ inode = afs_dynroot_iget_root(sb);
} else {
sprintf(sb->s_id, "%llu", as->volume->vid);
afs_activate_volume(as->volume);
@@ -478,9 +477,6 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (ctx->autocell || as->dyn_root)
- set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags);
-
ret = -ENOMEM;
sb->s_root = d_make_root(inode);
if (!sb->s_root)
@@ -488,9 +484,6 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
if (as->dyn_root) {
sb->s_d_op = &afs_dynroot_dentry_operations;
- ret = afs_dynroot_populate(sb);
- if (ret < 0)
- goto error;
} else {
sb->s_d_op = &afs_fs_dentry_operations;
rcu_assign_pointer(as->volume->sb, sb);
@@ -527,9 +520,8 @@ static struct afs_super_info *afs_alloc_sbi(struct fs_context *fc)
static void afs_destroy_sbi(struct afs_super_info *as)
{
if (as) {
- struct afs_net *net = afs_net(as->net_ns);
afs_put_volume(as->volume, afs_volume_trace_put_destroy_sbi);
- afs_unuse_cell(net, as->cell, afs_cell_trace_unuse_sbi);
+ afs_unuse_cell(as->cell, afs_cell_trace_unuse_sbi);
put_net(as->net_ns);
kfree(as);
}
@@ -539,9 +531,6 @@ static void afs_kill_super(struct super_block *sb)
{
struct afs_super_info *as = AFS_FS_S(sb);
- if (as->dyn_root)
- afs_dynroot_depopulate(sb);
-
/* Clear the callback interests (which will do ilookup5) before
* deactivating the superblock.
*/
@@ -615,7 +604,7 @@ static void afs_free_fc(struct fs_context *fc)
afs_destroy_sbi(fc->s_fs_info);
afs_put_volume(ctx->volume, afs_volume_trace_put_free_fc);
- afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_fc);
+ afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_fc);
key_put(ctx->key);
kfree(ctx);
}
diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c
index f9e76b604f31..709b4cdb723e 100644
--- a/fs/afs/vl_alias.c
+++ b/fs/afs/vl_alias.c
@@ -205,11 +205,11 @@ static int afs_query_for_alias(struct afs_cell *cell, struct key *key)
goto is_alias;
if (mutex_lock_interruptible(&cell->net->proc_cells_lock) < 0) {
- afs_unuse_cell(cell->net, p, afs_cell_trace_unuse_check_alias);
+ afs_unuse_cell(p, afs_cell_trace_unuse_check_alias);
return -ERESTARTSYS;
}
- afs_unuse_cell(cell->net, p, afs_cell_trace_unuse_check_alias);
+ afs_unuse_cell(p, afs_cell_trace_unuse_check_alias);
}
mutex_unlock(&cell->net->proc_cells_lock);
@@ -269,7 +269,8 @@ static int yfs_check_canonical_cell_name(struct afs_cell *cell, struct key *key)
if (!name_len || name_len > AFS_MAXCELLNAME)
master = ERR_PTR(-EOPNOTSUPP);
else
- master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, false);
+ master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, false,
+ afs_cell_trace_use_lookup_canonical);
kfree(cell_name);
if (IS_ERR(master))
return PTR_ERR(master);
diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c
index d8f79f6ada3d..6ad9688d8f4b 100644
--- a/fs/afs/vl_rotate.c
+++ b/fs/afs/vl_rotate.c
@@ -48,7 +48,7 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
cell->dns_expiry <= ktime_get_real_seconds()) {
dns_lookup_count = smp_load_acquire(&cell->dns_lookup_count);
set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags);
- afs_queue_cell(cell, afs_cell_trace_get_queue_dns);
+ afs_queue_cell(cell, afs_cell_trace_queue_dns);
if (cell->dns_source == DNS_RECORD_UNAVAILABLE) {
if (wait_var_event_interruptible(
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index af3a3f57c1b3..0efff3d25133 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -10,6 +10,7 @@
#include "internal.h"
static unsigned __read_mostly afs_volume_record_life = 60 * 60;
+static atomic_t afs_volume_debug_id;
static void afs_destroy_volume(struct work_struct *work);
@@ -59,7 +60,7 @@ static void afs_remove_volume_from_cell(struct afs_volume *volume)
struct afs_cell *cell = volume->cell;
if (!hlist_unhashed(&volume->proc_link)) {
- trace_afs_volume(volume->vid, refcount_read(&cell->ref),
+ trace_afs_volume(volume->debug_id, volume->vid, refcount_read(&volume->ref),
afs_volume_trace_remove);
write_seqlock(&cell->volume_lock);
hlist_del_rcu(&volume->proc_link);
@@ -84,6 +85,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
if (!volume)
goto error_0;
+ volume->debug_id = atomic_inc_return(&afs_volume_debug_id);
volume->vid = vldb->vid[params->type];
volume->update_at = ktime_get_real_seconds() + afs_volume_record_life;
volume->cell = afs_get_cell(params->cell, afs_cell_trace_get_vol);
@@ -115,7 +117,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
*_slist = slist;
rcu_assign_pointer(volume->servers, slist);
- trace_afs_volume(volume->vid, 1, afs_volume_trace_alloc);
+ trace_afs_volume(volume->debug_id, volume->vid, 1, afs_volume_trace_alloc);
return volume;
error_1:
@@ -247,7 +249,7 @@ static void afs_destroy_volume(struct work_struct *work)
afs_remove_volume_from_cell(volume);
afs_put_serverlist(volume->cell->net, slist);
afs_put_cell(volume->cell, afs_cell_trace_put_vol);
- trace_afs_volume(volume->vid, refcount_read(&volume->ref),
+ trace_afs_volume(volume->debug_id, volume->vid, refcount_read(&volume->ref),
afs_volume_trace_free);
kfree_rcu(volume, rcu);
@@ -262,7 +264,7 @@ bool afs_try_get_volume(struct afs_volume *volume, enum afs_volume_trace reason)
int r;
if (__refcount_inc_not_zero(&volume->ref, &r)) {
- trace_afs_volume(volume->vid, r + 1, reason);
+ trace_afs_volume(volume->debug_id, volume->vid, r + 1, reason);
return true;
}
return false;
@@ -278,7 +280,7 @@ struct afs_volume *afs_get_volume(struct afs_volume *volume,
int r;
__refcount_inc(&volume->ref, &r);
- trace_afs_volume(volume->vid, r + 1, reason);
+ trace_afs_volume(volume->debug_id, volume->vid, r + 1, reason);
}
return volume;
}
@@ -290,12 +292,13 @@ struct afs_volume *afs_get_volume(struct afs_volume *volume,
void afs_put_volume(struct afs_volume *volume, enum afs_volume_trace reason)
{
if (volume) {
+ unsigned int debug_id = volume->debug_id;
afs_volid_t vid = volume->vid;
bool zero;
int r;
zero = __refcount_dec_and_test(&volume->ref, &r);
- trace_afs_volume(vid, r - 1, reason);
+ trace_afs_volume(debug_id, vid, r - 1, reason);
if (zero)
schedule_work(&volume->destructor);
}
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 77c7991d89aa..23cea74f9933 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -218,6 +218,8 @@ void autofs_clean_ino(struct autofs_info *);
static inline int autofs_check_pipe(struct file *pipe)
{
+ if (pipe->f_mode & FMODE_PATH)
+ return -EINVAL;
if (!(pipe->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
if (!S_ISFIFO(file_inode(pipe)->i_mode))
diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
index 6d57efbb8110..c5a6aae12d2c 100644
--- a/fs/autofs/dev-ioctl.c
+++ b/fs/autofs/dev-ioctl.c
@@ -442,7 +442,6 @@ static int autofs_dev_ioctl_timeout(struct file *fp,
sbi->exp_timeout = timeout * HZ;
} else {
struct dentry *base = fp->f_path.dentry;
- struct inode *inode = base->d_inode;
int path_len = param->size - AUTOFS_DEV_IOCTL_SIZE - 1;
struct dentry *dentry;
struct autofs_info *ino;
@@ -460,9 +459,7 @@ static int autofs_dev_ioctl_timeout(struct file *fp,
"the parent autofs mount timeout which could "
"prevent shutdown\n");
- inode_lock_shared(inode);
dentry = try_lookup_one_len(param->path, base, path_len);
- inode_unlock_shared(inode);
if (IS_ERR_OR_NULL(dentry))
return dentry ? PTR_ERR(dentry) : -ENOENT;
ino = autofs_dentry_ino(dentry);
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 530d18827e35..174c7205fee4 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -15,8 +15,8 @@ static int autofs_dir_symlink(struct mnt_idmap *, struct inode *,
struct dentry *, const char *);
static int autofs_dir_unlink(struct inode *, struct dentry *);
static int autofs_dir_rmdir(struct inode *, struct dentry *);
-static int autofs_dir_mkdir(struct mnt_idmap *, struct inode *,
- struct dentry *, umode_t);
+static struct dentry *autofs_dir_mkdir(struct mnt_idmap *, struct inode *,
+ struct dentry *, umode_t);
static long autofs_root_ioctl(struct file *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static long autofs_root_compat_ioctl(struct file *,
@@ -720,9 +720,9 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry)
return 0;
}
-static int autofs_dir_mkdir(struct mnt_idmap *idmap,
- struct inode *dir, struct dentry *dentry,
- umode_t mode)
+static struct dentry *autofs_dir_mkdir(struct mnt_idmap *idmap,
+ struct inode *dir, struct dentry *dentry,
+ umode_t mode)
{
struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
struct autofs_info *ino = autofs_dentry_ino(dentry);
@@ -739,7 +739,7 @@ static int autofs_dir_mkdir(struct mnt_idmap *idmap,
inode = autofs_get_inode(dir->i_sb, S_IFDIR | mode);
if (!inode)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
d_add(dentry, inode);
if (sbi->version < 5)
@@ -751,7 +751,7 @@ static int autofs_dir_mkdir(struct mnt_idmap *idmap,
inc_nlink(dir);
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
- return 0;
+ return NULL;
}
/* Get/set timeout ioctl() operation */
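
This is one instance of a tree-wide conversion of the ->mkdir() inode
operation from returning an int to returning a struct dentry pointer, which
the bad_inode and bcachefs hunks below repeat. A sketch of the converted
contract (example_mkdir_work() is a hypothetical helper): NULL on success
where the passed-in dentry was instantiated, ERR_PTR() on failure. Since
ERR_PTR(0) is NULL, bch2_mkdir can simply wrap its old int result in
ERR_PTR():

	static struct dentry *example_mkdir(struct mnt_idmap *idmap,
					    struct inode *dir,
					    struct dentry *dentry, umode_t mode)
	{
		int err = example_mkdir_work(dir, dentry, mode | S_IFDIR);

		return err ? ERR_PTR(err) : NULL;
	}
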
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 316d88da2ce1..0ef9bcb744dd 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -58,10 +58,10 @@ static int bad_inode_symlink(struct mnt_idmap *idmap,
return -EIO;
}
-static int bad_inode_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *bad_inode_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
- return -EIO;
+ return ERR_PTR(-EIO);
}
static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry)
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 15725b4ce393..595b57fabc9a 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -511,10 +511,6 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
ret = -EXDEV;
goto err;
}
- if (!d_is_positive(victim)) {
- ret = -ENOENT;
- goto err;
- }
ret = __bch2_unlink(dir, victim, true);
if (!ret) {
fsnotify_rmdir(dir, victim);
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 90ade8f648d9..b2669d7ffec5 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -858,10 +858,10 @@ err:
return bch2_err_class(ret);
}
-static int bch2_mkdir(struct mnt_idmap *idmap,
- struct inode *vdir, struct dentry *dentry, umode_t mode)
+static struct dentry *bch2_mkdir(struct mnt_idmap *idmap,
+ struct inode *vdir, struct dentry *dentry, umode_t mode)
{
- return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0);
+ return ERR_PTR(bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0));
}
static int bch2_rename2(struct mnt_idmap *idmap,
@@ -2396,7 +2396,7 @@ static struct file_system_type bcache_fs_type = {
.name = "bcachefs",
.init_fs_context = bch2_init_fs_context,
.kill_sb = bch2_kill_sb,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_LBS,
};
MODULE_ALIAS_FS("bcachefs");
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 8054f44d39cf..584fa89bc877 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -762,8 +762,7 @@ static int parse_elf_property(const char *data, size_t *off, size_t datasz,
}
#define NOTE_DATA_SZ SZ_1K
-#define GNU_PROPERTY_TYPE_0_NAME "GNU"
-#define NOTE_NAME_SZ (sizeof(GNU_PROPERTY_TYPE_0_NAME))
+#define NOTE_NAME_SZ (sizeof(NN_GNU_PROPERTY_TYPE_0))
static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr,
struct arch_elf_state *arch)
@@ -800,7 +799,7 @@ static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr,
if (note.nhdr.n_type != NT_GNU_PROPERTY_TYPE_0 ||
note.nhdr.n_namesz != NOTE_NAME_SZ ||
strncmp(note.data + sizeof(note.nhdr),
- GNU_PROPERTY_TYPE_0_NAME, n - sizeof(note.nhdr)))
+ NN_GNU_PROPERTY_TYPE_0, n - sizeof(note.nhdr)))
return -ENOEXEC;
off = round_up(sizeof(note.nhdr) + NOTE_NAME_SZ,
@@ -1603,14 +1602,14 @@ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm)
do
i += 2;
while (auxv[i - 2] != AT_NULL);
- fill_note(note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv);
+ fill_note(note, NN_AUXV, NT_AUXV, i * sizeof(elf_addr_t), auxv);
}
static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata,
const kernel_siginfo_t *siginfo)
{
copy_siginfo_to_external(csigdata, siginfo);
- fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata);
+ fill_note(note, NN_SIGINFO, NT_SIGINFO, sizeof(*csigdata), csigdata);
}
/*
@@ -1706,7 +1705,7 @@ static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm
}
size = name_curpos - (char *)data;
- fill_note(note, "CORE", NT_FILE, size, data);
+ fill_note(note, NN_FILE, NT_FILE, size, data);
return 0;
}
@@ -1767,7 +1766,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
regset_get(t->task, &view->regsets[0],
sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg);
- fill_note(&t->notes[0], "CORE", NT_PRSTATUS,
+ fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS,
PRSTATUS_SIZE, &t->prstatus);
info->size += notesize(&t->notes[0]);
@@ -1801,7 +1800,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
if (is_fpreg)
SET_PR_FPVALID(&t->prstatus);
- fill_note(&t->notes[note_iter], is_fpreg ? "CORE" : "LINUX",
+ fill_note(&t->notes[note_iter], is_fpreg ? NN_PRFPREG : "LINUX",
note_type, ret, data);
info->size += notesize(&t->notes[note_iter]);
@@ -1821,7 +1820,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
fill_prstatus(&t->prstatus.common, p, signr);
elf_core_copy_task_regs(p, &t->prstatus.pr_reg);
- fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
+ fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, sizeof(t->prstatus),
&(t->prstatus));
info->size += notesize(&t->notes[0]);
@@ -1832,7 +1831,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
}
t->prstatus.pr_fpvalid = 1;
- fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(*fpu), fpu);
+ fill_note(&t->notes[1], NN_PRFPREG, NT_PRFPREG, sizeof(*fpu), fpu);
info->size += notesize(&t->notes[1]);
return 1;
@@ -1852,7 +1851,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
if (!psinfo)
return 0;
- fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
+ fill_note(&info->psinfo, NN_PRPSINFO, NT_PRPSINFO, sizeof(*psinfo), psinfo);
#ifdef CORE_DUMP_USE_REGSET
view = task_user_regset_view(dump_task);
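The NN_* macros introduced here pair every core-note type with its name
string, so a call site can no longer combine an NT_* constant with the wrong
name literal. Judging from the literals they replace in these hunks, the
definitions mirror the old strings; a sketch:

	#define NN_PRSTATUS		"CORE"
	#define NN_PRFPREG		"CORE"
	#define NN_GNU_PROPERTY_TYPE_0	"GNU"

	fill_note(&note, NN_PRSTATUS, NT_PRSTATUS, sizeof(prstatus), &prstatus);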
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index c13ee8180b17..9133f3827f90 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1024,7 +1024,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
/* deal with each load segment separately */
phdr = params->phdrs;
for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) {
- unsigned long maddr, disp, excess, excess1;
+ unsigned long maddr, disp, excess;
int prot = 0, flags;
if (phdr->p_type != PT_LOAD)
@@ -1120,9 +1120,10 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
* extant in the file
*/
excess = phdr->p_memsz - phdr->p_filesz;
- excess1 = PAGE_SIZE - ((maddr + phdr->p_filesz) & ~PAGE_MASK);
#ifdef CONFIG_MMU
+		unsigned long excess1 =
+			PAGE_SIZE - ((maddr + phdr->p_filesz) & ~PAGE_MASK);
if (excess > excess1) {
unsigned long xaddr = maddr + phdr->p_filesz + excess1;
unsigned long xmaddr;
@@ -1397,7 +1398,7 @@ static struct elf_thread_status *elf_dump_thread_status(long signr, struct task_
regset_get(p, &view->regsets[0],
sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg);
- fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
+ fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, sizeof(t->prstatus),
&t->prstatus);
t->num_notes++;
*sz += notesize(&t->notes[0]);
@@ -1415,7 +1416,7 @@ static struct elf_thread_status *elf_dump_thread_status(long signr, struct task_
}
if (t->prstatus.pr_fpvalid) {
- fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu),
+ fill_note(&t->notes[1], NN_PRFPREG, NT_PRFPREG, sizeof(t->fpu),
&t->fpu);
t->num_notes++;
*sz += notesize(&t->notes[1]);
@@ -1530,7 +1531,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
*/
fill_psinfo(psinfo, current->group_leader, current->mm);
- fill_note(&psinfo_note, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
+ fill_note(&psinfo_note, NN_PRPSINFO, NT_PRPSINFO, sizeof(*psinfo), psinfo);
thread_status_size += notesize(&psinfo_note);
auxv = (elf_addr_t *) current->mm->saved_auxv;
@@ -1538,7 +1539,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
do
i += 2;
while (auxv[i - 2] != AT_NULL);
- fill_note(&auxv_note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv);
+ fill_note(&auxv_note, NN_AUXV, NT_AUXV, i * sizeof(elf_addr_t), auxv);
thread_status_size += notesize(&auxv_note);
offset = sizeof(*elf); /* ELF header */
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 38756f8cef46..a9e56c994e9e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6744,18 +6744,18 @@ fail:
return err;
}
-static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
inode = new_inode(dir->i_sb);
if (!inode)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
inode_init_owner(idmap, inode, dir, S_IFDIR | mode);
inode->i_op = &btrfs_dir_inode_operations;
inode->i_fop = &btrfs_dir_file_operations;
- return btrfs_create_common(dir, dentry, inode);
+ return ERR_PTR(btrfs_create_common(dir, dentry, inode));
}
static noinline int uncompress_inline(struct btrfs_path *path,
diff --git a/fs/buffer.c b/fs/buffer.c
index cc8452f60251..194eacbefc95 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2361,9 +2361,8 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
{
struct inode *inode = folio->mapping->host;
sector_t iblock, lblock;
- struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
+ struct buffer_head *bh, *head, *prev = NULL;
size_t blocksize;
- int nr, i;
int fully_mapped = 1;
bool page_error = false;
loff_t limit = i_size_read(inode);
@@ -2372,16 +2371,12 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
limit = inode->i_sb->s_maxbytes;
- VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
-
head = folio_create_buffers(folio, inode, 0);
blocksize = head->b_size;
iblock = div_u64(folio_pos(folio), blocksize);
lblock = div_u64(limit + blocksize - 1, blocksize);
bh = head;
- nr = 0;
- i = 0;
do {
if (buffer_uptodate(bh))
@@ -2398,7 +2393,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
page_error = true;
}
if (!buffer_mapped(bh)) {
- folio_zero_range(folio, i * blocksize,
+ folio_zero_range(folio, bh_offset(bh),
blocksize);
if (!err)
set_buffer_uptodate(bh);
@@ -2411,40 +2406,33 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
if (buffer_uptodate(bh))
continue;
}
- arr[nr++] = bh;
- } while (i++, iblock++, (bh = bh->b_this_page) != head);
-
- if (fully_mapped)
- folio_set_mappedtodisk(folio);
-
- if (!nr) {
- /*
- * All buffers are uptodate or get_block() returned an
- * error when trying to map them - we can finish the read.
- */
- folio_end_read(folio, !page_error);
- return 0;
- }
- /* Stage two: lock the buffers */
- for (i = 0; i < nr; i++) {
- bh = arr[i];
lock_buffer(bh);
+ if (buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ continue;
+ }
+
mark_buffer_async_read(bh);
- }
+ if (prev)
+ submit_bh(REQ_OP_READ, prev);
+ prev = bh;
+ } while (iblock++, (bh = bh->b_this_page) != head);
+
+ if (fully_mapped)
+ folio_set_mappedtodisk(folio);
/*
- * Stage 3: start the IO. Check for uptodateness
- * inside the buffer lock in case another process reading
- * the underlying blockdev brought it uptodate (the sct fix).
+ * All buffers are uptodate or get_block() returned an error
+ * when trying to map them - we must finish the read because
+ * end_buffer_async_read() will never be called on any buffer
+ * in this folio.
*/
- for (i = 0; i < nr; i++) {
- bh = arr[i];
- if (buffer_uptodate(bh))
- end_buffer_async_read(bh, 1);
- else
- submit_bh(REQ_OP_READ, bh);
- }
+ if (prev)
+ submit_bh(REQ_OP_READ, prev);
+ else
+ folio_end_read(folio, !page_error);
+
return 0;
}
EXPORT_SYMBOL(block_read_full_folio);
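The rework above replaces the old three-stage scheme (collect into arr[],
lock, submit) with a single pass and a one-buffer submission delay. The
invariant: every buffer in the folio is marked async-read under its buffer
lock before the final submit_bh() runs, so end_buffer_async_read() can never
observe a folio whose remaining buffers are unmarked and complete the read
early. Distilled sketch of just that pattern (bh, head, prev as in the
function above):

	do {
		lock_buffer(bh);
		if (buffer_uptodate(bh)) {	/* raced to uptodate: skip */
			unlock_buffer(bh);
			continue;
		}
		mark_buffer_async_read(bh);
		if (prev)			/* bh itself is still unsubmitted, */
			submit_bh(REQ_OP_READ, prev); /* so the read can't finish early */
		prev = bh;
	} while ((bh = bh->b_this_page) != head);

	if (prev)
		submit_bh(REQ_OP_READ, prev);	/* last buffer: I/O may now complete */
	else
		folio_end_read(folio, !page_error);	/* nothing was submitted */

Dropping the on-stack arr[MAX_BUF_PER_PAGE] is also what lets the
VM_BUG_ON_FOLIO against large folios go away.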
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 7cf59713f0f7..83a60126de0f 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -128,18 +128,19 @@ retry:
ret = security_path_mkdir(&path, subdir, 0700);
if (ret < 0)
goto mkdir_error;
- ret = cachefiles_inject_write_error();
- if (ret == 0)
- ret = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700);
- if (ret < 0) {
+ subdir = ERR_PTR(cachefiles_inject_write_error());
+ if (!IS_ERR(subdir))
+ subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700);
+ ret = PTR_ERR(subdir);
+ if (IS_ERR(subdir)) {
trace_cachefiles_vfs_error(NULL, d_inode(dir), ret,
cachefiles_trace_mkdir_error);
goto mkdir_error;
}
trace_cachefiles_mkdir(dir, subdir);
- if (unlikely(d_unhashed(subdir))) {
- cachefiles_put_directory(subdir);
+ if (unlikely(d_unhashed(subdir) || d_is_negative(subdir))) {
+ dput(subdir);
goto retry;
}
ASSERT(d_backing_inode(subdir));
@@ -195,7 +196,8 @@ mark_error:
mkdir_error:
inode_unlock(d_inode(dir));
- dput(subdir);
+ if (!IS_ERR(subdir))
+ dput(subdir);
pr_err("mkdir %s failed with error %d\n", dirname, ret);
return ERR_PTR(ret);
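This hunk adapts to vfs_mkdir() itself changing signature: it now returns the
dentry to use, which may be an error pointer or a dentry other than the one
passed in (the passed-in dentry is consumed on failure). The calling
convention the hunk assumes, sketched:

	subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700);
	if (IS_ERR(subdir)) {
		ret = PTR_ERR(subdir);
		goto mkdir_error;	/* old dentry already released: no dput() */
	}
	/* continue with the returned (possibly respliced) dentry */

Hence the error path above must guard its dput() with !IS_ERR(subdir).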
diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
index fe3de9ad57bf..d9bc67176128 100644
--- a/fs/cachefiles/ondemand.c
+++ b/fs/cachefiles/ondemand.c
@@ -317,8 +317,9 @@ static int cachefiles_ondemand_get_fd(struct cachefiles_req *req,
goto err_free_id;
}
- anon_file->file = anon_inode_getfile("[cachefiles]",
- &cachefiles_ondemand_fd_fops, object, O_WRONLY);
+ anon_file->file = anon_inode_getfile_fmode("[cachefiles]",
+ &cachefiles_ondemand_fd_fops, object,
+ O_WRONLY, FMODE_PWRITE | FMODE_LSEEK);
if (IS_ERR(anon_file->file)) {
ret = PTR_ERR(anon_file->file);
goto err_put_fd;
@@ -333,8 +334,6 @@ static int cachefiles_ondemand_get_fd(struct cachefiles_req *req,
goto err_put_file;
}
- anon_file->file->f_mode |= FMODE_PWRITE | FMODE_LSEEK;
-
load = (void *)req->msg.data;
load->fd = anon_file->fd;
object->ondemand->ondemand_id = object_id;
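anon_inode_getfile_fmode() lets the caller supply extra f_mode bits at file
creation instead of OR-ing them in afterwards, removing the window in which
the file exists without FMODE_PWRITE | FMODE_LSEEK. Minimal usage sketch with
invented names (example_fops, priv):

	struct file *file = anon_inode_getfile_fmode("[example]", &example_fops,
						     priv, O_WRONLY,
						     FMODE_PWRITE | FMODE_LSEEK);
	if (IS_ERR(file))
		return PTR_ERR(file);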
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index f5224a566b69..29be367905a1 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -82,6 +82,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
{
struct inode *inode = mapping->host;
struct ceph_client *cl = ceph_inode_to_client(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_inode_info *ci;
struct ceph_snap_context *snapc;
@@ -92,6 +93,8 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
return false;
}
+ atomic64_inc(&mdsc->dirty_folios);
+
ci = ceph_inode(inode);
/* dirty the head */
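This atomic64_inc() pairs with the dec-and-wake added to writepages_finish()
below; together with the flush_end_wq initialized in mds_client.c at the end
of this patch, it gives the teardown path a way to wait until every counted
dirty folio has passed through writeback. A sketch of the assumed waiter side
(placement at unmount is an assumption, not shown in this diff):

	/* sketch: block until all counted dirty folios are written back */
	wait_event(mdsc->flush_end_wq,
		   atomic64_read(&mdsc->dirty_folios) <= 0);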
@@ -568,7 +571,36 @@ struct ceph_writeback_ctl
u64 truncate_size;
u32 truncate_seq;
bool size_stable;
+
bool head_snapc;
+ struct ceph_snap_context *snapc;
+ struct ceph_snap_context *last_snapc;
+
+ bool done;
+ bool should_loop;
+ bool range_whole;
+ pgoff_t start_index;
+ pgoff_t index;
+ pgoff_t end;
+ xa_mark_t tag;
+
+ pgoff_t strip_unit_end;
+ unsigned int wsize;
+ unsigned int nr_folios;
+ unsigned int max_pages;
+ unsigned int locked_pages;
+
+ int op_idx;
+ int num_ops;
+ u64 offset;
+ u64 len;
+
+ struct folio_batch fbatch;
+ unsigned int processed_in_fbatch;
+
+ bool from_pool;
+ struct page **pages;
+ struct page **data_pages;
};
/*
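With these fields, ceph_writeback_ctl grows from a read-mostly snapshot
(i_size, truncate_seq/truncate_size) into the full mutable state of one
writeback pass: every helper introduced below takes the same
(mapping, wbc, ceph_wbc) triple and communicates exclusively through it.
Sketch of how the state now travels (both helpers are defined later in this
patch):

	struct ceph_writeback_ctl ceph_wbc;

	ceph_init_writeback_ctl(mapping, wbc, &ceph_wbc);	/* index/end/tag */
	rc = ceph_define_writeback_range(mapping, wbc, &ceph_wbc); /* snapc, range */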
@@ -666,22 +698,23 @@ static u64 get_writepages_data_length(struct inode *inode,
}
/*
- * Write a single page, but leave the page locked.
+ * Write a folio, but leave it locked.
*
* If we get a write error, mark the mapping for error, but still adjust the
- * dirty page accounting (i.e., page is no longer dirty).
+ * dirty page accounting (i.e., folio is no longer dirty).
*/
-static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
+static int write_folio_nounlock(struct folio *folio,
+ struct writeback_control *wbc)
{
- struct folio *folio = page_folio(page);
- struct inode *inode = page->mapping->host;
+ struct page *page = &folio->page;
+ struct inode *inode = folio->mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_client *cl = fsc->client;
struct ceph_snap_context *snapc, *oldest;
- loff_t page_off = page_offset(page);
+ loff_t page_off = folio_pos(folio);
int err;
- loff_t len = thp_size(page);
+ loff_t len = folio_size(folio);
loff_t wlen;
struct ceph_writeback_ctl ceph_wbc;
struct ceph_osd_client *osdc = &fsc->client->osdc;
@@ -689,27 +722,27 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
bool caching = ceph_is_cache_enabled(inode);
struct page *bounce_page = NULL;
- doutc(cl, "%llx.%llx page %p idx %lu\n", ceph_vinop(inode), page,
- page->index);
+ doutc(cl, "%llx.%llx folio %p idx %lu\n", ceph_vinop(inode), folio,
+ folio->index);
if (ceph_inode_is_shutdown(inode))
return -EIO;
/* verify this is a writeable snap context */
- snapc = page_snap_context(page);
+ snapc = page_snap_context(&folio->page);
if (!snapc) {
- doutc(cl, "%llx.%llx page %p not dirty?\n", ceph_vinop(inode),
- page);
+ doutc(cl, "%llx.%llx folio %p not dirty?\n", ceph_vinop(inode),
+ folio);
return 0;
}
oldest = get_oldest_context(inode, &ceph_wbc, snapc);
if (snapc->seq > oldest->seq) {
- doutc(cl, "%llx.%llx page %p snapc %p not writeable - noop\n",
- ceph_vinop(inode), page, snapc);
+ doutc(cl, "%llx.%llx folio %p snapc %p not writeable - noop\n",
+ ceph_vinop(inode), folio, snapc);
/* we should only noop if called by kswapd */
WARN_ON(!(current->flags & PF_MEMALLOC));
ceph_put_snap_context(oldest);
- redirty_page_for_writepage(wbc, page);
+ folio_redirty_for_writepage(wbc, folio);
return 0;
}
ceph_put_snap_context(oldest);
@@ -726,8 +759,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
len = ceph_wbc.i_size - page_off;
wlen = IS_ENCRYPTED(inode) ? round_up(len, CEPH_FSCRYPT_BLOCK_SIZE) : len;
- doutc(cl, "%llx.%llx page %p index %lu on %llu~%llu snapc %p seq %lld\n",
- ceph_vinop(inode), page, page->index, page_off, wlen, snapc,
+ doutc(cl, "%llx.%llx folio %p index %lu on %llu~%llu snapc %p seq %lld\n",
+ ceph_vinop(inode), folio, folio->index, page_off, wlen, snapc,
snapc->seq);
if (atomic_long_inc_return(&fsc->writeback_count) >
@@ -740,32 +773,32 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
ceph_wbc.truncate_seq,
ceph_wbc.truncate_size, true);
if (IS_ERR(req)) {
- redirty_page_for_writepage(wbc, page);
+ folio_redirty_for_writepage(wbc, folio);
return PTR_ERR(req);
}
if (wlen < len)
len = wlen;
- set_page_writeback(page);
+ folio_start_writeback(folio);
if (caching)
- ceph_set_page_fscache(page);
+ ceph_set_page_fscache(&folio->page);
ceph_fscache_write_to_cache(inode, page_off, len, caching);
if (IS_ENCRYPTED(inode)) {
- bounce_page = fscrypt_encrypt_pagecache_blocks(page,
+ bounce_page = fscrypt_encrypt_pagecache_blocks(folio,
CEPH_FSCRYPT_BLOCK_SIZE, 0,
GFP_NOFS);
if (IS_ERR(bounce_page)) {
- redirty_page_for_writepage(wbc, page);
- end_page_writeback(page);
+ folio_redirty_for_writepage(wbc, folio);
+ folio_end_writeback(folio);
ceph_osdc_put_request(req);
return PTR_ERR(bounce_page);
}
}
/* it may be a short write due to an object boundary */
- WARN_ON_ONCE(len > thp_size(page));
+ WARN_ON_ONCE(len > folio_size(folio));
osd_req_op_extent_osd_data_pages(req, 0,
bounce_page ? &bounce_page : &page, wlen, 0,
false, false);
@@ -791,25 +824,25 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
if (err == -ERESTARTSYS) {
/* killed by SIGKILL */
doutc(cl, "%llx.%llx interrupted page %p\n",
- ceph_vinop(inode), page);
- redirty_page_for_writepage(wbc, page);
- end_page_writeback(page);
+ ceph_vinop(inode), folio);
+ folio_redirty_for_writepage(wbc, folio);
+ folio_end_writeback(folio);
return err;
}
if (err == -EBLOCKLISTED)
fsc->blocklisted = true;
- doutc(cl, "%llx.%llx setting page/mapping error %d %p\n",
- ceph_vinop(inode), err, page);
+ doutc(cl, "%llx.%llx setting mapping error %d %p\n",
+ ceph_vinop(inode), err, folio);
mapping_set_error(&inode->i_data, err);
wbc->pages_skipped++;
} else {
doutc(cl, "%llx.%llx cleaned page %p\n",
- ceph_vinop(inode), page);
+ ceph_vinop(inode), folio);
err = 0; /* vfs expects us to return 0 */
}
- oldest = detach_page_private(page);
+ oldest = folio_detach_private(folio);
WARN_ON_ONCE(oldest != snapc);
- end_page_writeback(page);
+ folio_end_writeback(folio);
ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
ceph_put_snap_context(snapc); /* page's reference */
@@ -820,32 +853,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
return err;
}
-static int ceph_writepage(struct page *page, struct writeback_control *wbc)
-{
- int err;
- struct inode *inode = page->mapping->host;
- BUG_ON(!inode);
- ihold(inode);
-
- if (wbc->sync_mode == WB_SYNC_NONE &&
- ceph_inode_to_fs_client(inode)->write_congested) {
- redirty_page_for_writepage(wbc, page);
- return AOP_WRITEPAGE_ACTIVATE;
- }
-
- folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */
-
- err = writepage_nounlock(page, wbc);
- if (err == -ERESTARTSYS) {
- /* direct memory reclaimer was killed by SIGKILL. return 0
- * to prevent caller from setting mapping/page error */
- err = 0;
- }
- unlock_page(page);
- iput(inode);
- return err;
-}
-
/*
* async writeback completion handler.
*
@@ -865,6 +872,7 @@ static void writepages_finish(struct ceph_osd_request *req)
struct ceph_snap_context *snapc = req->r_snapc;
struct address_space *mapping = inode->i_mapping;
struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
unsigned int len = 0;
bool remove_page;
@@ -920,6 +928,12 @@ static void writepages_finish(struct ceph_osd_request *req)
ceph_put_snap_context(detach_page_private(page));
end_page_writeback(page);
+
+ if (atomic64_dec_return(&mdsc->dirty_folios) <= 0) {
+ wake_up_all(&mdsc->flush_end_wq);
+ WARN_ON(atomic64_read(&mdsc->dirty_folios) < 0);
+ }
+
doutc(cl, "unlocking %p\n", page);
if (remove_page)
@@ -949,36 +963,13 @@ static void writepages_finish(struct ceph_osd_request *req)
ceph_dec_osd_stopping_blocker(fsc->mdsc);
}
-/*
- * initiate async writeback
- */
-static int ceph_writepages_start(struct address_space *mapping,
- struct writeback_control *wbc)
+static inline
+bool is_forced_umount(struct address_space *mapping)
{
struct inode *inode = mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_client *cl = fsc->client;
- struct ceph_vino vino = ceph_vino(inode);
- pgoff_t index, start_index, end = -1;
- struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
- struct folio_batch fbatch;
- int rc = 0;
- unsigned int wsize = i_blocksize(inode);
- struct ceph_osd_request *req = NULL;
- struct ceph_writeback_ctl ceph_wbc;
- bool should_loop, range_whole = false;
- bool done = false;
- bool caching = ceph_is_cache_enabled(inode);
- xa_mark_t tag;
-
- if (wbc->sync_mode == WB_SYNC_NONE &&
- fsc->write_congested)
- return 0;
-
- doutc(cl, "%llx.%llx (mode=%s)\n", ceph_vinop(inode),
- wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
- (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
if (ceph_inode_is_shutdown(inode)) {
if (ci->i_wrbuffer_ref > 0) {
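The large hunk below splits the body of ceph_writepages_start() into named
helpers: is_forced_umount(), ceph_init_writeback_ctl(),
ceph_define_writeback_range(), ceph_check_page_before_write(),
ceph_process_folio_batch(), ceph_shift_unused_folios_left(),
ceph_submit_write() and ceph_wait_until_current_writes_complete(). One
mechanism worth noting: folios consumed by a pass are NULLed out in the fbatch
and the survivors compacted left, so the remainder can be re-processed or the
batch refilled. That compaction, as implemented below:

	unsigned j, n = 0;

	for (j = 0; j < folio_batch_count(fbatch); j++) {
		if (!fbatch->folios[j])
			continue;		/* slot consumed by this pass */
		if (n < j)
			fbatch->folios[n] = fbatch->folios[j];
		n++;
	}
	fbatch->nr = n;				/* only leftovers remain */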
@@ -987,388 +978,733 @@ static int ceph_writepages_start(struct address_space *mapping,
ceph_vinop(inode), ceph_ino(inode));
}
mapping_set_error(mapping, -EIO);
- return -EIO; /* we're in a forced umount, don't write! */
+ return true;
}
+
+ return false;
+}
+
+static inline
+unsigned int ceph_define_write_size(struct address_space *mapping)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ unsigned int wsize = i_blocksize(inode);
+
if (fsc->mount_options->wsize < wsize)
wsize = fsc->mount_options->wsize;
- folio_batch_init(&fbatch);
+ return wsize;
+}
+
+static inline
+void ceph_folio_batch_init(struct ceph_writeback_ctl *ceph_wbc)
+{
+ folio_batch_init(&ceph_wbc->fbatch);
+ ceph_wbc->processed_in_fbatch = 0;
+}
+
+static inline
+void ceph_folio_batch_reinit(struct ceph_writeback_ctl *ceph_wbc)
+{
+ folio_batch_release(&ceph_wbc->fbatch);
+ ceph_folio_batch_init(ceph_wbc);
+}
+
+static inline
+void ceph_init_writeback_ctl(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ceph_writeback_ctl *ceph_wbc)
+{
+ ceph_wbc->snapc = NULL;
+ ceph_wbc->last_snapc = NULL;
+
+ ceph_wbc->strip_unit_end = 0;
+ ceph_wbc->wsize = ceph_define_write_size(mapping);
- start_index = wbc->range_cyclic ? mapping->writeback_index : 0;
- index = start_index;
+ ceph_wbc->nr_folios = 0;
+ ceph_wbc->max_pages = 0;
+ ceph_wbc->locked_pages = 0;
+
+ ceph_wbc->done = false;
+ ceph_wbc->should_loop = false;
+ ceph_wbc->range_whole = false;
+
+ ceph_wbc->start_index = wbc->range_cyclic ? mapping->writeback_index : 0;
+ ceph_wbc->index = ceph_wbc->start_index;
+ ceph_wbc->end = -1;
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
- tag = PAGECACHE_TAG_TOWRITE;
+ ceph_wbc->tag = PAGECACHE_TAG_TOWRITE;
} else {
- tag = PAGECACHE_TAG_DIRTY;
+ ceph_wbc->tag = PAGECACHE_TAG_DIRTY;
}
-retry:
+
+ ceph_wbc->op_idx = -1;
+ ceph_wbc->num_ops = 0;
+ ceph_wbc->offset = 0;
+ ceph_wbc->len = 0;
+ ceph_wbc->from_pool = false;
+
+ ceph_folio_batch_init(ceph_wbc);
+
+ ceph_wbc->pages = NULL;
+ ceph_wbc->data_pages = NULL;
+}
+
+static inline
+int ceph_define_writeback_range(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ceph_writeback_ctl *ceph_wbc)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+
/* find oldest snap context with dirty data */
- snapc = get_oldest_context(inode, &ceph_wbc, NULL);
- if (!snapc) {
+ ceph_wbc->snapc = get_oldest_context(inode, ceph_wbc, NULL);
+ if (!ceph_wbc->snapc) {
/* hmm, why does writepages get called when there
is no dirty data? */
doutc(cl, " no snap context with dirty data?\n");
- goto out;
+ return -ENODATA;
}
- doutc(cl, " oldest snapc is %p seq %lld (%d snaps)\n", snapc,
- snapc->seq, snapc->num_snaps);
- should_loop = false;
- if (ceph_wbc.head_snapc && snapc != last_snapc) {
+ doutc(cl, " oldest snapc is %p seq %lld (%d snaps)\n",
+ ceph_wbc->snapc, ceph_wbc->snapc->seq,
+ ceph_wbc->snapc->num_snaps);
+
+ ceph_wbc->should_loop = false;
+
+ if (ceph_wbc->head_snapc && ceph_wbc->snapc != ceph_wbc->last_snapc) {
/* where to start/end? */
if (wbc->range_cyclic) {
- index = start_index;
- end = -1;
- if (index > 0)
- should_loop = true;
- doutc(cl, " cyclic, start at %lu\n", index);
+ ceph_wbc->index = ceph_wbc->start_index;
+ ceph_wbc->end = -1;
+ if (ceph_wbc->index > 0)
+ ceph_wbc->should_loop = true;
+ doutc(cl, " cyclic, start at %lu\n", ceph_wbc->index);
} else {
- index = wbc->range_start >> PAGE_SHIFT;
- end = wbc->range_end >> PAGE_SHIFT;
+ ceph_wbc->index = wbc->range_start >> PAGE_SHIFT;
+ ceph_wbc->end = wbc->range_end >> PAGE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
- range_whole = true;
- doutc(cl, " not cyclic, %lu to %lu\n", index, end);
+ ceph_wbc->range_whole = true;
+ doutc(cl, " not cyclic, %lu to %lu\n",
+ ceph_wbc->index, ceph_wbc->end);
}
- } else if (!ceph_wbc.head_snapc) {
+ } else if (!ceph_wbc->head_snapc) {
/* Do not respect wbc->range_{start,end}. Dirty pages
* in that range can be associated with newer snapc.
* They are not writeable until we write all dirty pages
* associated with 'snapc' get written */
- if (index > 0)
- should_loop = true;
+ if (ceph_wbc->index > 0)
+ ceph_wbc->should_loop = true;
doutc(cl, " non-head snapc, range whole\n");
}
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- tag_pages_for_writeback(mapping, index, end);
+ ceph_put_snap_context(ceph_wbc->last_snapc);
+ ceph_wbc->last_snapc = ceph_wbc->snapc;
- ceph_put_snap_context(last_snapc);
- last_snapc = snapc;
+ return 0;
+}
- while (!done && index <= end) {
- int num_ops = 0, op_idx;
- unsigned i, nr_folios, max_pages, locked_pages = 0;
- struct page **pages = NULL, **data_pages;
- struct page *page;
- pgoff_t strip_unit_end = 0;
- u64 offset = 0, len = 0;
- bool from_pool = false;
+static inline
+bool has_writeback_done(struct ceph_writeback_ctl *ceph_wbc)
+{
+ return ceph_wbc->done && ceph_wbc->index > ceph_wbc->end;
+}
- max_pages = wsize >> PAGE_SHIFT;
+static inline
+bool can_next_page_be_processed(struct ceph_writeback_ctl *ceph_wbc,
+ unsigned index)
+{
+ return index < ceph_wbc->nr_folios &&
+ ceph_wbc->locked_pages < ceph_wbc->max_pages;
+}
-get_more_pages:
- nr_folios = filemap_get_folios_tag(mapping, &index,
- end, tag, &fbatch);
- doutc(cl, "pagevec_lookup_range_tag got %d\n", nr_folios);
- if (!nr_folios && !locked_pages)
- break;
- for (i = 0; i < nr_folios && locked_pages < max_pages; i++) {
- struct folio *folio = fbatch.folios[i];
+static
+int ceph_check_page_before_write(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ceph_writeback_ctl *ceph_wbc,
+ struct folio *folio)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+ struct ceph_snap_context *pgsnapc;
- page = &folio->page;
- doutc(cl, "? %p idx %lu\n", page, page->index);
- if (locked_pages == 0)
- lock_page(page); /* first page */
- else if (!trylock_page(page))
- break;
+ /* only dirty folios, or our accounting breaks */
+ if (unlikely(!folio_test_dirty(folio) || folio->mapping != mapping)) {
+ doutc(cl, "!dirty or !mapping %p\n", folio);
+ return -ENODATA;
+ }
- /* only dirty pages, or our accounting breaks */
- if (unlikely(!PageDirty(page)) ||
- unlikely(page->mapping != mapping)) {
- doutc(cl, "!dirty or !mapping %p\n", page);
- unlock_page(page);
- continue;
- }
- /* only if matching snap context */
- pgsnapc = page_snap_context(page);
- if (pgsnapc != snapc) {
- doutc(cl, "page snapc %p %lld != oldest %p %lld\n",
- pgsnapc, pgsnapc->seq, snapc, snapc->seq);
- if (!should_loop &&
- !ceph_wbc.head_snapc &&
- wbc->sync_mode != WB_SYNC_NONE)
- should_loop = true;
- unlock_page(page);
- continue;
+ /* only if matching snap context */
+ pgsnapc = page_snap_context(&folio->page);
+ if (pgsnapc != ceph_wbc->snapc) {
+ doutc(cl, "folio snapc %p %lld != oldest %p %lld\n",
+ pgsnapc, pgsnapc->seq,
+ ceph_wbc->snapc, ceph_wbc->snapc->seq);
+
+ if (!ceph_wbc->should_loop && !ceph_wbc->head_snapc &&
+ wbc->sync_mode != WB_SYNC_NONE)
+ ceph_wbc->should_loop = true;
+
+ return -ENODATA;
+ }
+
+ if (folio_pos(folio) >= ceph_wbc->i_size) {
+ doutc(cl, "folio at %lu beyond eof %llu\n",
+ folio->index, ceph_wbc->i_size);
+
+ if ((ceph_wbc->size_stable ||
+ folio_pos(folio) >= i_size_read(inode)) &&
+ folio_clear_dirty_for_io(folio))
+ folio_invalidate(folio, 0, folio_size(folio));
+
+ return -ENODATA;
+ }
+
+ if (ceph_wbc->strip_unit_end &&
+ (folio->index > ceph_wbc->strip_unit_end)) {
+ doutc(cl, "end of strip unit %p\n", folio);
+ return -E2BIG;
+ }
+
+ return 0;
+}
+
+static inline
+void __ceph_allocate_page_array(struct ceph_writeback_ctl *ceph_wbc,
+ unsigned int max_pages)
+{
+ ceph_wbc->pages = kmalloc_array(max_pages,
+ sizeof(*ceph_wbc->pages),
+ GFP_NOFS);
+ if (!ceph_wbc->pages) {
+ ceph_wbc->from_pool = true;
+ ceph_wbc->pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS);
+ BUG_ON(!ceph_wbc->pages);
+ }
+}
+
+static inline
+void ceph_allocate_page_array(struct address_space *mapping,
+ struct ceph_writeback_ctl *ceph_wbc,
+ struct folio *folio)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ u64 objnum;
+ u64 objoff;
+ u32 xlen;
+
+ /* prepare async write request */
+ ceph_wbc->offset = (u64)folio_pos(folio);
+ ceph_calc_file_object_mapping(&ci->i_layout,
+ ceph_wbc->offset, ceph_wbc->wsize,
+ &objnum, &objoff, &xlen);
+
+ ceph_wbc->num_ops = 1;
+ ceph_wbc->strip_unit_end = folio->index + ((xlen - 1) >> PAGE_SHIFT);
+
+ BUG_ON(ceph_wbc->pages);
+ ceph_wbc->max_pages = calc_pages_for(0, (u64)xlen);
+ __ceph_allocate_page_array(ceph_wbc, ceph_wbc->max_pages);
+
+ ceph_wbc->len = 0;
+}
+
+static inline
+bool is_folio_index_contiguous(const struct ceph_writeback_ctl *ceph_wbc,
+ const struct folio *folio)
+{
+ return folio->index == (ceph_wbc->offset + ceph_wbc->len) >> PAGE_SHIFT;
+}
+
+static inline
+bool is_num_ops_too_big(struct ceph_writeback_ctl *ceph_wbc)
+{
+ return ceph_wbc->num_ops >=
+ (ceph_wbc->from_pool ? CEPH_OSD_SLAB_OPS : CEPH_OSD_MAX_OPS);
+}
+
+static inline
+bool is_write_congestion_happened(struct ceph_fs_client *fsc)
+{
+ return atomic_long_inc_return(&fsc->writeback_count) >
+ CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb);
+}
+
+static inline int move_dirty_folio_in_page_array(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ceph_writeback_ctl *ceph_wbc, struct folio *folio)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+ struct page **pages = ceph_wbc->pages;
+ unsigned int index = ceph_wbc->locked_pages;
+ gfp_t gfp_flags = ceph_wbc->locked_pages ? GFP_NOWAIT : GFP_NOFS;
+
+ if (IS_ENCRYPTED(inode)) {
+ pages[index] = fscrypt_encrypt_pagecache_blocks(folio,
+ PAGE_SIZE,
+ 0,
+ gfp_flags);
+		if (IS_ERR(pages[index])) {
+			int err = PTR_ERR(pages[index]);
+
+			if (err == -EINVAL) {
+				pr_err_client(cl, "inode->i_blkbits=%hhu\n",
+					      inode->i_blkbits);
}
- if (page_offset(page) >= ceph_wbc.i_size) {
- doutc(cl, "folio at %lu beyond eof %llu\n",
- folio->index, ceph_wbc.i_size);
- if ((ceph_wbc.size_stable ||
- folio_pos(folio) >= i_size_read(inode)) &&
- folio_clear_dirty_for_io(folio))
- folio_invalidate(folio, 0,
- folio_size(folio));
+
+ /* better not fail on first page! */
+ BUG_ON(ceph_wbc->locked_pages == 0);
+
+			pages[index] = NULL;
+			return err;	/* saved before the slot was cleared */
+ }
+ } else {
+ pages[index] = &folio->page;
+ }
+
+ ceph_wbc->locked_pages++;
+
+ return 0;
+}
+
+static
+int ceph_process_folio_batch(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ceph_writeback_ctl *ceph_wbc)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+ struct folio *folio = NULL;
+ unsigned i;
+ int rc = 0;
+
+ for (i = 0; can_next_page_be_processed(ceph_wbc, i); i++) {
+ folio = ceph_wbc->fbatch.folios[i];
+
+ if (!folio)
+ continue;
+
+		doutc(cl, "? %p idx %lu, folio_test_writeback %#x, folio_test_dirty %#x, folio_test_locked %#x\n",
+		      folio, folio->index, folio_test_writeback(folio),
+		      folio_test_dirty(folio), folio_test_locked(folio));
+
+ if (folio_test_writeback(folio) ||
+ folio_test_private_2(folio) /* [DEPRECATED] */) {
+ doutc(cl, "waiting on writeback %p\n", folio);
+ folio_wait_writeback(folio);
+ folio_wait_private_2(folio); /* [DEPRECATED] */
+ continue;
+ }
+
+ if (ceph_wbc->locked_pages == 0)
+ folio_lock(folio);
+ else if (!folio_trylock(folio))
+ break;
+
+ rc = ceph_check_page_before_write(mapping, wbc,
+ ceph_wbc, folio);
+ if (rc == -ENODATA) {
+ rc = 0;
+ folio_unlock(folio);
+ ceph_wbc->fbatch.folios[i] = NULL;
+ continue;
+ } else if (rc == -E2BIG) {
+ rc = 0;
+ folio_unlock(folio);
+ ceph_wbc->fbatch.folios[i] = NULL;
+ break;
+ }
+
+ if (!folio_clear_dirty_for_io(folio)) {
+ doutc(cl, "%p !folio_clear_dirty_for_io\n", folio);
+ folio_unlock(folio);
+ ceph_wbc->fbatch.folios[i] = NULL;
+ continue;
+ }
+
+ /*
+ * We have something to write. If this is
+ * the first locked page this time through,
+ * calculate max possible write size and
+ * allocate a page array
+ */
+ if (ceph_wbc->locked_pages == 0) {
+ ceph_allocate_page_array(mapping, ceph_wbc, folio);
+ } else if (!is_folio_index_contiguous(ceph_wbc, folio)) {
+ if (is_num_ops_too_big(ceph_wbc)) {
+ folio_redirty_for_writepage(wbc, folio);
folio_unlock(folio);
- continue;
- }
- if (strip_unit_end && (page->index > strip_unit_end)) {
- doutc(cl, "end of strip unit %p\n", page);
- unlock_page(page);
break;
}
- if (folio_test_writeback(folio) ||
- folio_test_private_2(folio) /* [DEPRECATED] */) {
- if (wbc->sync_mode == WB_SYNC_NONE) {
- doutc(cl, "%p under writeback\n", folio);
- folio_unlock(folio);
- continue;
- }
- doutc(cl, "waiting on writeback %p\n", folio);
- folio_wait_writeback(folio);
- folio_wait_private_2(folio); /* [DEPRECATED] */
- }
- if (!clear_page_dirty_for_io(page)) {
- doutc(cl, "%p !clear_page_dirty_for_io\n", page);
- unlock_page(page);
- continue;
- }
+ ceph_wbc->num_ops++;
+ ceph_wbc->offset = (u64)folio_pos(folio);
+ ceph_wbc->len = 0;
+ }
- /*
- * We have something to write. If this is
- * the first locked page this time through,
- * calculate max possinle write size and
- * allocate a page array
- */
- if (locked_pages == 0) {
- u64 objnum;
- u64 objoff;
- u32 xlen;
-
- /* prepare async write request */
- offset = (u64)page_offset(page);
- ceph_calc_file_object_mapping(&ci->i_layout,
- offset, wsize,
- &objnum, &objoff,
- &xlen);
- len = xlen;
-
- num_ops = 1;
- strip_unit_end = page->index +
- ((len - 1) >> PAGE_SHIFT);
-
- BUG_ON(pages);
- max_pages = calc_pages_for(0, (u64)len);
- pages = kmalloc_array(max_pages,
- sizeof(*pages),
- GFP_NOFS);
- if (!pages) {
- from_pool = true;
- pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS);
- BUG_ON(!pages);
- }
-
- len = 0;
- } else if (page->index !=
- (offset + len) >> PAGE_SHIFT) {
- if (num_ops >= (from_pool ? CEPH_OSD_SLAB_OPS :
- CEPH_OSD_MAX_OPS)) {
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- break;
- }
-
- num_ops++;
- offset = (u64)page_offset(page);
- len = 0;
- }
+ /* note position of first page in fbatch */
+ doutc(cl, "%llx.%llx will write folio %p idx %lu\n",
+ ceph_vinop(inode), folio, folio->index);
- /* note position of first page in fbatch */
- doutc(cl, "%llx.%llx will write page %p idx %lu\n",
- ceph_vinop(inode), page, page->index);
-
- if (atomic_long_inc_return(&fsc->writeback_count) >
- CONGESTION_ON_THRESH(
- fsc->mount_options->congestion_kb))
- fsc->write_congested = true;
-
- if (IS_ENCRYPTED(inode)) {
- pages[locked_pages] =
- fscrypt_encrypt_pagecache_blocks(page,
- PAGE_SIZE, 0,
- locked_pages ? GFP_NOWAIT : GFP_NOFS);
- if (IS_ERR(pages[locked_pages])) {
- if (PTR_ERR(pages[locked_pages]) == -EINVAL)
- pr_err_client(cl,
- "inode->i_blkbits=%hhu\n",
- inode->i_blkbits);
- /* better not fail on first page! */
- BUG_ON(locked_pages == 0);
- pages[locked_pages] = NULL;
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- break;
- }
- ++locked_pages;
- } else {
- pages[locked_pages++] = page;
- }
+ fsc->write_congested = is_write_congestion_happened(fsc);
- fbatch.folios[i] = NULL;
- len += thp_size(page);
+ rc = move_dirty_folio_in_page_array(mapping, wbc, ceph_wbc,
+ folio);
+ if (rc) {
+ folio_redirty_for_writepage(wbc, folio);
+ folio_unlock(folio);
+ break;
}
- /* did we get anything? */
- if (!locked_pages)
- goto release_folios;
- if (i) {
- unsigned j, n = 0;
- /* shift unused page to beginning of fbatch */
- for (j = 0; j < nr_folios; j++) {
- if (!fbatch.folios[j])
- continue;
- if (n < j)
- fbatch.folios[n] = fbatch.folios[j];
- n++;
- }
- fbatch.nr = n;
+ ceph_wbc->fbatch.folios[i] = NULL;
+ ceph_wbc->len += folio_size(folio);
+ }
- if (nr_folios && i == nr_folios &&
- locked_pages < max_pages) {
- doutc(cl, "reached end fbatch, trying for more\n");
- folio_batch_release(&fbatch);
- goto get_more_pages;
- }
+ ceph_wbc->processed_in_fbatch = i;
+
+ return rc;
+}
+
+static inline
+void ceph_shift_unused_folios_left(struct folio_batch *fbatch)
+{
+ unsigned j, n = 0;
+
+ /* shift unused page to beginning of fbatch */
+ for (j = 0; j < folio_batch_count(fbatch); j++) {
+ if (!fbatch->folios[j])
+ continue;
+
+ if (n < j) {
+ fbatch->folios[n] = fbatch->folios[j];
}
+ n++;
+ }
+
+ fbatch->nr = n;
+}
+
+static
+int ceph_submit_write(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ceph_writeback_ctl *ceph_wbc)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+ struct ceph_vino vino = ceph_vino(inode);
+ struct ceph_osd_request *req = NULL;
+ struct page *page = NULL;
+ bool caching = ceph_is_cache_enabled(inode);
+ u64 offset;
+ u64 len;
+ unsigned i;
+
new_request:
- offset = ceph_fscrypt_page_offset(pages[0]);
- len = wsize;
+ offset = ceph_fscrypt_page_offset(ceph_wbc->pages[0]);
+ len = ceph_wbc->wsize;
+ req = ceph_osdc_new_request(&fsc->client->osdc,
+ &ci->i_layout, vino,
+ offset, &len, 0, ceph_wbc->num_ops,
+ CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
+ ceph_wbc->snapc, ceph_wbc->truncate_seq,
+ ceph_wbc->truncate_size, false);
+ if (IS_ERR(req)) {
req = ceph_osdc_new_request(&fsc->client->osdc,
- &ci->i_layout, vino,
- offset, &len, 0, num_ops,
- CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
- snapc, ceph_wbc.truncate_seq,
- ceph_wbc.truncate_size, false);
- if (IS_ERR(req)) {
- req = ceph_osdc_new_request(&fsc->client->osdc,
- &ci->i_layout, vino,
- offset, &len, 0,
- min(num_ops,
- CEPH_OSD_SLAB_OPS),
- CEPH_OSD_OP_WRITE,
- CEPH_OSD_FLAG_WRITE,
- snapc, ceph_wbc.truncate_seq,
- ceph_wbc.truncate_size, true);
- BUG_ON(IS_ERR(req));
+ &ci->i_layout, vino,
+ offset, &len, 0,
+ min(ceph_wbc->num_ops,
+ CEPH_OSD_SLAB_OPS),
+ CEPH_OSD_OP_WRITE,
+ CEPH_OSD_FLAG_WRITE,
+ ceph_wbc->snapc,
+ ceph_wbc->truncate_seq,
+ ceph_wbc->truncate_size,
+ true);
+ BUG_ON(IS_ERR(req));
+ }
+
+ page = ceph_wbc->pages[ceph_wbc->locked_pages - 1];
+ BUG_ON(len < ceph_fscrypt_page_offset(page) + thp_size(page) - offset);
+
+ if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
+ for (i = 0; i < folio_batch_count(&ceph_wbc->fbatch); i++) {
+ struct folio *folio = ceph_wbc->fbatch.folios[i];
+
+ if (!folio)
+ continue;
+
+ page = &folio->page;
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
}
- BUG_ON(len < ceph_fscrypt_page_offset(pages[locked_pages - 1]) +
- thp_size(pages[locked_pages - 1]) - offset);
- if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
- rc = -EIO;
- goto release_folios;
+ for (i = 0; i < ceph_wbc->locked_pages; i++) {
+ page = ceph_fscrypt_pagecache_page(ceph_wbc->pages[i]);
+
+ if (!page)
+ continue;
+
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
}
- req->r_callback = writepages_finish;
- req->r_inode = inode;
-
- /* Format the osd request message and submit the write */
- len = 0;
- data_pages = pages;
- op_idx = 0;
- for (i = 0; i < locked_pages; i++) {
- struct page *page = ceph_fscrypt_pagecache_page(pages[i]);
-
- u64 cur_offset = page_offset(page);
- /*
- * Discontinuity in page range? Ceph can handle that by just passing
- * multiple extents in the write op.
- */
- if (offset + len != cur_offset) {
- /* If it's full, stop here */
- if (op_idx + 1 == req->r_num_ops)
- break;
-
- /* Kick off an fscache write with what we have so far. */
- ceph_fscache_write_to_cache(inode, offset, len, caching);
-
- /* Start a new extent */
- osd_req_op_extent_dup_last(req, op_idx,
- cur_offset - offset);
- doutc(cl, "got pages at %llu~%llu\n", offset,
- len);
- osd_req_op_extent_osd_data_pages(req, op_idx,
- data_pages, len, 0,
- from_pool, false);
- osd_req_op_extent_update(req, op_idx, len);
-
- len = 0;
- offset = cur_offset;
- data_pages = pages + i;
- op_idx++;
- }
- set_page_writeback(page);
- if (caching)
- ceph_set_page_fscache(page);
- len += thp_size(page);
+ ceph_osdc_put_request(req);
+ return -EIO;
+ }
+
+ req->r_callback = writepages_finish;
+ req->r_inode = inode;
+
+ /* Format the osd request message and submit the write */
+ len = 0;
+ ceph_wbc->data_pages = ceph_wbc->pages;
+ ceph_wbc->op_idx = 0;
+ for (i = 0; i < ceph_wbc->locked_pages; i++) {
+ u64 cur_offset;
+
+ page = ceph_fscrypt_pagecache_page(ceph_wbc->pages[i]);
+ cur_offset = page_offset(page);
+
+ /*
+ * Discontinuity in page range? Ceph can handle that by just passing
+ * multiple extents in the write op.
+ */
+ if (offset + len != cur_offset) {
+ /* If it's full, stop here */
+ if (ceph_wbc->op_idx + 1 == req->r_num_ops)
+ break;
+
+ /* Kick off an fscache write with what we have so far. */
+ ceph_fscache_write_to_cache(inode, offset, len, caching);
+
+ /* Start a new extent */
+ osd_req_op_extent_dup_last(req, ceph_wbc->op_idx,
+ cur_offset - offset);
+
+ doutc(cl, "got pages at %llu~%llu\n", offset, len);
+
+ osd_req_op_extent_osd_data_pages(req, ceph_wbc->op_idx,
+ ceph_wbc->data_pages,
+ len, 0,
+ ceph_wbc->from_pool,
+ false);
+ osd_req_op_extent_update(req, ceph_wbc->op_idx, len);
+
+ len = 0;
+ offset = cur_offset;
+ ceph_wbc->data_pages = ceph_wbc->pages + i;
+ ceph_wbc->op_idx++;
}
- ceph_fscache_write_to_cache(inode, offset, len, caching);
-
- if (ceph_wbc.size_stable) {
- len = min(len, ceph_wbc.i_size - offset);
- } else if (i == locked_pages) {
- /* writepages_finish() clears writeback pages
- * according to the data length, so make sure
- * data length covers all locked pages */
- u64 min_len = len + 1 - thp_size(page);
- len = get_writepages_data_length(inode, pages[i - 1],
- offset);
- len = max(len, min_len);
+
+ set_page_writeback(page);
+
+ if (caching)
+ ceph_set_page_fscache(page);
+
+ len += thp_size(page);
+ }
+
+ ceph_fscache_write_to_cache(inode, offset, len, caching);
+
+ if (ceph_wbc->size_stable) {
+ len = min(len, ceph_wbc->i_size - offset);
+ } else if (i == ceph_wbc->locked_pages) {
+ /* writepages_finish() clears writeback pages
+ * according to the data length, so make sure
+ * data length covers all locked pages */
+ u64 min_len = len + 1 - thp_size(page);
+ len = get_writepages_data_length(inode,
+ ceph_wbc->pages[i - 1],
+ offset);
+ len = max(len, min_len);
+ }
+
+ if (IS_ENCRYPTED(inode))
+ len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE);
+
+ doutc(cl, "got pages at %llu~%llu\n", offset, len);
+
+ if (IS_ENCRYPTED(inode) &&
+ ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK)) {
+ pr_warn_client(cl,
+ "bad encrypted write offset=%lld len=%llu\n",
+ offset, len);
+ }
+
+ osd_req_op_extent_osd_data_pages(req, ceph_wbc->op_idx,
+ ceph_wbc->data_pages, len,
+ 0, ceph_wbc->from_pool, false);
+ osd_req_op_extent_update(req, ceph_wbc->op_idx, len);
+
+ BUG_ON(ceph_wbc->op_idx + 1 != req->r_num_ops);
+
+ ceph_wbc->from_pool = false;
+ if (i < ceph_wbc->locked_pages) {
+ BUG_ON(ceph_wbc->num_ops <= req->r_num_ops);
+ ceph_wbc->num_ops -= req->r_num_ops;
+ ceph_wbc->locked_pages -= i;
+
+ /* allocate new pages array for next request */
+ ceph_wbc->data_pages = ceph_wbc->pages;
+ __ceph_allocate_page_array(ceph_wbc, ceph_wbc->locked_pages);
+ memcpy(ceph_wbc->pages, ceph_wbc->data_pages + i,
+ ceph_wbc->locked_pages * sizeof(*ceph_wbc->pages));
+ memset(ceph_wbc->data_pages + i, 0,
+ ceph_wbc->locked_pages * sizeof(*ceph_wbc->pages));
+ } else {
+ BUG_ON(ceph_wbc->num_ops != req->r_num_ops);
+ /* request message now owns the pages array */
+ ceph_wbc->pages = NULL;
+ }
+
+ req->r_mtime = inode_get_mtime(inode);
+ ceph_osdc_start_request(&fsc->client->osdc, req);
+ req = NULL;
+
+ wbc->nr_to_write -= i;
+ if (ceph_wbc->pages)
+ goto new_request;
+
+ return 0;
+}
+
+static
+void ceph_wait_until_current_writes_complete(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ceph_writeback_ctl *ceph_wbc)
+{
+ struct page *page;
+ unsigned i, nr;
+
+ if (wbc->sync_mode != WB_SYNC_NONE &&
+ ceph_wbc->start_index == 0 && /* all dirty pages were checked */
+ !ceph_wbc->head_snapc) {
+ ceph_wbc->index = 0;
+
+ while ((ceph_wbc->index <= ceph_wbc->end) &&
+ (nr = filemap_get_folios_tag(mapping,
+ &ceph_wbc->index,
+ (pgoff_t)-1,
+ PAGECACHE_TAG_WRITEBACK,
+ &ceph_wbc->fbatch))) {
+ for (i = 0; i < nr; i++) {
+ page = &ceph_wbc->fbatch.folios[i]->page;
+ if (page_snap_context(page) != ceph_wbc->snapc)
+ continue;
+ wait_on_page_writeback(page);
+ }
+
+ folio_batch_release(&ceph_wbc->fbatch);
+ cond_resched();
}
- if (IS_ENCRYPTED(inode))
- len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE);
+ }
+}
+
+/*
+ * initiate async writeback
+ */
+static int ceph_writepages_start(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+ struct ceph_writeback_ctl ceph_wbc;
+ int rc = 0;
- doutc(cl, "got pages at %llu~%llu\n", offset, len);
+ if (wbc->sync_mode == WB_SYNC_NONE && fsc->write_congested)
+ return 0;
- if (IS_ENCRYPTED(inode) &&
- ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK))
- pr_warn_client(cl,
- "bad encrypted write offset=%lld len=%llu\n",
- offset, len);
-
- osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len,
- 0, from_pool, false);
- osd_req_op_extent_update(req, op_idx, len);
-
- BUG_ON(op_idx + 1 != req->r_num_ops);
-
- from_pool = false;
- if (i < locked_pages) {
- BUG_ON(num_ops <= req->r_num_ops);
- num_ops -= req->r_num_ops;
- locked_pages -= i;
-
- /* allocate new pages array for next request */
- data_pages = pages;
- pages = kmalloc_array(locked_pages, sizeof(*pages),
- GFP_NOFS);
- if (!pages) {
- from_pool = true;
- pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS);
- BUG_ON(!pages);
+ doutc(cl, "%llx.%llx (mode=%s)\n", ceph_vinop(inode),
+ wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
+ (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
+
+ if (is_forced_umount(mapping)) {
+ /* we're in a forced umount, don't write! */
+ return -EIO;
+ }
+
+ ceph_init_writeback_ctl(mapping, wbc, &ceph_wbc);
+
+ if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
+ rc = -EIO;
+ goto out;
+ }
+
+retry:
+ rc = ceph_define_writeback_range(mapping, wbc, &ceph_wbc);
+ if (rc == -ENODATA) {
+ /* hmm, why does writepages get called when there
+ is no dirty data? */
+ rc = 0;
+ goto dec_osd_stopping_blocker;
+ }
+
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ tag_pages_for_writeback(mapping, ceph_wbc.index, ceph_wbc.end);
+
+ while (!has_writeback_done(&ceph_wbc)) {
+ ceph_wbc.locked_pages = 0;
+ ceph_wbc.max_pages = ceph_wbc.wsize >> PAGE_SHIFT;
+
+get_more_pages:
+ ceph_folio_batch_reinit(&ceph_wbc);
+
+ ceph_wbc.nr_folios = filemap_get_folios_tag(mapping,
+ &ceph_wbc.index,
+ ceph_wbc.end,
+ ceph_wbc.tag,
+ &ceph_wbc.fbatch);
+		doutc(cl, "filemap_get_folios_tag for tag %#x got %d\n",
+ ceph_wbc.tag, ceph_wbc.nr_folios);
+
+ if (!ceph_wbc.nr_folios && !ceph_wbc.locked_pages)
+ break;
+
+process_folio_batch:
+ rc = ceph_process_folio_batch(mapping, wbc, &ceph_wbc);
+ if (rc)
+ goto release_folios;
+
+ /* did we get anything? */
+ if (!ceph_wbc.locked_pages)
+ goto release_folios;
+
+ if (ceph_wbc.processed_in_fbatch) {
+ ceph_shift_unused_folios_left(&ceph_wbc.fbatch);
+
+ if (folio_batch_count(&ceph_wbc.fbatch) == 0 &&
+ ceph_wbc.locked_pages < ceph_wbc.max_pages) {
+ doutc(cl, "reached end fbatch, trying for more\n");
+ goto get_more_pages;
}
- memcpy(pages, data_pages + i,
- locked_pages * sizeof(*pages));
- memset(data_pages + i, 0,
- locked_pages * sizeof(*pages));
- } else {
- BUG_ON(num_ops != req->r_num_ops);
- index = pages[i - 1]->index + 1;
- /* request message now owns the pages array */
- pages = NULL;
}
- req->r_mtime = inode_get_mtime(inode);
- ceph_osdc_start_request(&fsc->client->osdc, req);
- req = NULL;
+ rc = ceph_submit_write(mapping, wbc, &ceph_wbc);
+ if (rc)
+ goto release_folios;
+
+ ceph_wbc.locked_pages = 0;
+ ceph_wbc.strip_unit_end = 0;
- wbc->nr_to_write -= i;
- if (pages)
- goto new_request;
+ if (folio_batch_count(&ceph_wbc.fbatch) > 0) {
+ ceph_wbc.nr_folios =
+ folio_batch_count(&ceph_wbc.fbatch);
+ goto process_folio_batch;
+ }
/*
* We stop writing back only if we are not doing
@@ -1377,61 +1713,44 @@ new_request:
* we tagged for writeback prior to entering this loop.
*/
if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE)
- done = true;
+ ceph_wbc.done = true;
release_folios:
doutc(cl, "folio_batch release on %d folios (%p)\n",
- (int)fbatch.nr, fbatch.nr ? fbatch.folios[0] : NULL);
- folio_batch_release(&fbatch);
+ (int)ceph_wbc.fbatch.nr,
+ ceph_wbc.fbatch.nr ? ceph_wbc.fbatch.folios[0] : NULL);
+ folio_batch_release(&ceph_wbc.fbatch);
}
- if (should_loop && !done) {
+ if (ceph_wbc.should_loop && !ceph_wbc.done) {
/* more to do; loop back to beginning of file */
doutc(cl, "looping back to beginning of file\n");
- end = start_index - 1; /* OK even when start_index == 0 */
+ /* OK even when start_index == 0 */
+ ceph_wbc.end = ceph_wbc.start_index - 1;
/* to write dirty pages associated with next snapc,
* we need to wait until current writes complete */
- if (wbc->sync_mode != WB_SYNC_NONE &&
- start_index == 0 && /* all dirty pages were checked */
- !ceph_wbc.head_snapc) {
- struct page *page;
- unsigned i, nr;
- index = 0;
- while ((index <= end) &&
- (nr = filemap_get_folios_tag(mapping, &index,
- (pgoff_t)-1,
- PAGECACHE_TAG_WRITEBACK,
- &fbatch))) {
- for (i = 0; i < nr; i++) {
- page = &fbatch.folios[i]->page;
- if (page_snap_context(page) != snapc)
- continue;
- wait_on_page_writeback(page);
- }
- folio_batch_release(&fbatch);
- cond_resched();
- }
- }
+ ceph_wait_until_current_writes_complete(mapping, wbc, &ceph_wbc);
- start_index = 0;
- index = 0;
+ ceph_wbc.start_index = 0;
+ ceph_wbc.index = 0;
goto retry;
}
- if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
- mapping->writeback_index = index;
+ if (wbc->range_cyclic || (ceph_wbc.range_whole && wbc->nr_to_write > 0))
+ mapping->writeback_index = ceph_wbc.index;
+
+dec_osd_stopping_blocker:
+ ceph_dec_osd_stopping_blocker(fsc->mdsc);
out:
- ceph_osdc_put_request(req);
- ceph_put_snap_context(last_snapc);
+ ceph_put_snap_context(ceph_wbc.last_snapc);
doutc(cl, "%llx.%llx dend - startone, rc = %d\n", ceph_vinop(inode),
rc);
+
return rc;
}
-
-
/*
* See if a given @snapc is either writeable, or already written.
*/
@@ -1447,56 +1766,56 @@ static int context_is_writeable_or_written(struct inode *inode,
/**
* ceph_find_incompatible - find an incompatible context and return it
- * @page: page being dirtied
+ * @folio: folio being dirtied
*
- * We are only allowed to write into/dirty a page if the page is
+ * We are only allowed to write into/dirty a folio if the folio is
* clean, or already dirty within the same snap context. Returns a
* conflicting context if there is one, NULL if there isn't, or a
* negative error code on other errors.
*
- * Must be called with page lock held.
+ * Must be called with folio lock held.
*/
static struct ceph_snap_context *
-ceph_find_incompatible(struct page *page)
+ceph_find_incompatible(struct folio *folio)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
if (ceph_inode_is_shutdown(inode)) {
- doutc(cl, " %llx.%llx page %p is shutdown\n",
- ceph_vinop(inode), page);
+ doutc(cl, " %llx.%llx folio %p is shutdown\n",
+ ceph_vinop(inode), folio);
return ERR_PTR(-ESTALE);
}
for (;;) {
struct ceph_snap_context *snapc, *oldest;
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
- snapc = page_snap_context(page);
+ snapc = page_snap_context(&folio->page);
if (!snapc || snapc == ci->i_head_snapc)
break;
/*
- * this page is already dirty in another (older) snap
+ * this folio is already dirty in another (older) snap
* context! is it writeable now?
*/
oldest = get_oldest_context(inode, NULL, NULL);
if (snapc->seq > oldest->seq) {
/* not writeable -- return it for the caller to deal with */
ceph_put_snap_context(oldest);
- doutc(cl, " %llx.%llx page %p snapc %p not current or oldest\n",
- ceph_vinop(inode), page, snapc);
+ doutc(cl, " %llx.%llx folio %p snapc %p not current or oldest\n",
+ ceph_vinop(inode), folio, snapc);
return ceph_get_snap_context(snapc);
}
ceph_put_snap_context(oldest);
- /* yay, writeable, do it now (without dropping page lock) */
- doutc(cl, " %llx.%llx page %p snapc %p not current, but oldest\n",
- ceph_vinop(inode), page, snapc);
- if (clear_page_dirty_for_io(page)) {
- int r = writepage_nounlock(page, NULL);
+ /* yay, writeable, do it now (without dropping folio lock) */
+ doutc(cl, " %llx.%llx folio %p snapc %p not current, but oldest\n",
+ ceph_vinop(inode), folio, snapc);
+ if (folio_clear_dirty_for_io(folio)) {
+ int r = write_folio_nounlock(folio, NULL);
if (r < 0)
return ERR_PTR(r);
}
@@ -1511,7 +1830,7 @@ static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_snap_context *snapc;
- snapc = ceph_find_incompatible(folio_page(*foliop, 0));
+ snapc = ceph_find_incompatible(*foliop);
if (snapc) {
int r;
@@ -1594,7 +1913,6 @@ out:
const struct address_space_operations ceph_aops = {
.read_folio = netfs_read_folio,
.readahead = netfs_readahead,
- .writepage = ceph_writepage,
.writepages = ceph_writepages_start,
.write_begin = ceph_write_begin,
.write_end = ceph_write_end,
@@ -1602,6 +1920,7 @@ const struct address_space_operations ceph_aops = {
.invalidate_folio = ceph_invalidate_folio,
.release_folio = netfs_release_folio,
.direct_IO = noop_direct_IO,
+ .migrate_folio = filemap_migrate_folio,
};
static void ceph_block_sigs(sigset_t *oldset)
@@ -1718,8 +2037,8 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_file_info *fi = vma->vm_file->private_data;
struct ceph_cap_flush *prealloc_cf;
- struct page *page = vmf->page;
- loff_t off = page_offset(page);
+ struct folio *folio = page_folio(vmf->page);
+ loff_t off = folio_pos(folio);
loff_t size = i_size_read(inode);
size_t len;
int want, got, err;
@@ -1736,10 +2055,10 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
ceph_block_sigs(&oldset);
- if (off + thp_size(page) <= size)
- len = thp_size(page);
+ if (off + folio_size(folio) <= size)
+ len = folio_size(folio);
else
- len = offset_in_thp(page, size);
+ len = offset_in_folio(folio, size);
doutc(cl, "%llx.%llx %llu~%zd getting caps i_size %llu\n",
ceph_vinop(inode), off, len, size);
@@ -1756,30 +2075,30 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
doutc(cl, "%llx.%llx %llu~%zd got cap refs on %s\n", ceph_vinop(inode),
off, len, ceph_cap_string(got));
- /* Update time before taking page lock */
+ /* Update time before taking folio lock */
file_update_time(vma->vm_file);
inode_inc_iversion_raw(inode);
do {
struct ceph_snap_context *snapc;
- lock_page(page);
+ folio_lock(folio);
- if (page_mkwrite_check_truncate(page, inode) < 0) {
- unlock_page(page);
+ if (folio_mkwrite_check_truncate(folio, inode) < 0) {
+ folio_unlock(folio);
ret = VM_FAULT_NOPAGE;
break;
}
- snapc = ceph_find_incompatible(page);
+ snapc = ceph_find_incompatible(folio);
if (!snapc) {
- /* success. we'll keep the page locked. */
- set_page_dirty(page);
+ /* success. we'll keep the folio locked. */
+ folio_mark_dirty(folio);
ret = VM_FAULT_LOCKED;
break;
}
- unlock_page(page);
+ folio_unlock(folio);
if (IS_ERR(snapc)) {
ret = VM_FAULT_SIGBUS;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 62e99e65250d..a321aa6d0ed2 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -141,17 +141,18 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
if (ptr_pos >= i_size_read(dir))
return NULL;
- if (!cache_ctl->page || ptr_pgoff != cache_ctl->page->index) {
+ if (!cache_ctl->folio || ptr_pgoff != cache_ctl->folio->index) {
ceph_readdir_cache_release(cache_ctl);
- cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff);
- if (!cache_ctl->page) {
- doutc(cl, " page %lu not found\n", ptr_pgoff);
+ cache_ctl->folio = filemap_lock_folio(&dir->i_data, ptr_pgoff);
+ if (IS_ERR(cache_ctl->folio)) {
+ cache_ctl->folio = NULL;
+ doutc(cl, " folio %lu not found\n", ptr_pgoff);
return ERR_PTR(-EAGAIN);
}
/* reading/filling the cache are serialized by
- i_rwsem, no need to use page lock */
- unlock_page(cache_ctl->page);
- cache_ctl->dentries = kmap(cache_ctl->page);
+ i_rwsem, no need to use folio lock */
+ folio_unlock(cache_ctl->folio);
+ cache_ctl->dentries = kmap_local_folio(cache_ctl->folio, 0);
}
cache_ctl->index = idx & idx_mask;
@@ -1092,19 +1093,20 @@ out:
return err;
}
-static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
struct ceph_acl_sec_ctx as_ctx = {};
+ struct dentry *ret;
int err;
int op;
err = ceph_wait_on_conflict_unlink(dentry);
if (err)
- return err;
+ return ERR_PTR(err);
if (ceph_snap(dir) == CEPH_SNAPDIR) {
/* mkdir .snap/foo is a MKSNAP */
@@ -1116,32 +1118,32 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir,
ceph_vinop(dir), dentry, dentry, mode);
op = CEPH_MDS_OP_MKDIR;
} else {
- err = -EROFS;
+ ret = ERR_PTR(-EROFS);
goto out;
}
if (op == CEPH_MDS_OP_MKDIR &&
ceph_quota_is_max_files_exceeded(dir)) {
- err = -EDQUOT;
+ ret = ERR_PTR(-EDQUOT);
goto out;
}
if ((op == CEPH_MDS_OP_MKSNAP) && IS_ENCRYPTED(dir) &&
!fscrypt_has_encryption_key(dir)) {
- err = -ENOKEY;
+ ret = ERR_PTR(-ENOKEY);
goto out;
}
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req)) {
- err = PTR_ERR(req);
+ ret = ERR_CAST(req);
goto out;
}
mode |= S_IFDIR;
req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx);
if (IS_ERR(req->r_new_inode)) {
- err = PTR_ERR(req->r_new_inode);
+ ret = ERR_CAST(req->r_new_inode);
req->r_new_inode = NULL;
goto out_req;
}
@@ -1165,15 +1167,22 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir,
!req->r_reply_info.head->is_target &&
!req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
+ ret = ERR_PTR(err);
out_req:
+ if (!IS_ERR(ret) && req->r_dentry != dentry)
+ /* Some other dentry was spliced in */
+ ret = dget(req->r_dentry);
ceph_mdsc_put_request(req);
out:
- if (!err)
+ if (!IS_ERR(ret)) {
+ if (ret)
+ dentry = ret;
ceph_init_inode_acls(d_inode(dentry), &as_ctx);
- else
+ } else {
d_drop(dentry);
+ }
ceph_release_acl_sec_ctx(&as_ctx);
- return err;
+ return ret;
}
static int ceph_link(struct dentry *old_dentry, struct inode *dir,
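ceph_mkdir() is one of many conversions in this series to the new ->mkdir signature (see also coda, configfs, ecryptfs, exfat, ext2, ext4, f2fs and fat below). The contract, as inferred from these hunks: return NULL on success when the passed-in dentry was used, a referenced replacement dentry when one was spliced in, or an ERR_PTR(). A minimal conforming implementation, where demo_new_dir_inode() is a hypothetical helper:

	static struct dentry *demo_mkdir(struct mnt_idmap *idmap, struct inode *dir,
					 struct dentry *dentry, umode_t mode)
	{
		struct inode *inode;

		inode = demo_new_dir_inode(dir, mode);	/* hypothetical */
		if (IS_ERR(inode))
			return ERR_CAST(inode);

		d_instantiate(dentry, inode);
		return NULL;	/* success; the caller keeps using @dentry */
	}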
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 7dd6c2275085..6ac2bd555e86 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1845,10 +1845,9 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl)
{
- if (ctl->page) {
- kunmap(ctl->page);
- put_page(ctl->page);
- ctl->page = NULL;
+ if (ctl->folio) {
+ folio_release_kmap(ctl->folio, ctl->dentries);
+ ctl->folio = NULL;
}
}
@@ -1862,20 +1861,26 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
unsigned idx = ctl->index % nsize;
pgoff_t pgoff = ctl->index / nsize;
- if (!ctl->page || pgoff != ctl->page->index) {
+ if (!ctl->folio || pgoff != ctl->folio->index) {
ceph_readdir_cache_release(ctl);
+ fgf_t fgf = FGP_LOCK;
+
if (idx == 0)
- ctl->page = grab_cache_page(&dir->i_data, pgoff);
- else
- ctl->page = find_lock_page(&dir->i_data, pgoff);
- if (!ctl->page) {
+ fgf |= FGP_ACCESSED | FGP_CREAT;
+
+ ctl->folio = __filemap_get_folio(&dir->i_data, pgoff,
+ fgf, mapping_gfp_mask(&dir->i_data));
+ if (IS_ERR(ctl->folio)) {
+ int err = PTR_ERR(ctl->folio);
+
+ ctl->folio = NULL;
ctl->index = -1;
- return idx == 0 ? -ENOMEM : 0;
+ return idx == 0 ? err : 0;
}
/* reading/filling the cache are serialized by
- * i_rwsem, no need to use page lock */
- unlock_page(ctl->page);
- ctl->dentries = kmap(ctl->page);
+ * i_rwsem, no need to use folio lock */
+ folio_unlock(ctl->folio);
+ ctl->dentries = kmap_local_folio(ctl->folio, 0);
if (idx == 0)
memset(ctl->dentries, 0, PAGE_SIZE);
}
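Both readdir-cache call sites now funnel through __filemap_get_folio() with FGP flags instead of the grab_cache_page()/find_lock_page() pair, and pair kmap_local_folio() with folio_release_kmap(). A condensed sketch of the lookup, assuming the same serialization by i_rwsem as in the hunks:

	static void *demo_map_cache_folio(struct address_space *mapping,
					  pgoff_t index, bool may_create,
					  struct folio **foliop)
	{
		fgf_t fgf = FGP_LOCK;
		struct folio *folio;

		if (may_create)
			fgf |= FGP_ACCESSED | FGP_CREAT;

		folio = __filemap_get_folio(mapping, index, fgf,
					    mapping_gfp_mask(mapping));
		if (IS_ERR(folio))	/* ERR_PTR(), never NULL, on failure */
			return ERR_CAST(folio);

		folio_unlock(folio);	/* caller is serialized by i_rwsem */
		*foliop = folio;
		return kmap_local_folio(folio, 0); /* undo with folio_release_kmap() */
	}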
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 54b3421501e9..230e0c3f341f 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -5489,6 +5489,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
spin_lock_init(&mdsc->stopping_lock);
atomic_set(&mdsc->stopping_blockers, 0);
init_completion(&mdsc->stopping_waiter);
+ atomic64_set(&mdsc->dirty_folios, 0);
+ init_waitqueue_head(&mdsc->flush_end_wq);
init_waitqueue_head(&mdsc->session_close_wq);
INIT_LIST_HEAD(&mdsc->waiting_for_map);
mdsc->quotarealms_inodes = RB_ROOT;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 7c9fee9e80d4..3e2a6fa7c19a 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -458,6 +458,9 @@ struct ceph_mds_client {
atomic_t stopping_blockers;
struct completion stopping_waiter;
+ atomic64_t dirty_folios;
+ wait_queue_head_t flush_end_wq;
+
atomic64_t quotarealms_count; /* # realms with quota */
/*
* We keep a list of inodes we don't see in the mountpoint but that we
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 4344e1f11806..f3951253e393 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1563,6 +1563,17 @@ static void ceph_kill_sb(struct super_block *s)
*/
sync_filesystem(s);
+ if (atomic64_read(&mdsc->dirty_folios) > 0) {
+ wait_queue_head_t *wq = &mdsc->flush_end_wq;
+ long timeleft = wait_event_killable_timeout(*wq,
+ atomic64_read(&mdsc->dirty_folios) <= 0,
+ fsc->client->options->mount_timeout);
+ if (!timeleft) /* timed out */
+ pr_warn_client(cl, "umount timed out, %ld\n", timeleft);
+ else if (timeleft < 0) /* killed */
+ pr_warn_client(cl, "umount was killed, %ld\n", timeleft);
+ }
+
spin_lock(&mdsc->stopping_lock);
mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHING;
wait = !!atomic_read(&mdsc->stopping_blockers);
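wait_event_killable_timeout() returns a positive remainder when the condition is met, 0 on timeout and a negative value on a fatal signal, which is exactly what the two pr_warn_client() branches distinguish. The wait only works if the writeback completion path decrements mdsc->dirty_folios and wakes flush_end_wq; that side lives in fs/ceph/addr.c and is not shown in this excerpt, so the following pairing is an assumption:

	static void demo_folios_written(struct ceph_mds_client *mdsc, long nr)
	{
		/* hypothetical completion-side counterpart of the umount wait */
		if (atomic64_sub_return(nr, &mdsc->dirty_folios) <= 0)
			wake_up_all(&mdsc->flush_end_wq);
	}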
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 7fa1e7be50e4..bb0db0cc8003 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -903,7 +903,7 @@ ceph_find_rw_context(struct ceph_file_info *cf)
}
struct ceph_readdir_cache_control {
- struct page *page;
+ struct folio *folio;
struct dentry **dentries;
int index;
};
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index a3e2dfeedfbf..ab69d8f0cec2 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -166,8 +166,8 @@ err_out:
return error;
}
-static int coda_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *de, umode_t mode)
+static struct dentry *coda_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *de, umode_t mode)
{
struct inode *inode;
struct coda_vattr attrs;
@@ -177,14 +177,14 @@ static int coda_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct CodaFid newfid;
if (is_root_inode(dir) && coda_iscontrol(name, len))
- return -EPERM;
+ return ERR_PTR(-EPERM);
attrs.va_mode = mode;
- error = venus_mkdir(dir->i_sb, coda_i2f(dir),
+ error = venus_mkdir(dir->i_sb, coda_i2f(dir),
name, len, &newfid, &attrs);
if (error)
goto err_out;
-
+
inode = coda_iget(dir->i_sb, &newfid, &attrs);
if (IS_ERR(inode)) {
error = PTR_ERR(inode);
@@ -195,10 +195,10 @@ static int coda_mkdir(struct mnt_idmap *idmap, struct inode *dir,
coda_dir_inc_nlink(dir);
coda_dir_update_mtime(dir);
d_instantiate(de, inode);
- return 0;
+ return NULL;
err_out:
d_drop(de);
- return error;
+ return ERR_PTR(error);
}
/* try to make de an entry in dir_inode linked to source_de */
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 7d10278db30d..5568cb74b322 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1280,8 +1280,8 @@ out_root_unlock:
}
EXPORT_SYMBOL(configfs_depend_item_unlocked);
-static int configfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *configfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int ret = 0;
int module_got = 0;
@@ -1461,7 +1461,7 @@ out_put:
put_fragment(frag);
out:
- return ret;
+ return ERR_PTR(ret);
}
static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
diff --git a/fs/coredump.c b/fs/coredump.c
index 4375c70144d0..d6a92cd6018e 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -926,14 +926,23 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
{
unsigned long addr;
struct page *dump_page;
+ int locked, ret;
dump_page = dump_page_alloc();
if (!dump_page)
return 0;
+ ret = 0;
+ locked = 0;
for (addr = start; addr < start + len; addr += PAGE_SIZE) {
struct page *page;
+ if (!locked) {
+ if (mmap_read_lock_killable(current->mm))
+ goto out;
+ locked = 1;
+ }
+
/*
* To avoid having to allocate page tables for virtual address
* ranges that have never been used yet, and also to make it
@@ -941,21 +950,38 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
* NULL when encountering an empty page table entry that would
* otherwise have been filled with the zero page.
*/
- page = get_dump_page(addr);
+ page = get_dump_page(addr, &locked);
if (page) {
+ if (locked) {
+ mmap_read_unlock(current->mm);
+ locked = 0;
+ }
int stop = !dump_emit_page(cprm, dump_page_copy(page, dump_page));
put_page(page);
- if (stop) {
- dump_page_free(dump_page);
- return 0;
- }
+ if (stop)
+ goto out;
} else {
dump_skip(cprm, PAGE_SIZE);
}
+
+ if (dump_interrupted())
+ goto out;
+
+ if (!need_resched())
+ continue;
+ if (locked) {
+ mmap_read_unlock(current->mm);
+ locked = 0;
+ }
cond_resched();
}
+ ret = 1;
+out:
+ if (locked)
+ mmap_read_unlock(current->mm);
+
dump_page_free(dump_page);
- return 1;
+ return ret;
}
#endif
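The rework turns dump_user_range() into a lock-juggling loop: mmap_read_lock is taken lazily, get_dump_page() reports via *locked whether it dropped the lock itself, and the lock is released across the slow copy-out and before any reschedule. A condensed sketch of just the locking skeleton, with the actual page emission elided:

	static int demo_walk_range(struct mm_struct *mm, unsigned long start,
				   unsigned long len)
	{
		unsigned long addr;
		int locked = 0;

		for (addr = start; addr < start + len; addr += PAGE_SIZE) {
			struct page *page;

			if (!locked) {
				if (mmap_read_lock_killable(mm))
					return -EINTR;
				locked = 1;
			}
			/* may drop the lock and report that via &locked */
			page = get_dump_page(addr, &locked);
			if (page) {
				if (locked) {
					/* drop before the slow copy-out */
					mmap_read_unlock(mm);
					locked = 0;
				}
				put_page(page);
			}
			if (need_resched()) {
				if (locked) {
					mmap_read_unlock(mm);
					locked = 0;
				}
				cond_resched();
			}
		}
		if (locked)
			mmap_read_unlock(mm);
		return 0;
	}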
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 328470d40dec..b74b5937e695 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -153,8 +153,8 @@ int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
}
/**
- * fscrypt_encrypt_pagecache_blocks() - Encrypt data from a pagecache page
- * @page: the locked pagecache page containing the data to encrypt
+ * fscrypt_encrypt_pagecache_blocks() - Encrypt data from a pagecache folio
+ * @folio: the locked pagecache folio containing the data to encrypt
* @len: size of the data to encrypt, in bytes
* @offs: offset within @page of the data to encrypt, in bytes
* @gfp_flags: memory allocation flags; see details below
@@ -177,23 +177,21 @@ int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
*
* Return: the new encrypted bounce page on success; an ERR_PTR() on failure
*/
-struct page *fscrypt_encrypt_pagecache_blocks(struct page *page,
- unsigned int len,
- unsigned int offs,
- gfp_t gfp_flags)
-
+struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio,
+ size_t len, size_t offs, gfp_t gfp_flags)
{
- const struct inode *inode = page->mapping->host;
+ const struct inode *inode = folio->mapping->host;
const struct fscrypt_inode_info *ci = inode->i_crypt_info;
const unsigned int du_bits = ci->ci_data_unit_bits;
const unsigned int du_size = 1U << du_bits;
struct page *ciphertext_page;
- u64 index = ((u64)page->index << (PAGE_SHIFT - du_bits)) +
+ u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) +
(offs >> du_bits);
unsigned int i;
int err;
- if (WARN_ON_ONCE(!PageLocked(page)))
+ VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
+ if (WARN_ON_ONCE(!folio_test_locked(folio)))
return ERR_PTR(-EINVAL);
if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, du_size)))
@@ -205,7 +203,7 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct page *page,
for (i = offs; i < offs + len; i += du_size, index++) {
err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, index,
- page, ciphertext_page,
+ &folio->page, ciphertext_page,
du_size, i, gfp_flags);
if (err) {
fscrypt_free_bounce_page(ciphertext_page);
@@ -213,7 +211,7 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct page *page,
}
}
SetPagePrivate(ciphertext_page);
- set_page_private(ciphertext_page, (unsigned long)page);
+ set_page_private(ciphertext_page, (unsigned long)folio);
return ciphertext_page;
}
EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks);
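Callers now pass the locked folio straight in; the ext4 and f2fs hunks below show the two mechanical conversions. A hedged caller-side sketch with I/O submission elided:

	static int demo_encrypt_folio(struct folio *folio, size_t len)
	{
		struct page *bounce_page;

		bounce_page = fscrypt_encrypt_pagecache_blocks(folio, len, 0,
							       GFP_NOFS);
		if (IS_ERR(bounce_page))
			return PTR_ERR(bounce_page);

		/* ... submit bounce_page for write-out ... */
		fscrypt_free_bounce_page(bounce_page);
		return 0;
	}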
diff --git a/fs/dax.c b/fs/dax.c
index 21b47402b3dc..7fd4cd9a51f2 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1258,7 +1258,7 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
}
#endif /* CONFIG_FS_DAX_PMD */
-static s64 dax_unshare_iter(struct iomap_iter *iter)
+static int dax_unshare_iter(struct iomap_iter *iter)
{
struct iomap *iomap = &iter->iomap;
const struct iomap *srcmap = iomap_iter_srcmap(iter);
@@ -1266,11 +1266,11 @@ static s64 dax_unshare_iter(struct iomap_iter *iter)
u64 copy_len = iomap_length(iter);
u32 mod;
int id = 0;
- s64 ret = 0;
+ s64 ret;
void *daddr = NULL, *saddr = NULL;
if (!iomap_want_unshare_iter(iter))
- return iomap_length(iter);
+ return iomap_iter_advance_full(iter);
/*
* Extend the file range to be aligned to fsblock/pagesize, because
@@ -1300,14 +1300,14 @@ static s64 dax_unshare_iter(struct iomap_iter *iter)
if (ret < 0)
goto out_unlock;
- if (copy_mc_to_kernel(daddr, saddr, copy_len) == 0)
- ret = iomap_length(iter);
- else
+ if (copy_mc_to_kernel(daddr, saddr, copy_len) != 0)
ret = -EIO;
out_unlock:
dax_read_unlock(id);
- return dax_mem2blk_err(ret);
+ if (ret < 0)
+ return dax_mem2blk_err(ret);
+ return iomap_iter_advance_full(iter);
}
int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
@@ -1326,7 +1326,7 @@ int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
iter.len = min(len, size - pos);
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = dax_unshare_iter(&iter);
+ iter.status = dax_unshare_iter(&iter);
return ret;
}
EXPORT_SYMBOL_GPL(dax_file_unshare);
@@ -1354,17 +1354,16 @@ static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
return ret;
}
-static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
+static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
{
const struct iomap *iomap = &iter->iomap;
const struct iomap *srcmap = iomap_iter_srcmap(iter);
- loff_t pos = iter->pos;
u64 length = iomap_length(iter);
- s64 written = 0;
+ int ret;
/* already zeroed? we're done. */
if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
- return length;
+ return iomap_iter_advance(iter, &length);
/*
* invalidate the pages whose sharing state is to be changed
@@ -1372,33 +1371,35 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
*/
if (iomap->flags & IOMAP_F_SHARED)
invalidate_inode_pages2_range(iter->inode->i_mapping,
- pos >> PAGE_SHIFT,
- (pos + length - 1) >> PAGE_SHIFT);
+ iter->pos >> PAGE_SHIFT,
+ (iter->pos + length - 1) >> PAGE_SHIFT);
do {
+ loff_t pos = iter->pos;
unsigned offset = offset_in_page(pos);
- unsigned size = min_t(u64, PAGE_SIZE - offset, length);
pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
- long rc;
int id;
+ length = min_t(u64, PAGE_SIZE - offset, length);
+
id = dax_read_lock();
- if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
- rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
+ if (IS_ALIGNED(pos, PAGE_SIZE) && length == PAGE_SIZE)
+ ret = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
else
- rc = dax_memzero(iter, pos, size);
+ ret = dax_memzero(iter, pos, length);
dax_read_unlock(id);
- if (rc < 0)
- return rc;
- pos += size;
- length -= size;
- written += size;
+ if (ret < 0)
+ return ret;
+
+ ret = iomap_iter_advance(iter, &length);
+ if (ret)
+ return ret;
} while (length > 0);
if (did_zero)
*did_zero = true;
- return written;
+ return ret;
}
int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
@@ -1413,7 +1414,7 @@ int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
int ret;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = dax_zero_iter(&iter, did_zero);
+ iter.status = dax_zero_iter(&iter, did_zero);
return ret;
}
EXPORT_SYMBOL_GPL(dax_zero_range);
@@ -1431,8 +1432,7 @@ int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
}
EXPORT_SYMBOL_GPL(dax_truncate_page);
-static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
- struct iov_iter *iter)
+static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter)
{
const struct iomap *iomap = &iomi->iomap;
const struct iomap *srcmap = iomap_iter_srcmap(iomi);
@@ -1451,8 +1451,10 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
if (pos >= end)
return 0;
- if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
- return iov_iter_zero(min(length, end - pos), iter);
+ if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) {
+ done = iov_iter_zero(min(length, end - pos), iter);
+ return iomap_iter_advance(iomi, &done);
+ }
}
/*
@@ -1485,7 +1487,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
}
id = dax_read_lock();
- while (pos < end) {
+ while ((pos = iomi->pos) < end) {
unsigned offset = pos & (PAGE_SIZE - 1);
const size_t size = ALIGN(length + offset, PAGE_SIZE);
pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
@@ -1535,18 +1537,16 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
map_len, iter);
- pos += xfer;
- length -= xfer;
- done += xfer;
-
- if (xfer == 0)
+ length = xfer;
+ ret = iomap_iter_advance(iomi, &length);
+ if (!ret && xfer == 0)
ret = -EFAULT;
if (xfer < map_len)
break;
}
dax_read_unlock(id);
- return done ? done : ret;
+ return ret;
}
/**
@@ -1586,7 +1586,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
iomi.flags |= IOMAP_NOWAIT;
while ((ret = iomap_iter(&iomi, ops)) > 0)
- iomi.processed = dax_iomap_iter(&iomi, iter);
+ iomi.status = dax_iomap_iter(&iomi, iter);
done = iomi.pos - iocb->ki_pos;
iocb->ki_pos = iomi.pos;
@@ -1757,7 +1757,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
while ((error = iomap_iter(&iter, ops)) > 0) {
if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) {
- iter.processed = -EIO; /* fs corruption? */
+ iter.status = -EIO; /* fs corruption? */
continue;
}
@@ -1769,8 +1769,10 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
ret |= VM_FAULT_MAJOR;
}
- if (!(ret & VM_FAULT_ERROR))
- iter.processed = PAGE_SIZE;
+ if (!(ret & VM_FAULT_ERROR)) {
+ u64 length = PAGE_SIZE;
+ iter.status = iomap_iter_advance(&iter, &length);
+ }
}
if (iomap_errp)
@@ -1883,8 +1885,10 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
continue; /* actually breaks out of the loop */
ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true);
- if (ret != VM_FAULT_FALLBACK)
- iter.processed = PMD_SIZE;
+ if (ret != VM_FAULT_FALLBACK) {
+ u64 length = PMD_SIZE;
+ iter.status = iomap_iter_advance(&iter, &length);
+ }
}
unlock_entry:
@@ -1999,12 +2003,13 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order,
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
-static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
+static int dax_range_compare_iter(struct iomap_iter *it_src,
struct iomap_iter *it_dest, u64 len, bool *same)
{
const struct iomap *smap = &it_src->iomap;
const struct iomap *dmap = &it_dest->iomap;
loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
+ u64 dest_len;
void *saddr, *daddr;
int id, ret;
@@ -2012,7 +2017,7 @@ static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
*same = true;
- return len;
+ goto advance;
}
if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
@@ -2035,7 +2040,13 @@ static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
if (!*same)
len = 0;
dax_read_unlock(id);
- return len;
+
+advance:
+ dest_len = len;
+ ret = iomap_iter_advance(it_src, &len);
+ if (!ret)
+ ret = iomap_iter_advance(it_dest, &dest_len);
+ return ret;
out_unlock:
dax_read_unlock(id);
@@ -2058,15 +2069,15 @@ int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
.len = len,
.flags = IOMAP_DAX,
};
- int ret, compared = 0;
+ int ret, status;
while ((ret = iomap_iter(&src_iter, ops)) > 0 &&
(ret = iomap_iter(&dst_iter, ops)) > 0) {
- compared = dax_range_compare_iter(&src_iter, &dst_iter,
+ status = dax_range_compare_iter(&src_iter, &dst_iter,
min(src_iter.len, dst_iter.len), same);
- if (compared < 0)
+ if (status < 0)
return ret;
- src_iter.processed = dst_iter.processed = compared;
+ src_iter.status = dst_iter.status = status;
}
return ret;
}
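All of the dax changes above follow one conversion: iteration callbacks no longer return a byte count into iter.processed; they advance the iterator themselves with iomap_iter_advance() (or iomap_iter_advance_full()) and return an int status into iter.status. The canonical loop shape after this series, with demo_iter() standing in for a real callback:

	static int demo_iter(struct iomap_iter *iter)
	{
		u64 length = iomap_length(iter);

		/* ... operate on [iter->pos, iter->pos + length) ... */

		return iomap_iter_advance(iter, &length); /* moves iter->pos */
	}

	static int demo_range_op(struct inode *inode, loff_t pos, loff_t len,
				 const struct iomap_ops *ops)
	{
		struct iomap_iter iter = {
			.inode	= inode,
			.pos	= pos,
			.len	= len,
		};
		int ret;

		while ((ret = iomap_iter(&iter, ops)) > 0)
			iter.status = demo_iter(&iter);
		return ret;
	}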
diff --git a/fs/dcache.c b/fs/dcache.c
index e3634916ffb9..623947d6a676 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2480,7 +2480,8 @@ static inline void end_dir_add(struct inode *dir, unsigned int n,
{
smp_store_release(&dir->i_dir_seq, n + 2);
preempt_enable_nested();
- wake_up_all(d_wait);
+ if (wq_has_sleeper(d_wait))
+ wake_up_all(d_wait);
}
static void d_wait_lookup(struct dentry *dentry)
@@ -2687,52 +2688,6 @@ void d_add(struct dentry *entry, struct inode *inode)
}
EXPORT_SYMBOL(d_add);
-/**
- * d_exact_alias - find and hash an exact unhashed alias
- * @entry: dentry to add
- * @inode: The inode to go with this dentry
- *
- * If an unhashed dentry with the same name/parent and desired
- * inode already exists, hash and return it. Otherwise, return
- * NULL.
- *
- * Parent directory should be locked.
- */
-struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
-{
- struct dentry *alias;
- unsigned int hash = entry->d_name.hash;
-
- spin_lock(&inode->i_lock);
- hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
- /*
- * Don't need alias->d_lock here, because aliases with
- * d_parent == entry->d_parent are not subject to name or
- * parent changes, because the parent inode i_mutex is held.
- */
- if (alias->d_name.hash != hash)
- continue;
- if (alias->d_parent != entry->d_parent)
- continue;
- if (!d_same_name(alias, entry->d_parent, &entry->d_name))
- continue;
- spin_lock(&alias->d_lock);
- if (!d_unhashed(alias)) {
- spin_unlock(&alias->d_lock);
- alias = NULL;
- } else {
- dget_dlock(alias);
- __d_rehash(alias);
- spin_unlock(&alias->d_lock);
- }
- spin_unlock(&inode->i_lock);
- return alias;
- }
- spin_unlock(&inode->i_lock);
- return NULL;
-}
-EXPORT_SYMBOL(d_exact_alias);
-
static void swap_names(struct dentry *dentry, struct dentry *target)
{
if (unlikely(dname_external(target))) {
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 1096ff8562fa..42e4d6eeb29f 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -12,6 +12,8 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include <linux/slab.h>
@@ -21,7 +23,6 @@
#include <linux/magic.h>
#include <linux/idr.h>
#include <linux/devpts_fs.h>
-#include <linux/parser.h>
#include <linux/fsnotify.h>
#include <linux/seq_file.h>
@@ -87,14 +88,14 @@ enum {
Opt_err
};
-static const match_table_t tokens = {
- {Opt_uid, "uid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_mode, "mode=%o"},
- {Opt_ptmxmode, "ptmxmode=%o"},
- {Opt_newinstance, "newinstance"},
- {Opt_max, "max=%d"},
- {Opt_err, NULL}
+static const struct fs_parameter_spec devpts_param_specs[] = {
+ fsparam_u32 ("gid", Opt_gid),
+ fsparam_s32 ("max", Opt_max),
+ fsparam_u32oct ("mode", Opt_mode),
+ fsparam_flag ("newinstance", Opt_newinstance),
+ fsparam_u32oct ("ptmxmode", Opt_ptmxmode),
+ fsparam_u32 ("uid", Opt_uid),
+ {}
};
struct pts_fs_info {
@@ -214,93 +215,48 @@ void devpts_release(struct pts_fs_info *fsi)
deactivate_super(fsi->sb);
}
-#define PARSE_MOUNT 0
-#define PARSE_REMOUNT 1
-
/*
- * parse_mount_options():
- * Set @opts to mount options specified in @data. If an option is not
- * specified in @data, set it to its default value.
- *
- * Note: @data may be NULL (in which case all options are set to default).
+ * devpts_parse_param - Parse mount parameters
*/
-static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
+static int devpts_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *p;
- kuid_t uid;
- kgid_t gid;
-
- opts->setuid = 0;
- opts->setgid = 0;
- opts->uid = GLOBAL_ROOT_UID;
- opts->gid = GLOBAL_ROOT_GID;
- opts->mode = DEVPTS_DEFAULT_MODE;
- opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
- opts->max = NR_UNIX98_PTY_MAX;
-
- /* Only allow instances mounted from the initial mount
- * namespace to tap the reserve pool of ptys.
- */
- if (op == PARSE_MOUNT)
- opts->reserve =
- (current->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns);
-
- while ((p = strsep(&data, ",")) != NULL) {
- substring_t args[MAX_OPT_ARGS];
- int token;
- int option;
-
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_uid:
- if (match_int(&args[0], &option))
- return -EINVAL;
- uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(uid))
- return -EINVAL;
- opts->uid = uid;
- opts->setuid = 1;
- break;
- case Opt_gid:
- if (match_int(&args[0], &option))
- return -EINVAL;
- gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(gid))
- return -EINVAL;
- opts->gid = gid;
- opts->setgid = 1;
- break;
- case Opt_mode:
- if (match_octal(&args[0], &option))
- return -EINVAL;
- opts->mode = option & S_IALLUGO;
- break;
- case Opt_ptmxmode:
- if (match_octal(&args[0], &option))
- return -EINVAL;
- opts->ptmxmode = option & S_IALLUGO;
- break;
- case Opt_newinstance:
- break;
- case Opt_max:
- if (match_int(&args[0], &option) ||
- option < 0 || option > NR_UNIX98_PTY_MAX)
- return -EINVAL;
- opts->max = option;
- break;
- default:
- pr_err("called with bogus options\n");
- return -EINVAL;
- }
+ struct pts_fs_info *fsi = fc->s_fs_info;
+ struct pts_mount_opts *opts = &fsi->mount_opts;
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, devpts_param_specs, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_uid:
+ opts->uid = result.uid;
+ opts->setuid = 1;
+ break;
+ case Opt_gid:
+ opts->gid = result.gid;
+ opts->setgid = 1;
+ break;
+ case Opt_mode:
+ opts->mode = result.uint_32 & S_IALLUGO;
+ break;
+ case Opt_ptmxmode:
+ opts->ptmxmode = result.uint_32 & S_IALLUGO;
+ break;
+ case Opt_newinstance:
+ break;
+ case Opt_max:
+ if (result.uint_32 > NR_UNIX98_PTY_MAX)
+ return invalf(fc, "max out of range");
+ opts->max = result.uint_32;
+ break;
}
return 0;
}
-static int mknod_ptmx(struct super_block *sb)
+static int mknod_ptmx(struct super_block *sb, struct fs_context *fc)
{
int mode;
int rc = -ENOMEM;
@@ -362,13 +318,23 @@ static void update_ptmx_mode(struct pts_fs_info *fsi)
}
}
-static int devpts_remount(struct super_block *sb, int *flags, char *data)
+static int devpts_reconfigure(struct fs_context *fc)
{
- int err;
- struct pts_fs_info *fsi = DEVPTS_SB(sb);
- struct pts_mount_opts *opts = &fsi->mount_opts;
+ struct pts_fs_info *fsi = DEVPTS_SB(fc->root->d_sb);
+ struct pts_fs_info *new = fc->s_fs_info;
- err = parse_mount_options(data, PARSE_REMOUNT, opts);
+ /* Apply the revised options. We don't want to change ->reserve.
+ * Ideally, we'd update each option conditionally on it having been
+ * explicitly changed, but the default is to reset everything so that
+ * would break UAPI...
+ */
+ fsi->mount_opts.setuid = new->mount_opts.setuid;
+ fsi->mount_opts.setgid = new->mount_opts.setgid;
+ fsi->mount_opts.uid = new->mount_opts.uid;
+ fsi->mount_opts.gid = new->mount_opts.gid;
+ fsi->mount_opts.mode = new->mount_opts.mode;
+ fsi->mount_opts.ptmxmode = new->mount_opts.ptmxmode;
+ fsi->mount_opts.max = new->mount_opts.max;
/*
* parse_mount_options() restores options to default values
@@ -378,7 +344,7 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
*/
update_ptmx_mode(fsi);
- return err;
+ return 0;
}
static int devpts_show_options(struct seq_file *seq, struct dentry *root)
@@ -402,31 +368,13 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root)
static const struct super_operations devpts_sops = {
.statfs = simple_statfs,
- .remount_fs = devpts_remount,
.show_options = devpts_show_options,
};
-static void *new_pts_fs_info(struct super_block *sb)
-{
- struct pts_fs_info *fsi;
-
- fsi = kzalloc(sizeof(struct pts_fs_info), GFP_KERNEL);
- if (!fsi)
- return NULL;
-
- ida_init(&fsi->allocated_ptys);
- fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE;
- fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
- fsi->sb = sb;
-
- return fsi;
-}
-
-static int
-devpts_fill_super(struct super_block *s, void *data, int silent)
+static int devpts_fill_super(struct super_block *s, struct fs_context *fc)
{
+ struct pts_fs_info *fsi = DEVPTS_SB(s);
struct inode *inode;
- int error;
s->s_iflags &= ~SB_I_NODEV;
s->s_blocksize = 1024;
@@ -435,20 +383,11 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
s->s_op = &devpts_sops;
s->s_d_op = &simple_dentry_operations;
s->s_time_gran = 1;
+ fsi->sb = s;
- error = -ENOMEM;
- s->s_fs_info = new_pts_fs_info(s);
- if (!s->s_fs_info)
- goto fail;
-
- error = parse_mount_options(data, PARSE_MOUNT, &DEVPTS_SB(s)->mount_opts);
- if (error)
- goto fail;
-
- error = -ENOMEM;
inode = new_inode(s);
if (!inode)
- goto fail;
+ return -ENOMEM;
inode->i_ino = 1;
simple_inode_init_ts(inode);
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
@@ -459,31 +398,60 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
s->s_root = d_make_root(inode);
if (!s->s_root) {
pr_err("get root dentry failed\n");
- goto fail;
+ return -ENOMEM;
}
- error = mknod_ptmx(s);
- if (error)
- goto fail_dput;
-
- return 0;
-fail_dput:
- dput(s->s_root);
- s->s_root = NULL;
-fail:
- return error;
+ return mknod_ptmx(s, fc);
}
/*
- * devpts_mount()
+ * devpts_get_tree()
*
* Mount a new (private) instance of devpts. PTYs created in this
* instance are independent of the PTYs in other devpts instances.
*/
-static struct dentry *devpts_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int devpts_get_tree(struct fs_context *fc)
+{
+ return get_tree_nodev(fc, devpts_fill_super);
+}
+
+static void devpts_free_fc(struct fs_context *fc)
+{
+ kfree(fc->s_fs_info);
+}
+
+static const struct fs_context_operations devpts_context_ops = {
+ .free = devpts_free_fc,
+ .parse_param = devpts_parse_param,
+ .get_tree = devpts_get_tree,
+ .reconfigure = devpts_reconfigure,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+static int devpts_init_fs_context(struct fs_context *fc)
{
- return mount_nodev(fs_type, flags, data, devpts_fill_super);
+ struct pts_fs_info *fsi;
+
+ fsi = kzalloc(sizeof(struct pts_fs_info), GFP_KERNEL);
+ if (!fsi)
+ return -ENOMEM;
+
+ ida_init(&fsi->allocated_ptys);
+ fsi->mount_opts.uid = GLOBAL_ROOT_UID;
+ fsi->mount_opts.gid = GLOBAL_ROOT_GID;
+ fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE;
+ fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
+ fsi->mount_opts.max = NR_UNIX98_PTY_MAX;
+
+ if (fc->purpose == FS_CONTEXT_FOR_MOUNT &&
+ current->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns)
+ fsi->mount_opts.reserve = true;
+
+ fc->s_fs_info = fsi;
+ fc->ops = &devpts_context_ops;
+ return 0;
}
static void devpts_kill_sb(struct super_block *sb)
@@ -498,7 +466,8 @@ static void devpts_kill_sb(struct super_block *sb)
static struct file_system_type devpts_fs_type = {
.name = "devpts",
- .mount = devpts_mount,
+ .init_fs_context = devpts_init_fs_context,
+ .parameters = devpts_param_specs,
.kill_sb = devpts_kill_sb,
.fs_flags = FS_USERNS_MOUNT,
};
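This is a textbook conversion from the legacy mount API (match_table_t parsing plus ->mount/->remount_fs) to fs_context. The skeleton such a conversion lands on, reduced to a single hypothetical "demofs" option and with get_tree/reconfigure elided:

	struct demofs_info {
		unsigned int mode;
	};

	static const struct fs_parameter_spec demofs_param_specs[] = {
		fsparam_u32oct("mode", 0),
		{}
	};

	static int demofs_parse_param(struct fs_context *fc,
				      struct fs_parameter *param)
	{
		struct demofs_info *info = fc->s_fs_info;
		struct fs_parse_result result;
		int opt;

		opt = fs_parse(fc, demofs_param_specs, param, &result);
		if (opt < 0)
			return opt;

		info->mode = result.uint_32 & S_IALLUGO;	/* opt 0 == "mode" */
		return 0;
	}

	static void demofs_free_fc(struct fs_context *fc)
	{
		kfree(fc->s_fs_info);
	}

	static const struct fs_context_operations demofs_context_ops = {
		.free		= demofs_free_fc,
		.parse_param	= demofs_parse_param,
		/* .get_tree and .reconfigure elided for brevity */
	};

	static int demofs_init_fs_context(struct fs_context *fc)
	{
		fc->s_fs_info = kzalloc(sizeof(struct demofs_info), GFP_KERNEL);
		if (!fc->s_fs_info)
			return -ENOMEM;
		fc->ops = &demofs_context_ops;
		return 0;
	}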
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index a9819ddb1ab8..51a5c54eb740 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -503,18 +503,24 @@ out_lock:
return rc;
}
-static int ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int rc;
struct dentry *lower_dentry;
struct inode *lower_dir;
rc = lock_parent(dentry, &lower_dentry, &lower_dir);
- if (!rc)
- rc = vfs_mkdir(&nop_mnt_idmap, lower_dir,
- lower_dentry, mode);
- if (rc || d_really_is_negative(lower_dentry))
+ if (rc)
+ goto out;
+
+ lower_dentry = vfs_mkdir(&nop_mnt_idmap, lower_dir,
+ lower_dentry, mode);
+ rc = PTR_ERR(lower_dentry);
+ if (IS_ERR(lower_dentry))
+ goto out;
+ rc = 0;
+ if (d_unhashed(lower_dentry))
goto out;
rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
if (rc)
@@ -526,7 +532,7 @@ out:
inode_unlock(lower_dir);
if (d_really_is_negative(dentry))
d_drop(dentry);
- return rc;
+ return ERR_PTR(rc);
}
static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 0b1c878317ab..e7b7f426fecf 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -172,7 +172,6 @@ const struct super_operations ecryptfs_sops = {
.destroy_inode = ecryptfs_destroy_inode,
.free_inode = ecryptfs_free_inode,
.statfs = ecryptfs_statfs,
- .remount_fs = NULL,
.evict_inode = ecryptfs_evict_inode,
.show_options = ecryptfs_show_options
};
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 6eae8cf655c1..0486e9b68bc6 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -421,7 +421,7 @@ static bool efivarfs_actor(struct dir_context *ctx, const char *name, int len,
if (err)
size = 0;
- inode_lock(inode);
+ inode_lock_nested(inode, I_MUTEX_CHILD);
i_size_write(inode, size);
inode_unlock(inode);
@@ -474,12 +474,25 @@ static int efivarfs_check_missing(efi_char16_t *name16, efi_guid_t vendor,
return err;
}
+static void efivarfs_deactivate_super_work(struct work_struct *work)
+{
+ struct super_block *s = container_of(work, struct super_block,
+ destroy_work);
+ /*
+ * note: here s->destroy_work is free for reuse (which
+ * will happen in deactivate_super)
+ */
+ deactivate_super(s);
+}
+
+static struct file_system_type efivarfs_type;
+
static int efivarfs_pm_notify(struct notifier_block *nb, unsigned long action,
void *ptr)
{
struct efivarfs_fs_info *sfi = container_of(nb, struct efivarfs_fs_info,
pm_nb);
- struct path path = { .mnt = NULL, .dentry = sfi->sb->s_root, };
+ struct path path;
struct efivarfs_ctx ectx = {
.ctx = {
.actor = efivarfs_actor,
@@ -487,6 +500,7 @@ static int efivarfs_pm_notify(struct notifier_block *nb, unsigned long action,
.sb = sfi->sb,
};
struct file *file;
+ struct super_block *s = sfi->sb;
static bool rescan_done = true;
if (action == PM_HIBERNATION_PREPARE) {
@@ -499,11 +513,43 @@ static int efivarfs_pm_notify(struct notifier_block *nb, unsigned long action,
if (rescan_done)
return NOTIFY_DONE;
+ /* ensure single superblock is alive and pin it */
+ if (!atomic_inc_not_zero(&s->s_active))
+ return NOTIFY_DONE;
+
pr_info("efivarfs: resyncing variable state\n");
- /* O_NOATIME is required to prevent oops on NULL mnt */
+ path.dentry = sfi->sb->s_root;
+
+ /*
+ * do not add SB_KERNMOUNT which a single superblock could
+ * expose to userspace and which also causes MNT_INTERNAL, see
+ * below
+ */
+ path.mnt = vfs_kern_mount(&efivarfs_type, 0,
+ efivarfs_type.name, NULL);
+ if (IS_ERR(path.mnt)) {
+ pr_err("efivarfs: internal mount failed\n");
+ /*
+ * We may be the last pinner of the superblock but
+ * calling efivarfs_kill_sb from within the notifier
+ * here would deadlock trying to unregister it
+ */
+ INIT_WORK(&s->destroy_work, efivarfs_deactivate_super_work);
+ schedule_work(&s->destroy_work);
+ return PTR_ERR(path.mnt);
+ }
+
+ /* path.mnt now has pin on superblock, so this must be above one */
+ atomic_dec(&s->s_active);
+
file = kernel_file_open(&path, O_RDONLY | O_DIRECTORY | O_NOATIME,
current_cred());
+ /*
+ * safe even if last put because no MNT_INTERNAL means this
+ * will do delayed deactivate_super and not deadlock
+ */
+ mntput(path.mnt);
if (IS_ERR(file))
return NOTIFY_DONE;
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 76129bfcd663..af42b2c7d235 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -406,14 +406,13 @@ static int do_eventfd(unsigned int count, int flags)
if (fd < 0)
goto err;
- file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags);
+ file = anon_inode_getfile_fmode("[eventfd]", &eventfd_fops,
+ ctx, flags, FMODE_NOWAIT);
if (IS_ERR(file)) {
put_unused_fd(fd);
fd = PTR_ERR(file);
goto err;
}
-
- file->f_mode |= FMODE_NOWAIT;
fd_install(fd, file);
return fd;
err:
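anon_inode_getfile_fmode() folds the post-creation f_mode tweak into file allocation, closing the window where the file existed without FMODE_NOWAIT set. A hedged sketch of the calling convention, with demo names standing in for real fops and context:

	static int demo_create_fd(const struct file_operations *fops, void *ctx,
				  int flags)
	{
		struct file *file;
		int fd = get_unused_fd_flags(flags);

		if (fd < 0)
			return fd;

		file = anon_inode_getfile_fmode("[demo]", fops, ctx, flags,
						FMODE_NOWAIT);
		if (IS_ERR(file)) {
			put_unused_fd(fd);
			return PTR_ERR(file);
		}
		fd_install(fd, file);	/* consumes the file reference */
		return fd;
	}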
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 7c0980db77b3..9b06a0ab9c32 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -438,7 +438,7 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time)
*
* we must do our busy polling with irqs enabled
*/
-static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
+static bool ep_busy_loop(struct eventpoll *ep)
{
unsigned int napi_id = READ_ONCE(ep->napi_id);
u16 budget = READ_ONCE(ep->busy_poll_budget);
@@ -448,7 +448,7 @@ static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
budget = BUSY_POLL_BUDGET;
if (napi_id >= MIN_NAPI_ID && ep_busy_loop_on(ep)) {
- napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end,
+ napi_busy_loop(napi_id, ep_busy_loop_end,
ep, prefer_busy_poll, budget);
if (ep_events_available(ep))
return true;
@@ -560,7 +560,7 @@ static void ep_resume_napi_irqs(struct eventpoll *ep)
#else
-static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
+static inline bool ep_busy_loop(struct eventpoll *ep)
{
return false;
}
@@ -1980,6 +1980,22 @@ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
return ret;
}
+static int ep_try_send_events(struct eventpoll *ep,
+ struct epoll_event __user *events, int maxevents)
+{
+ int res;
+
+ /*
+ * Try to transfer events to user space. In case we get 0 events and
+ * there's still timeout left over, we go trying again in search of
+ * more luck.
+ */
+ res = ep_send_events(ep, events, maxevents);
+ if (res > 0)
+ ep_suspend_napi_irqs(ep);
+ return res;
+}
+
/**
* ep_poll - Retrieves ready events, and delivers them to the caller-supplied
* event buffer.
@@ -2031,23 +2047,15 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
while (1) {
if (eavail) {
- /*
- * Try to transfer events to user space. In case we get
- * 0 events and there's still timeout left over, we go
- * trying again in search of more luck.
- */
- res = ep_send_events(ep, events, maxevents);
- if (res) {
- if (res > 0)
- ep_suspend_napi_irqs(ep);
+ res = ep_try_send_events(ep, events, maxevents);
+ if (res)
return res;
- }
}
if (timed_out)
return 0;
- eavail = ep_busy_loop(ep, timed_out);
+ eavail = ep_busy_loop(ep);
if (eavail)
continue;
@@ -2445,6 +2453,47 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
return do_epoll_ctl(epfd, op, fd, &epds, false);
}
+static int ep_check_params(struct file *file, struct epoll_event __user *evs,
+ int maxevents)
+{
+ /* The maximum number of events must be greater than zero */
+ if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
+ return -EINVAL;
+
+ /* Verify that the area passed by the user is writeable */
+ if (!access_ok(evs, maxevents * sizeof(struct epoll_event)))
+ return -EFAULT;
+
+ /*
+ * We have to check that the file structure underneath the fd
+ * the user passed to us _is_ an eventpoll file.
+ */
+ if (!is_file_epoll(file))
+ return -EINVAL;
+
+ return 0;
+}
+
+int epoll_sendevents(struct file *file, struct epoll_event __user *events,
+ int maxevents)
+{
+ struct eventpoll *ep;
+ int ret;
+
+ ret = ep_check_params(file, events, maxevents);
+ if (unlikely(ret))
+ return ret;
+
+ ep = file->private_data;
+ /*
+ * Racy call, but that's ok - it should get retried based on
+ * poll readiness anyway.
+ */
+ if (ep_events_available(ep))
+ return ep_try_send_events(ep, events, maxevents);
+ return 0;
+}
+
/*
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_wait(2).
@@ -2453,26 +2502,16 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
int maxevents, struct timespec64 *to)
{
struct eventpoll *ep;
-
- /* The maximum number of event must be greater than zero */
- if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
- return -EINVAL;
-
- /* Verify that the area passed by the user is writeable */
- if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
- return -EFAULT;
+ int ret;
/* Get the "struct file *" for the eventpoll file */
CLASS(fd, f)(epfd);
if (fd_empty(f))
return -EBADF;
- /*
- * We have to check that the file structure underneath the fd
- * the user passed to us _is_ an eventpoll file.
- */
- if (!is_file_epoll(fd_file(f)))
- return -EINVAL;
+ ret = ep_check_params(fd_file(f), events, maxevents);
+ if (unlikely(ret))
+ return ret;
/*
* At this point it is safe to assume that the "private_data" contains
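epoll_sendevents() is the new never-blocking entry point exported for in-kernel callers (io_uring's epoll-wait support); 0 means nothing was ready and the caller is expected to retry off poll readiness. A hedged caller sketch:

	static int demo_poll_harvest(int epfd, struct epoll_event __user *evs,
				     int maxevents)
	{
		CLASS(fd, f)(epfd);

		if (fd_empty(f))
			return -EBADF;
		/* >0: events copied out; 0: nothing ready; <0: bad params/file */
		return epoll_sendevents(fd_file(f), evs, maxevents);
	}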
diff --git a/fs/exec.c b/fs/exec.c
index 506cd411f4ac..f45859ad13ac 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -755,8 +755,6 @@ int setup_arg_pages(struct linux_binprm *bprm,
mm->arg_start = bprm->p;
#endif
- if (bprm->loader)
- bprm->loader -= stack_shift;
bprm->exec -= stack_shift;
if (mmap_write_lock_killable(mm))
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index 8b30027d8251..fede0283d6e2 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -840,8 +840,8 @@ unlock:
return err;
}
-static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct super_block *sb = dir->i_sb;
struct inode *inode;
@@ -851,7 +851,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
loff_t size = i_size_read(dir);
if (unlikely(exfat_forced_shutdown(sb)))
- return -EIO;
+ return ERR_PTR(-EIO);
mutex_lock(&EXFAT_SB(sb)->s_lock);
exfat_set_volume_dirty(sb);
@@ -882,7 +882,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
unlock:
mutex_unlock(&EXFAT_SB(sb)->s_lock);
- return err;
+ return ERR_PTR(err);
}
static int exfat_check_dir_empty(struct super_block *sb,
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 0c899cfba578..b5845c4846b8 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -126,10 +126,8 @@ static struct dentry *reconnect_one(struct vfsmount *mnt,
int err;
parent = ERR_PTR(-EACCES);
- inode_lock(dentry->d_inode);
if (mnt->mnt_sb->s_export_op->get_parent)
parent = mnt->mnt_sb->s_export_op->get_parent(dentry);
- inode_unlock(dentry->d_inode);
if (IS_ERR(parent)) {
dprintk("get_parent of %lu failed, err %ld\n",
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 8346ab9534c1..bde617a66cec 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -225,15 +225,16 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
return err;
}
-static int ext2_mkdir(struct mnt_idmap * idmap,
- struct inode * dir, struct dentry * dentry, umode_t mode)
+static struct dentry *ext2_mkdir(struct mnt_idmap * idmap,
+ struct inode * dir, struct dentry * dentry,
+ umode_t mode)
{
struct inode * inode;
int err;
err = dquot_initialize(dir);
if (err)
- return err;
+ return ERR_PTR(err);
inode_inc_link_count(dir);
@@ -258,7 +259,7 @@ static int ext2_mkdir(struct mnt_idmap * idmap,
d_instantiate_new(dentry, inode);
out:
- return err;
+ return ERR_PTR(err);
out_fail:
inode_dec_link_count(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7c54ae5fcbd4..d04d8a7f12e7 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3290,6 +3290,10 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
if (map->m_flags & EXT4_MAP_NEW)
iomap->flags |= IOMAP_F_NEW;
+ /* HW-offload atomics are always used */
+ if (flags & IOMAP_ATOMIC)
+ iomap->flags |= IOMAP_F_ATOMIC_BIO;
+
if (flags & IOMAP_DAX)
iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
else
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 536d56d15072..716cc6096870 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3004,19 +3004,19 @@ out:
return err;
}
-static int ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
handle_t *handle;
struct inode *inode;
int err, err2 = 0, credits, retries = 0;
if (EXT4_DIR_LINK_MAX(dir))
- return -EMLINK;
+ return ERR_PTR(-EMLINK);
err = dquot_initialize(dir);
if (err)
- return err;
+ return ERR_PTR(err);
credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
@@ -3066,7 +3066,7 @@ out_stop:
out_retry:
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
- return err;
+ return ERR_PTR(err);
}
/*
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 69b8a7221a2b..37abee5016c3 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -522,7 +522,7 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
if (io->io_bio)
gfp_flags = GFP_NOWAIT | __GFP_NOWARN;
retry_encrypt:
- bounce_page = fscrypt_encrypt_pagecache_blocks(&folio->page,
+ bounce_page = fscrypt_encrypt_pagecache_blocks(folio,
enc_bytes, 0, gfp_flags);
if (IS_ERR(bounce_page)) {
ret = PTR_ERR(bounce_page);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index de4da6d9cd93..ece5208223c1 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2500,7 +2500,7 @@ int f2fs_encrypt_one_page(struct f2fs_io_info *fio)
return 0;
retry_encrypt:
- fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(page,
+ fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(page_folio(page),
PAGE_SIZE, 0, gfp_flags);
if (IS_ERR(fio->encrypted_page)) {
/* flush pending IOs and wait for a while in the ENOMEM case */
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index a278c7da8177..24dca4dc85a9 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -684,23 +684,23 @@ out_free_encrypted_link:
return err;
}
-static int f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
int err;
if (unlikely(f2fs_cp_error(sbi)))
- return -EIO;
+ return ERR_PTR(-EIO);
err = f2fs_dquot_initialize(dir);
if (err)
- return err;
+ return ERR_PTR(err);
inode = f2fs_new_inode(idmap, dir, S_IFDIR | mode, NULL);
if (IS_ERR(inode))
- return PTR_ERR(inode);
+ return ERR_CAST(inode);
inode->i_op = &f2fs_dir_inode_operations;
inode->i_fop = &f2fs_dir_operations;
@@ -722,12 +722,12 @@ static int f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
f2fs_sync_fs(sbi->sb, 1);
f2fs_balance_fs(sbi, true);
- return 0;
+ return NULL;
out_fail:
clear_inode_flag(inode, FI_INC_LINK);
f2fs_handle_failed_inode(inode);
- return err;
+ return ERR_PTR(err);
}
static int f2fs_rmdir(struct inode *dir, struct dentry *dentry)
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index f06f6ba643cc..23e9b9371ec3 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -339,8 +339,8 @@ out:
}
/***** Make a directory */
-static int msdos_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *msdos_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct super_block *sb = dir->i_sb;
struct fat_slot_info sinfo;
@@ -389,13 +389,13 @@ static int msdos_mkdir(struct mnt_idmap *idmap, struct inode *dir,
mutex_unlock(&MSDOS_SB(sb)->s_lock);
fat_flush_inodes(sb, dir, inode);
- return 0;
+ return NULL;
out_free:
fat_free_clusters(dir, cluster);
out:
mutex_unlock(&MSDOS_SB(sb)->s_lock);
- return err;
+ return ERR_PTR(err);
}
/***** Unlink a file */
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 926c26e90ef8..dd910edd2404 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -841,8 +841,8 @@ out:
return err;
}
-static int vfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *vfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct super_block *sb = dir->i_sb;
struct inode *inode;
@@ -877,13 +877,13 @@ static int vfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
d_instantiate(dentry, inode);
mutex_unlock(&MSDOS_SB(sb)->s_lock);
- return 0;
+ return NULL;
out_free:
fat_free_clusters(dir, cluster);
out:
mutex_unlock(&MSDOS_SB(sb)->s_lock);
- return err;
+ return ERR_PTR(err);
}
static int vfat_get_dotdot_de(struct inode *inode, struct buffer_head **bh,
diff --git a/fs/file.c b/fs/file.c
index d868cdb95d1e..dc3f7e120e3e 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -26,6 +26,28 @@
#include "internal.h"
+bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt)
+{
+ /*
+ * If the reference count was already in the dead zone, then this
+ * put() operation is imbalanced. Warn, put the reference count back to
+ * DEAD and tell the caller to not deconstruct the object.
+ */
+ if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) {
+ atomic_long_set(&ref->refcnt, FILE_REF_DEAD);
+ return false;
+ }
+
+ /*
+ * This is a put() operation on a saturated refcount. Restore the
+ * mean saturation value and tell the caller to not deconstruct the
+ * object.
+ */
+ if (cnt > FILE_REF_MAXREF)
+ atomic_long_set(&ref->refcnt, FILE_REF_SATURATED);
+ return false;
+}
+
/**
* __file_ref_put - Slowpath of file_ref_put()
* @ref: Pointer to the reference count
@@ -67,24 +89,7 @@ bool __file_ref_put(file_ref_t *ref, unsigned long cnt)
return true;
}
- /*
- * If the reference count was already in the dead zone, then this
- * put() operation is imbalanced. Warn, put the reference count back to
- * DEAD and tell the caller to not deconstruct the object.
- */
- if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) {
- atomic_long_set(&ref->refcnt, FILE_REF_DEAD);
- return false;
- }
-
- /*
- * This is a put() operation on a saturated refcount. Restore the
- * mean saturation value and tell the caller to not deconstruct the
- * object.
- */
- if (cnt > FILE_REF_MAXREF)
- atomic_long_set(&ref->refcnt, FILE_REF_SATURATED);
- return false;
+ return __file_ref_put_badval(ref, cnt);
}
EXPORT_SYMBOL_GPL(__file_ref_put);
@@ -418,17 +423,25 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho
old_fds = old_fdt->fd;
new_fds = new_fdt->fd;
+ /*
+ * We may be racing against fd allocation from other threads using this
+ * files_struct, despite holding ->file_lock.
+ *
+ * alloc_fd() might have already claimed a slot, while fd_install()
+ * did not populate it yet. Note the latter operates locklessly, so
+ * the file can show up as we are walking the array below.
+ *
+ * At the same time we know no files will disappear as all other
+ * operations take the lock.
+ *
+ * Instead of trying to placate userspace racing with itself, we
+ * ref the file if we see it and mark the fd slot as unused otherwise.
+ */
for (i = open_files; i != 0; i--) {
- struct file *f = *old_fds++;
+ struct file *f = rcu_dereference_raw(*old_fds++);
if (f) {
get_file(f);
} else {
- /*
- * The fd may be claimed in the fd bitmap but not yet
- * instantiated in the files array if a sibling thread
- * is partway through open(). So make sure that this
- * fd is available to the new process.
- */
__clear_open_fd(open_files - i, new_fdt);
}
rcu_assign_pointer(*new_fds++, f);
@@ -577,6 +590,7 @@ repeat:
__set_open_fd(fd, fdt, flags & O_CLOEXEC);
error = fd;
+ VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
out:
spin_unlock(&files->file_lock);
@@ -612,22 +626,14 @@ void put_unused_fd(unsigned int fd)
EXPORT_SYMBOL(put_unused_fd);
-/*
- * Install a file pointer in the fd array.
- *
- * The VFS is full of places where we drop the files lock between
- * setting the open_fds bitmap and installing the file in the file
- * array. At any such point, we are vulnerable to a dup2() race
- * installing a file in the array before us. We need to detect this and
- * fput() the struct file we are about to overwrite in this case.
- *
- * It should never happen - if we allow dup2() do it, _really_ bad things
- * will follow.
+/**
+ * fd_install - install a file pointer in the fd array
+ * @fd: file descriptor to install the file in
+ * @file: the file to install
*
* This consumes the "file" refcount, so callers should treat it
* as if they had called fput(file).
*/
-
void fd_install(unsigned int fd, struct file *file)
{
struct files_struct *files = current->files;
@@ -642,7 +648,7 @@ void fd_install(unsigned int fd, struct file *file)
rcu_read_unlock_sched();
spin_lock(&files->file_lock);
fdt = files_fdtable(files);
- WARN_ON(fdt->fd[fd] != NULL);
+ VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
rcu_assign_pointer(fdt->fd[fd], file);
spin_unlock(&files->file_lock);
return;
@@ -650,7 +656,7 @@ void fd_install(unsigned int fd, struct file *file)
/* coupled with smp_wmb() in expand_fdtable() */
smp_rmb();
fdt = rcu_dereference_sched(files->fdt);
- BUG_ON(fdt->fd[fd] != NULL);
+ VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
rcu_assign_pointer(fdt->fd[fd], file);
rcu_read_unlock_sched();
}
@@ -679,7 +685,7 @@ struct file *file_close_fd_locked(struct files_struct *files, unsigned fd)
return NULL;
fd = array_index_nospec(fd, fdt->max_fds);
- file = fdt->fd[fd];
+ file = rcu_dereference_raw(fdt->fd[fd]);
if (file) {
rcu_assign_pointer(fdt->fd[fd], NULL);
__put_unused_fd(files, fd);
@@ -1178,8 +1184,23 @@ struct fd fdget_raw(unsigned int fd)
*/
static inline bool file_needs_f_pos_lock(struct file *file)
{
- return (file->f_mode & FMODE_ATOMIC_POS) &&
- (file_count(file) > 1 || file->f_op->iterate_shared);
+ if (!(file->f_mode & FMODE_ATOMIC_POS))
+ return false;
+ if (__file_ref_read_raw(&file->f_ref) != FILE_REF_ONEREF)
+ return true;
+ if (file->f_op->iterate_shared)
+ return true;
+ return false;
+}
+
+bool file_seek_cur_needs_f_lock(struct file *file)
+{
+ if (!(file->f_mode & FMODE_ATOMIC_POS) && !file->f_op->iterate_shared)
+ return false;
+
+ VFS_WARN_ON_ONCE((file_count(file) > 1) &&
+ !mutex_is_locked(&file->f_pos_lock));
+ return true;
}
struct fd fdget_pos(unsigned int fd)
@@ -1187,7 +1208,7 @@ struct fd fdget_pos(unsigned int fd)
struct fd f = fdget(fd);
struct file *file = fd_file(f);
- if (file && file_needs_f_pos_lock(file)) {
+ if (likely(file) && file_needs_f_pos_lock(file)) {
f.word |= FDPUT_POS_UNLOCK;
mutex_lock(&file->f_pos_lock);
}
@@ -1230,14 +1251,34 @@ __releases(&files->file_lock)
struct fdtable *fdt;
/*
- * We need to detect attempts to do dup2() over allocated but still
- * not finished descriptor.
+ * dup2() is expected to close the file installed in the target fd slot
+ * (if any). However, userspace hand-picking a fd may be racing against
+ * its own threads which happened to allocate it in open() et al but did
+ * not populate it yet.
+ *
+ * Broadly speaking we may be racing against the following:
+ * fd = get_unused_fd_flags(); // fd slot reserved, ->fd[fd] == NULL
+ * file = hard_work_goes_here();
+ * fd_install(fd, file); // only now ->fd[fd] == file
+ *
+ * It is an invariant that a successfully allocated fd has a NULL entry
+ * in the array until the matching fd_install().
+ *
+ * If we fit the window, we have the fd to populate, yet no target file
+ * to close. Trying to ignore it and install our new file would violate
+ * the invariant and make fd_install() overwrite our file.
+ *
+ * Things can be done(tm) to handle this. However, the issue does not
+ * concern legitimate programs and we only need to make sure the kernel
+ * does not trip over it.
+ *
+ * The simplest way out is to return an error if we find ourselves here.
*
* POSIX is silent on the issue, we return -EBUSY.
*/
fdt = files_fdtable(files);
fd = array_index_nospec(fd, fdt->max_fds);
- tofree = fdt->fd[fd];
+ tofree = rcu_dereference_raw(fdt->fd[fd]);
if (!tofree && fd_is_open(fd, fdt))
goto Ebusy;
get_file(file);
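Both long comments in this file lean on the same invariant: between get_unused_fd_flags() reserving a slot and fd_install() populating it, ->fd[fd] stays NULL, which is exactly the window dup2() now refuses with -EBUSY. The canonical producer sequence, for reference:

	static int demo_install(struct file *file)
	{
		int fd = get_unused_fd_flags(O_CLOEXEC);

		if (fd < 0)
			return fd;
		/* slot reserved, ->fd[fd] == NULL; dup2() onto it gets -EBUSY */
		fd_install(fd, file);	/* consumes the file reference */
		return fd;
	}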
diff --git a/fs/file_table.c b/fs/file_table.c
index 5c00dc38558d..c04ed94cdc4b 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -221,7 +221,8 @@ struct file *alloc_empty_file(int flags, const struct cred *cred)
/*
* Privileged users can go above max_files
*/
- if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) {
+ if (unlikely(get_nr_files() >= files_stat.max_files) &&
+ !capable(CAP_SYS_ADMIN)) {
/*
* percpu_counters are inaccurate. Do an expensive check before
* we go and fail.
@@ -511,31 +512,37 @@ void flush_delayed_fput(void)
}
EXPORT_SYMBOL_GPL(flush_delayed_fput);
-void fput(struct file *file)
+static void __fput_deferred(struct file *file)
{
- if (file_ref_put(&file->f_ref)) {
- struct task_struct *task = current;
+ struct task_struct *task = current;
+
+ if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
+ file_free(file);
+ return;
+ }
- if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
- file_free(file);
+ if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
+ init_task_work(&file->f_task_work, ____fput);
+ if (!task_work_add(task, &file->f_task_work, TWA_RESUME))
return;
- }
- if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
- init_task_work(&file->f_task_work, ____fput);
- if (!task_work_add(task, &file->f_task_work, TWA_RESUME))
- return;
- /*
- * After this task has run exit_task_work(),
- * task_work_add() will fail. Fall through to delayed
- * fput to avoid leaking *file.
- */
- }
-
- if (llist_add(&file->f_llist, &delayed_fput_list))
- schedule_delayed_work(&delayed_fput_work, 1);
+ /*
+ * After this task has run exit_task_work(),
+ * task_work_add() will fail. Fall through to delayed
+ * fput to avoid leaking *file.
+ */
}
+
+ if (llist_add(&file->f_llist, &delayed_fput_list))
+ schedule_delayed_work(&delayed_fput_work, 1);
}
+void fput(struct file *file)
+{
+ if (unlikely(file_ref_put(&file->f_ref)))
+ __fput_deferred(file);
+}
+EXPORT_SYMBOL(fput);
+
/*
* synchronous analog of fput(); for kernel threads that might be needed
* in some umount() (and thus can't use flush_delayed_fput() without
@@ -549,10 +556,32 @@ void __fput_sync(struct file *file)
if (file_ref_put(&file->f_ref))
__fput(file);
}
-
-EXPORT_SYMBOL(fput);
EXPORT_SYMBOL(__fput_sync);
+/*
+ * Equivalent to __fput_sync(), but optimized for being called with the last
+ * reference.
+ *
+ * See file_ref_put_close() for details.
+ */
+void fput_close_sync(struct file *file)
+{
+ if (likely(file_ref_put_close(&file->f_ref)))
+ __fput(file);
+}
+
+/*
+ * Equivalent to fput(), but optimized for being called with the last
+ * reference.
+ *
+ * See file_ref_put_close() for details.
+ */
+void fput_close(struct file *file)
+{
+ if (file_ref_put_close(&file->f_ref))
+ __fput_deferred(file);
+}
+
void __init files_init(void)
{
struct kmem_cache_args args = {
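
With the final-put work factored into __fput_deferred(), callers that almost
certainly hold the last reference can use fput_close()/fput_close_sync() and
take the file_ref_put_close() fast path. Roughly how filp_close() can use it
(a sketch, not the verbatim implementation; filp_flush() is the existing
flush helper in fs/open.c):

    int filp_close(struct file *filp, fl_owner_t id)
    {
            int retval;

            retval = filp_flush(filp, id);
            fput_close(filp);       /* close(2) usually drops the last ref */

            return retval;
    }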
diff --git a/fs/fsopen.c b/fs/fsopen.c
index 094a7f510edf..1aaf4cb2afb2 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -453,7 +453,7 @@ SYSCALL_DEFINE5(fsconfig,
case FSCONFIG_SET_FD:
param.type = fs_value_is_file;
ret = -EBADF;
- param.file = fget(aux);
+ param.file = fget_raw(aux);
if (!param.file)
goto out_key;
param.dirfd = aux;
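
fget_raw() differs from fget() only in also accepting O_PATH descriptors, so
FSCONFIG_SET_FD no longer rejects them with -EBADF. Illustrative userspace
usage via raw syscalls (the filesystem name and key are placeholders; error
handling elided):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <linux/mount.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
            int fsfd = syscall(SYS_fsopen, "somefs", 0);
            int dir = open("/tmp", O_PATH | O_DIRECTORY);

            /* dir is an O_PATH fd: accepted after this change */
            syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_FD, "source", NULL, dir);
            return 0;
    }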
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 2c3a4d09e500..51e31df4c546 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -77,7 +77,7 @@ void fuse_set_initialized(struct fuse_conn *fc)
static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background)
{
return !fc->initialized || (for_background && fc->blocked) ||
- (fc->io_uring && !fuse_uring_ready(fc));
+ (fc->io_uring && fc->connected && !fuse_uring_ready(fc));
}
static void fuse_drop_waiting(struct fuse_conn *fc)
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index ebd2931b4f2a..82bf458fa9db 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -208,11 +208,11 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
init_waitqueue_head(&ring->stop_waitq);
- fc->ring = ring;
ring->nr_queues = nr_queues;
ring->fc = fc;
ring->max_payload_sz = max_payload_size;
atomic_set(&ring->queue_refs, 0);
+ smp_store_release(&fc->ring, ring);
spin_unlock(&fc->lock);
return ring;
@@ -1041,7 +1041,7 @@ static int fuse_uring_register(struct io_uring_cmd *cmd,
unsigned int issue_flags, struct fuse_conn *fc)
{
const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe);
- struct fuse_ring *ring = fc->ring;
+ struct fuse_ring *ring = smp_load_acquire(&fc->ring);
struct fuse_ring_queue *queue;
struct fuse_ring_ent *ent;
int err;
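
The pairing matters: every store initializing the ring must be visible before
the fc->ring pointer is published, and the registration path must order its
reads after the pointer load. The same publication idiom in portable C11,
outside any fuse context:

    #include <stdatomic.h>
    #include <stdlib.h>

    struct ring { int nr_queues; };

    static _Atomic(struct ring *) ring_ptr;

    static void publish(void)
    {
            struct ring *r = calloc(1, sizeof(*r));

            r->nr_queues = 4;       /* initialize before publication */
            atomic_store_explicit(&ring_ptr, r, memory_order_release);
    }

    static struct ring *lookup(void)
    {
            /* pairs with the release store: seeing the pointer implies
             * seeing nr_queues == 4 */
            return atomic_load_explicit(&ring_ptr, memory_order_acquire);
    }

    int main(void)
    {
            publish();
            return lookup()->nr_queues == 4 ? 0 : 1;
    }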
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 3805f9b06c9d..fa8f1141ea74 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -781,9 +781,9 @@ no_open:
/*
* Code shared between mknod, mkdir, symlink and link
*/
-static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm,
- struct fuse_args *args, struct inode *dir,
- struct dentry *entry, umode_t mode)
+static struct dentry *create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm,
+ struct fuse_args *args, struct inode *dir,
+ struct dentry *entry, umode_t mode)
{
struct fuse_entry_out outarg;
struct inode *inode;
@@ -792,11 +792,11 @@ static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm,
struct fuse_forget_link *forget;
if (fuse_is_bad(dir))
- return -EIO;
+ return ERR_PTR(-EIO);
forget = fuse_alloc_forget();
if (!forget)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
memset(&outarg, 0, sizeof(outarg));
args->nodeid = get_node_id(dir);
@@ -826,29 +826,43 @@ static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm,
&outarg.attr, ATTR_TIMEOUT(&outarg), 0, 0);
if (!inode) {
fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1);
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
}
kfree(forget);
d_drop(entry);
d = d_splice_alias(inode, entry);
if (IS_ERR(d))
- return PTR_ERR(d);
+ return d;
- if (d) {
+ if (d)
fuse_change_entry_timeout(d, &outarg);
- dput(d);
- } else {
+ else
fuse_change_entry_timeout(entry, &outarg);
- }
fuse_dir_changed(dir);
- return 0;
+ return d;
out_put_forget_req:
if (err == -EEXIST)
fuse_invalidate_entry(entry);
kfree(forget);
- return err;
+ return ERR_PTR(err);
+}
+
+static int create_new_nondir(struct mnt_idmap *idmap, struct fuse_mount *fm,
+ struct fuse_args *args, struct inode *dir,
+ struct dentry *entry, umode_t mode)
+{
+ /*
+ * Note that when creating anything other than a directory we
+ * can be sure create_new_entry() will NOT return an alternate
+ * dentry as d_splice_alias() only returns an alternate dentry
+ * for directories. So we don't need to check for that case
+ * when passing back the result.
+ */
+ WARN_ON_ONCE(S_ISDIR(mode));
+
+ return PTR_ERR(create_new_entry(idmap, fm, args, dir, entry, mode));
}
static int fuse_mknod(struct mnt_idmap *idmap, struct inode *dir,
@@ -871,7 +885,7 @@ static int fuse_mknod(struct mnt_idmap *idmap, struct inode *dir,
args.in_args[0].value = &inarg;
args.in_args[1].size = entry->d_name.len + 1;
args.in_args[1].value = entry->d_name.name;
- return create_new_entry(idmap, fm, &args, dir, entry, mode);
+ return create_new_nondir(idmap, fm, &args, dir, entry, mode);
}
static int fuse_create(struct mnt_idmap *idmap, struct inode *dir,
@@ -898,8 +912,8 @@ static int fuse_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
return err;
}
-static int fuse_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *entry, umode_t mode)
+static struct dentry *fuse_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *entry, umode_t mode)
{
struct fuse_mkdir_in inarg;
struct fuse_mount *fm = get_fuse_mount(dir);
@@ -934,7 +948,7 @@ static int fuse_symlink(struct mnt_idmap *idmap, struct inode *dir,
args.in_args[1].value = entry->d_name.name;
args.in_args[2].size = len;
args.in_args[2].value = link;
- return create_new_entry(idmap, fm, &args, dir, entry, S_IFLNK);
+ return create_new_nondir(idmap, fm, &args, dir, entry, S_IFLNK);
}
void fuse_flush_time_update(struct inode *inode)
@@ -1131,7 +1145,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
args.in_args[0].value = &inarg;
args.in_args[1].size = newent->d_name.len + 1;
args.in_args[1].value = newent->d_name.name;
- err = create_new_entry(&invalid_mnt_idmap, fm, &args, newdir, newent, inode->i_mode);
+ err = create_new_nondir(&invalid_mnt_idmap, fm, &args, newdir, newent, inode->i_mode);
if (!err)
fuse_update_ctime_in_cache(inode);
else if (err == -EINTR)
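
This is part of a tree-wide conversion of ->mkdir (and callers such as
vfs_mkdir()) from returning an int to returning a struct dentry *, so that
filesystems can hand back the alternate dentry produced by d_splice_alias().
A hedged sketch of the new contract for a hypothetical filesystem:

    static struct dentry *myfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
                                     struct dentry *dentry, umode_t mode)
    {
            struct inode *inode = myfs_new_dir_inode(dir, mode); /* made up */

            if (IS_ERR(inode))
                    return ERR_CAST(inode);
            d_instantiate(dentry, inode);
            /* NULL: @dentry was used as-is; non-NULL: alternate dentry;
             * ERR_PTR(): failure */
            return NULL;
    }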
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 1795c4e8dbf6..366516b98b3f 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1300,7 +1300,8 @@ static int gfs2_block_zero_range(struct inode *inode, loff_t from,
unsigned int length)
{
BUG_ON(current->journal_info);
- return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops);
+ return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops,
+ NULL);
}
#define GFS2_JTRUNC_REVOKES 8192
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 6fbbaaad1cd0..198a8cbaf5e5 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1248,14 +1248,15 @@ static int gfs2_symlink(struct mnt_idmap *idmap, struct inode *dir,
* @dentry: The dentry of the new directory
* @mode: The mode of the new directory
*
- * Returns: errno
+ * Returns: the dentry, or ERR_PTR(errno)
*/
-static int gfs2_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *gfs2_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
unsigned dsize = gfs2_max_stuffed_size(GFS2_I(dir));
- return gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0);
+
+ return ERR_PTR(gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0));
}
/**
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index b75c26045df4..86a6b317b474 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -219,26 +219,26 @@ static int hfs_create(struct mnt_idmap *idmap, struct inode *dir,
* in a directory, given the inode for the parent directory and the
* name (and its length) of the new directory.
*/
-static int hfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *hfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
int res;
inode = hfs_new_inode(dir, &dentry->d_name, S_IFDIR | mode);
if (!inode)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode);
if (res) {
clear_nlink(inode);
hfs_delete_inode(inode);
iput(inode);
- return res;
+ return ERR_PTR(res);
}
d_instantiate(dentry, inode);
mark_inode_dirty(inode);
- return 0;
+ return NULL;
}
/*
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index f5c4b3e31a1c..876bbb80fb4d 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -523,10 +523,10 @@ static int hfsplus_create(struct mnt_idmap *idmap, struct inode *dir,
return hfsplus_mknod(&nop_mnt_idmap, dir, dentry, mode, 0);
}
-static int hfsplus_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *hfsplus_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
- return hfsplus_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0);
+ return ERR_PTR(hfsplus_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0));
}
static int hfsplus_rename(struct mnt_idmap *idmap,
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index e0741e468956..a2c6b9051c5b 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -679,17 +679,25 @@ static int hostfs_symlink(struct mnt_idmap *idmap, struct inode *ino,
return err;
}
-static int hostfs_mkdir(struct mnt_idmap *idmap, struct inode *ino,
- struct dentry *dentry, umode_t mode)
+static struct dentry *hostfs_mkdir(struct mnt_idmap *idmap, struct inode *ino,
+ struct dentry *dentry, umode_t mode)
{
+ struct inode *inode;
char *file;
int err;
if ((file = dentry_name(dentry)) == NULL)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
err = do_mkdir(file, mode);
+ if (err) {
+ dentry = ERR_PTR(err);
+ } else {
+ inode = hostfs_iget(dentry->d_sb, file);
+ d_drop(dentry);
+ dentry = d_splice_alias(inode, dentry);
+ }
__putname(file);
- return err;
+ return dentry;
}
static int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index d0edf9ed33b6..e3cdc421dfba 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -19,8 +19,8 @@ static void hpfs_update_directory_times(struct inode *dir)
hpfs_write_inode_nolock(dir);
}
-static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
const unsigned char *name = dentry->d_name.name;
unsigned len = dentry->d_name.len;
@@ -35,7 +35,7 @@ static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
int r;
struct hpfs_dirent dee;
int err;
- if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
+ if ((err = hpfs_chk_name(name, &len))) return ERR_PTR(err==-ENOENT ? -EINVAL : err);
hpfs_lock(dir->i_sb);
err = -ENOSPC;
fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
@@ -112,7 +112,7 @@ static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
hpfs_update_directory_times(dir);
d_instantiate(dentry, result);
hpfs_unlock(dir->i_sb);
- return 0;
+ return NULL;
bail3:
iput(result);
bail2:
@@ -123,7 +123,7 @@ bail1:
hpfs_free_sectors(dir->i_sb, fno, 1);
bail:
hpfs_unlock(dir->i_sb);
- return err;
+ return ERR_PTR(err);
}
static int hpfs_create(struct mnt_idmap *idmap, struct inode *dir,
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 0fc179a59830..d98caedbb723 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -991,14 +991,14 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
return 0;
}
-static int hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int retval = hugetlbfs_mknod(idmap, dir, dentry,
mode | S_IFDIR, 0);
if (!retval)
inc_nlink(dir);
- return retval;
+ return ERR_PTR(retval);
}
static int hugetlbfs_create(struct mnt_idmap *idmap,
diff --git a/fs/init.c b/fs/init.c
index e9387b6c4f30..eef5124885e3 100644
--- a/fs/init.c
+++ b/fs/init.c
@@ -230,9 +230,12 @@ int __init init_mkdir(const char *pathname, umode_t mode)
return PTR_ERR(dentry);
mode = mode_strip_umask(d_inode(path.dentry), mode);
error = security_path_mkdir(&path, dentry, mode);
- if (!error)
- error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
+ if (!error) {
+ dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
dentry, mode);
+ if (IS_ERR(dentry))
+ error = PTR_ERR(dentry);
+ }
done_path_create(&path, dentry);
return error;
}
diff --git a/fs/inode.c b/fs/inode.c
index 5587aabdaa5e..99318b157a9a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -327,7 +327,17 @@ static void i_callback(struct rcu_head *head)
free_inode_nonrcu(inode);
}
-static struct inode *alloc_inode(struct super_block *sb)
+/**
+ * alloc_inode - obtain an inode
+ * @sb: superblock
+ *
+ * Allocates a new inode for the given superblock.
+ * The inode won't be chained into the superblock's s_inodes list.
+ * This means:
+ * - the fs can't be unmounted
+ * - quotas, fsnotify and writeback can't work
+ */
+struct inode *alloc_inode(struct super_block *sb)
{
const struct super_operations *ops = sb->s_op;
struct inode *inode;
@@ -613,18 +623,22 @@ static void inode_wait_for_lru_isolating(struct inode *inode)
*/
void inode_sb_list_add(struct inode *inode)
{
- spin_lock(&inode->i_sb->s_inode_list_lock);
- list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
- spin_unlock(&inode->i_sb->s_inode_list_lock);
+ struct super_block *sb = inode->i_sb;
+
+ spin_lock(&sb->s_inode_list_lock);
+ list_add(&inode->i_sb_list, &sb->s_inodes);
+ spin_unlock(&sb->s_inode_list_lock);
}
EXPORT_SYMBOL_GPL(inode_sb_list_add);
static inline void inode_sb_list_del(struct inode *inode)
{
+ struct super_block *sb = inode->i_sb;
+
if (!list_empty(&inode->i_sb_list)) {
- spin_lock(&inode->i_sb->s_inode_list_lock);
+ spin_lock(&sb->s_inode_list_lock);
list_del_init(&inode->i_sb_list);
- spin_unlock(&inode->i_sb->s_inode_list_lock);
+ spin_unlock(&sb->s_inode_list_lock);
}
}
@@ -806,23 +820,16 @@ static void evict(struct inode *inode)
/*
* Wake up waiters in __wait_on_freeing_inode().
*
- * Lockless hash lookup may end up finding the inode before we removed
- * it above, but only lock it *after* we are done with the wakeup below.
- * In this case the potential waiter cannot safely block.
+ * It is an invariant that any thread we need to wake up is already
+ * accounted for before remove_inode_hash() acquires ->i_lock -- both
+ * sides take the lock and sleep is aborted if the inode is found
+ * unhashed. Thus either the sleeper wins and goes off CPU, or removal
+ * wins and the sleeper aborts after testing with the lock.
*
- * The inode being unhashed after the call to remove_inode_hash() is
- * used as an indicator whether blocking on it is safe.
+ * This also means we don't need any fences for the call below.
*/
- spin_lock(&inode->i_lock);
- /*
- * Pairs with the barrier in prepare_to_wait_event() to make sure
- * ___wait_var_event() either sees the bit cleared or
- * waitqueue_active() check in wake_up_var() sees the waiter.
- */
- smp_mb__after_spinlock();
inode_wake_up_bit(inode, __I_NEW);
BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
- spin_unlock(&inode->i_lock);
destroy_inode(inode);
}
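
In other words, the removed barrier is replaced by mutual exclusion: both
sides serialize on ->i_lock and the sleeper re-checks the hash state under
it. A simplified sketch of the waiter's side (illustrative, not the literal
__wait_on_freeing_inode()):

    /* waiter side, sketch: */
    spin_lock(&inode->i_lock);
    if (inode_unhashed(inode)) {
            /* remove_inode_hash() already ran: the wakeup may already
             * have happened, so sleeping would be unsafe -- abort */
            spin_unlock(&inode->i_lock);
            return;
    }
    /* still hashed: eviction cannot have issued the wakeup yet, since
     * removal must take ->i_lock first, so it is safe to go to sleep */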
@@ -900,46 +907,6 @@ again:
}
EXPORT_SYMBOL_GPL(evict_inodes);
-/**
- * invalidate_inodes - attempt to free all inodes on a superblock
- * @sb: superblock to operate on
- *
- * Attempts to free all inodes (including dirty inodes) for a given superblock.
- */
-void invalidate_inodes(struct super_block *sb)
-{
- struct inode *inode, *next;
- LIST_HEAD(dispose);
-
-again:
- spin_lock(&sb->s_inode_list_lock);
- list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
- spin_lock(&inode->i_lock);
- if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
- spin_unlock(&inode->i_lock);
- continue;
- }
- if (atomic_read(&inode->i_count)) {
- spin_unlock(&inode->i_lock);
- continue;
- }
-
- inode->i_state |= I_FREEING;
- inode_lru_list_del(inode);
- spin_unlock(&inode->i_lock);
- list_add(&inode->i_lru, &dispose);
- if (need_resched()) {
- spin_unlock(&sb->s_inode_list_lock);
- cond_resched();
- dispose_list(&dispose);
- goto again;
- }
- }
- spin_unlock(&sb->s_inode_list_lock);
-
- dispose_list(&dispose);
-}
-
/*
* Isolate the inode from the LRU in preparation for freeing it.
*
@@ -1160,21 +1127,6 @@ unsigned int get_next_ino(void)
EXPORT_SYMBOL(get_next_ino);
/**
- * new_inode_pseudo - obtain an inode
- * @sb: superblock
- *
- * Allocates a new inode for given superblock.
- * Inode wont be chained in superblock s_inodes list
- * This means :
- * - fs can't be unmount
- * - quotas, fsnotify, writeback can't work
- */
-struct inode *new_inode_pseudo(struct super_block *sb)
-{
- return alloc_inode(sb);
-}
-
-/**
* new_inode - obtain an inode
* @sb: superblock
*
@@ -1190,7 +1142,7 @@ struct inode *new_inode(struct super_block *sb)
{
struct inode *inode;
- inode = new_inode_pseudo(sb);
+ inode = alloc_inode(sb);
if (inode)
inode_sb_list_add(inode);
return inode;
@@ -1348,8 +1300,8 @@ again:
}
if (set && unlikely(set(inode, data))) {
- inode = NULL;
- goto unlock;
+ spin_unlock(&inode_hash_lock);
+ return NULL;
}
/*
@@ -1361,14 +1313,14 @@ again:
hlist_add_head_rcu(&inode->i_hash, head);
spin_unlock(&inode->i_lock);
+ spin_unlock(&inode_hash_lock);
+
/*
* Add inode to the sb list if it's not already. It has I_NEW at this
* point, so it should be safe to test i_sb_list locklessly.
*/
if (list_empty(&inode->i_sb_list))
inode_sb_list_add(inode);
-unlock:
- spin_unlock(&inode_hash_lock);
return inode;
}
@@ -1497,8 +1449,8 @@ again:
inode->i_state = I_NEW;
hlist_add_head_rcu(&inode->i_hash, head);
spin_unlock(&inode->i_lock);
- inode_sb_list_add(inode);
spin_unlock(&inode_hash_lock);
+ inode_sb_list_add(inode);
/* Return the locked inode with I_NEW set, the
* caller is responsible for filling in the contents
@@ -2953,3 +2905,18 @@ umode_t mode_strip_sgid(struct mnt_idmap *idmap,
return mode & ~S_ISGID;
}
EXPORT_SYMBOL(mode_strip_sgid);
+
+#ifdef CONFIG_DEBUG_VFS
+/*
+ * Dump an inode.
+ *
+ * TODO: add a proper inode dumping routine, this is a stub to get debug off the
+ * ground.
+ */
+void dump_inode(struct inode *inode, const char *reason)
+{
+ pr_warn("%s encountered for inode %px\n", reason, inode);
+}
+
+EXPORT_SYMBOL(dump_inode);
+#endif
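
Even as a stub, this gives VFS debug checks a single reporting funnel. A
hypothetical call site (the condition is made up for illustration;
dump_inode() only exists with CONFIG_DEBUG_VFS):

    if (IS_ENABLED(CONFIG_DEBUG_VFS) &&
        unlikely(inode->i_state & I_FREEING))
            dump_inode(inode, "unexpected I_FREEING inode");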
diff --git a/fs/internal.h b/fs/internal.h
index e7f02ae1e098..b9b3e29a73fd 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -118,6 +118,9 @@ static inline void put_file_access(struct file *file)
}
}
+void fput_close_sync(struct file *);
+void fput_close(struct file *);
+
/*
* super.c
*/
@@ -187,8 +190,8 @@ extern struct open_how build_open_how(int flags, umode_t mode);
extern int build_open_flags(const struct open_how *how, struct open_flags *op);
struct file *file_close_fd_locked(struct files_struct *files, unsigned fd);
-long do_ftruncate(struct file *file, loff_t length, int small);
-long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
+int do_ftruncate(struct file *file, loff_t length, int small);
+int do_sys_ftruncate(unsigned int fd, loff_t length, int small);
int chmod_common(const struct path *path, umode_t mode);
int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
int flag);
@@ -207,7 +210,6 @@ bool in_group_or_capable(struct mnt_idmap *idmap,
* fs-writeback.c
*/
extern long get_nr_dirty_inodes(void);
-void invalidate_inodes(struct super_block *sb);
/*
* dcache.c
@@ -325,6 +327,7 @@ struct stashed_operations {
int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
struct path *path);
void stashed_dentry_prune(struct dentry *dentry);
+struct dentry *stashed_dentry_get(struct dentry **stashed);
/**
* path_mounted - check whether path is mounted
* @path: path to check
@@ -338,3 +341,5 @@ static inline bool path_mounted(const struct path *path)
return path->mnt->mnt_root == path->dentry;
}
void file_f_owner_release(struct file *file);
+bool file_seek_cur_needs_f_lock(struct file *file);
+int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 638a36be31c1..c91fd2b46a77 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -41,7 +41,7 @@
*
* Returns 0 on success, -errno on error.
*/
-long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
int error = -ENOTTY;
@@ -228,8 +228,8 @@ static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap)
return error;
}
-static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
- u64 off, u64 olen, u64 destoff)
+static int ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
+ u64 off, u64 olen, u64 destoff)
{
CLASS(fd, src_file)(srcfd);
loff_t cloned;
@@ -248,8 +248,8 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
return ret;
}
-static long ioctl_file_clone_range(struct file *file,
- struct file_clone_range __user *argp)
+static int ioctl_file_clone_range(struct file *file,
+ struct file_clone_range __user *argp)
{
struct file_clone_range args;
diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile
index 381d76c5c232..69e8ebb41302 100644
--- a/fs/iomap/Makefile
+++ b/fs/iomap/Makefile
@@ -12,6 +12,7 @@ iomap-y += trace.o \
iter.o
iomap-$(CONFIG_BLOCK) += buffered-io.o \
direct-io.o \
+ ioend.o \
fiemap.o \
seek.o
iomap-$(CONFIG_SWAP) += swapfile.o
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index d303e6c8900c..814b7f679486 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -12,17 +12,15 @@
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/writeback.h>
-#include <linux/list_sort.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/sched/signal.h>
#include <linux/migrate.h>
+#include "internal.h"
#include "trace.h"
#include "../internal.h"
-#define IOEND_BATCH_SIZE 4096
-
/*
* Structure allocated for each folio to track per-block uptodate, dirty state
* and I/O completions.
@@ -40,8 +38,6 @@ struct iomap_folio_state {
unsigned long state[];
};
-static struct bio_set iomap_ioend_bioset;
-
static inline bool ifs_is_fully_uptodate(struct folio *folio,
struct iomap_folio_state *ifs)
{
@@ -366,20 +362,24 @@ static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
pos >= i_size_read(iter->inode);
}
-static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
- struct iomap_readpage_ctx *ctx, loff_t offset)
+static int iomap_readpage_iter(struct iomap_iter *iter,
+ struct iomap_readpage_ctx *ctx)
{
const struct iomap *iomap = &iter->iomap;
- loff_t pos = iter->pos + offset;
- loff_t length = iomap_length(iter) - offset;
+ loff_t pos = iter->pos;
+ loff_t length = iomap_length(iter);
struct folio *folio = ctx->cur_folio;
struct iomap_folio_state *ifs;
- loff_t orig_pos = pos;
size_t poff, plen;
sector_t sector;
+ int ret;
- if (iomap->type == IOMAP_INLINE)
- return iomap_read_inline_data(iter, folio);
+ if (iomap->type == IOMAP_INLINE) {
+ ret = iomap_read_inline_data(iter, folio);
+ if (ret)
+ return ret;
+ return iomap_iter_advance(iter, &length);
+ }
/* zero post-eof blocks as the page may be mapped */
ifs = ifs_alloc(iter->inode, folio, iter->flags);
@@ -438,25 +438,22 @@ done:
* we can skip trailing ones as they will be handled in the next
* iteration.
*/
- return pos - orig_pos + plen;
+ length = pos - iter->pos + plen;
+ return iomap_iter_advance(iter, &length);
}
-static loff_t iomap_read_folio_iter(const struct iomap_iter *iter,
+static int iomap_read_folio_iter(struct iomap_iter *iter,
struct iomap_readpage_ctx *ctx)
{
- struct folio *folio = ctx->cur_folio;
- size_t offset = offset_in_folio(folio, iter->pos);
- loff_t length = min_t(loff_t, folio_size(folio) - offset,
- iomap_length(iter));
- loff_t done, ret;
-
- for (done = 0; done < length; done += ret) {
- ret = iomap_readpage_iter(iter, ctx, done);
- if (ret <= 0)
+ int ret;
+
+ while (iomap_length(iter)) {
+ ret = iomap_readpage_iter(iter, ctx);
+ if (ret)
return ret;
}
- return done;
+ return 0;
}
int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
@@ -474,7 +471,7 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
trace_iomap_readpage(iter.inode, 1);
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_read_folio_iter(&iter, &ctx);
+ iter.status = iomap_read_folio_iter(&iter, &ctx);
if (ctx.bio) {
submit_bio(ctx.bio);
@@ -493,15 +490,14 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
}
EXPORT_SYMBOL_GPL(iomap_read_folio);
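
This is the shape every helper in this series converts to: instead of
returning a byte count for the core loop to stuff into iter.processed, the
helper advances the iterator itself and returns 0 or a negative errno through
iter.status. A hedged sketch of the new idiom:

    /* hypothetical per-extent helper in the new style: */
    static int myfs_op_iter(struct iomap_iter *iter)
    {
            u64 bytes = iomap_length(iter);

            /* ... operate on [iter->pos, iter->pos + bytes) ... */

            return iomap_iter_advance(iter, &bytes);
    }

    /* and the corresponding core loop: */
    while ((ret = iomap_iter(&iter, ops)) > 0)
            iter.status = myfs_op_iter(&iter);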
-static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
+static int iomap_readahead_iter(struct iomap_iter *iter,
struct iomap_readpage_ctx *ctx)
{
- loff_t length = iomap_length(iter);
- loff_t done, ret;
+ int ret;
- for (done = 0; done < length; done += ret) {
+ while (iomap_length(iter)) {
if (ctx->cur_folio &&
- offset_in_folio(ctx->cur_folio, iter->pos + done) == 0) {
+ offset_in_folio(ctx->cur_folio, iter->pos) == 0) {
if (!ctx->cur_folio_in_bio)
folio_unlock(ctx->cur_folio);
ctx->cur_folio = NULL;
@@ -510,12 +506,12 @@ static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
ctx->cur_folio = readahead_folio(ctx->rac);
ctx->cur_folio_in_bio = false;
}
- ret = iomap_readpage_iter(iter, ctx, done);
- if (ret <= 0)
+ ret = iomap_readpage_iter(iter, ctx);
+ if (ret)
return ret;
}
- return done;
+ return 0;
}
/**
@@ -547,7 +543,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
trace_iomap_readahead(rac->mapping->host, readahead_count(rac));
while (iomap_iter(&iter, ops) > 0)
- iter.processed = iomap_readahead_iter(&iter, &ctx);
+ iter.status = iomap_readahead_iter(&iter, &ctx);
if (ctx.bio)
submit_bio(ctx.bio);
@@ -603,6 +599,8 @@ struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
if (iter->flags & IOMAP_NOWAIT)
fgp |= FGP_NOWAIT;
+ if (iter->flags & IOMAP_DONTCACHE)
+ fgp |= FGP_DONTCACHE;
fgp |= fgf_set_order(len);
return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
@@ -907,12 +905,10 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
return __iomap_write_end(iter->inode, pos, len, copied, folio);
}
-static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
+static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
{
- loff_t length = iomap_length(iter);
- loff_t pos = iter->pos;
ssize_t total_written = 0;
- long status = 0;
+ int status = 0;
struct address_space *mapping = iter->inode->i_mapping;
size_t chunk = mapping_max_folio_size(mapping);
unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
@@ -923,7 +919,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
size_t offset; /* Offset into folio */
size_t bytes; /* Bytes to write to folio */
size_t copied; /* Bytes copied from user */
- size_t written; /* Bytes have been written */
+ u64 written; /* Bytes have been written */
+ loff_t pos = iter->pos;
bytes = iov_iter_count(i);
retry:
@@ -934,8 +931,8 @@ retry:
if (unlikely(status))
break;
- if (bytes > length)
- bytes = length;
+ if (bytes > iomap_length(iter))
+ bytes = iomap_length(iter);
/*
* Bring in the user page that we'll copy from _first_.
@@ -1006,17 +1003,12 @@ retry:
goto retry;
}
} else {
- pos += written;
total_written += written;
- length -= written;
+ iomap_iter_advance(iter, &written);
}
- } while (iov_iter_count(i) && length);
+ } while (iov_iter_count(i) && iomap_length(iter));
- if (status == -EAGAIN) {
- iov_iter_revert(i, total_written);
- return -EAGAIN;
- }
- return total_written ? total_written : status;
+ return total_written ? 0 : status;
}
ssize_t
@@ -1034,9 +1026,11 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
if (iocb->ki_flags & IOCB_NOWAIT)
iter.flags |= IOMAP_NOWAIT;
+ if (iocb->ki_flags & IOCB_DONTCACHE)
+ iter.flags |= IOMAP_DONTCACHE;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_write_iter(&iter, i);
+ iter.status = iomap_write_iter(&iter, i);
if (unlikely(iter.pos == iocb->ki_pos))
return ret;
@@ -1270,23 +1264,22 @@ void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
}
EXPORT_SYMBOL_GPL(iomap_write_delalloc_release);
-static loff_t iomap_unshare_iter(struct iomap_iter *iter)
+static int iomap_unshare_iter(struct iomap_iter *iter)
{
struct iomap *iomap = &iter->iomap;
- loff_t pos = iter->pos;
- loff_t length = iomap_length(iter);
- loff_t written = 0;
+ u64 bytes = iomap_length(iter);
+ int status;
if (!iomap_want_unshare_iter(iter))
- return length;
+ return iomap_iter_advance(iter, &bytes);
do {
struct folio *folio;
- int status;
size_t offset;
- size_t bytes = min_t(u64, SIZE_MAX, length);
+ loff_t pos = iter->pos;
bool ret;
+ bytes = min_t(u64, SIZE_MAX, bytes);
status = iomap_write_begin(iter, pos, bytes, &folio);
if (unlikely(status))
return status;
@@ -1304,14 +1297,14 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
cond_resched();
- pos += bytes;
- written += bytes;
- length -= bytes;
-
balance_dirty_pages_ratelimited(iter->inode->i_mapping);
- } while (length > 0);
- return written;
+ status = iomap_iter_advance(iter, &bytes);
+ if (status)
+ break;
+ } while (bytes > 0);
+
+ return status;
}
int
@@ -1331,7 +1324,7 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
iter.len = min(len, size - pos);
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_unshare_iter(&iter);
+ iter.status = iomap_unshare_iter(&iter);
return ret;
}
EXPORT_SYMBOL_GPL(iomap_file_unshare);
@@ -1350,19 +1343,18 @@ static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i)
return filemap_write_and_wait_range(mapping, i->pos, end);
}
-static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
+static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
{
- loff_t pos = iter->pos;
- loff_t length = iomap_length(iter);
- loff_t written = 0;
+ u64 bytes = iomap_length(iter);
+ int status;
do {
struct folio *folio;
- int status;
size_t offset;
- size_t bytes = min_t(u64, SIZE_MAX, length);
+ loff_t pos = iter->pos;
bool ret;
+ bytes = min_t(u64, SIZE_MAX, bytes);
status = iomap_write_begin(iter, pos, bytes, &folio);
if (status)
return status;
@@ -1383,25 +1375,26 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
if (WARN_ON_ONCE(!ret))
return -EIO;
- pos += bytes;
- length -= bytes;
- written += bytes;
- } while (length > 0);
+ status = iomap_iter_advance(iter, &bytes);
+ if (status)
+ break;
+ } while (bytes > 0);
if (did_zero)
*did_zero = true;
- return written;
+ return status;
}
int
iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
- const struct iomap_ops *ops)
+ const struct iomap_ops *ops, void *private)
{
struct iomap_iter iter = {
.inode = inode,
.pos = pos,
.len = len,
.flags = IOMAP_ZERO,
+ .private = private,
};
struct address_space *mapping = inode->i_mapping;
unsigned int blocksize = i_blocksize(inode);
@@ -1424,7 +1417,7 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) {
iter.len = plen;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_zero_iter(&iter, did_zero);
+ iter.status = iomap_zero_iter(&iter, did_zero);
iter.len = len - (iter.pos - pos);
if (ret || !iter.len)
@@ -1443,17 +1436,19 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
if (srcmap->type == IOMAP_HOLE ||
srcmap->type == IOMAP_UNWRITTEN) {
- loff_t proc = iomap_length(&iter);
+ s64 status;
if (range_dirty) {
range_dirty = false;
- proc = iomap_zero_iter_flush_and_stale(&iter);
+ status = iomap_zero_iter_flush_and_stale(&iter);
+ } else {
+ status = iomap_iter_advance_full(&iter);
}
- iter.processed = proc;
+ iter.status = status;
continue;
}
- iter.processed = iomap_zero_iter(&iter, did_zero);
+ iter.status = iomap_zero_iter(&iter, did_zero);
}
return ret;
}
@@ -1461,7 +1456,7 @@ EXPORT_SYMBOL_GPL(iomap_zero_range);
int
iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
- const struct iomap_ops *ops)
+ const struct iomap_ops *ops, void *private)
{
unsigned int blocksize = i_blocksize(inode);
unsigned int off = pos & (blocksize - 1);
@@ -1469,11 +1464,12 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
/* Block boundary? Nothing to do */
if (!off)
return 0;
- return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
+ return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops,
+ private);
}
EXPORT_SYMBOL_GPL(iomap_truncate_page);
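
The new argument lands in iter.private, and since struct iomap_iter embeds
the iomap handed to the callbacks, a filesystem can recover it with
container_of(). Hypothetical usage (the context structure is made up):

    /* caller: */
    struct myfs_zero_ctx ctx = { .reason = MYFS_TRUNCATE };
    error = iomap_truncate_page(inode, newsize, &did_zero,
                                &myfs_iomap_ops, &ctx);

    /* inside ->iomap_begin(): */
    struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
    struct myfs_zero_ctx *ctx = iter->private;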
-static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter,
+static int iomap_folio_mkwrite_iter(struct iomap_iter *iter,
struct folio *folio)
{
loff_t length = iomap_length(iter);
@@ -1490,14 +1486,16 @@ static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter,
folio_mark_dirty(folio);
}
- return length;
+ return iomap_iter_advance(iter, &length);
}
-vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
+vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops,
+ void *private)
{
struct iomap_iter iter = {
.inode = file_inode(vmf->vma->vm_file),
.flags = IOMAP_WRITE | IOMAP_FAULT,
+ .private = private,
};
struct folio *folio = page_folio(vmf->page);
ssize_t ret;
@@ -1509,7 +1507,7 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
iter.pos = folio_pos(folio);
iter.len = ret;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_folio_mkwrite_iter(&iter, folio);
+ iter.status = iomap_folio_mkwrite_iter(&iter, folio);
if (ret < 0)
goto out_unlock;
@@ -1538,16 +1536,15 @@ static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
* state, release holds on bios, and finally free up memory. Do not use the
* ioend after this.
*/
-static u32
-iomap_finish_ioend(struct iomap_ioend *ioend, int error)
+u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
{
struct inode *inode = ioend->io_inode;
struct bio *bio = &ioend->io_bio;
struct folio_iter fi;
u32 folio_count = 0;
- if (error) {
- mapping_set_error(inode->i_mapping, error);
+ if (ioend->io_error) {
+ mapping_set_error(inode->i_mapping, ioend->io_error);
if (!bio_flagged(bio, BIO_QUIET)) {
pr_err_ratelimited(
"%s: writeback error on inode %lu, offset %lld, sector %llu",
@@ -1566,116 +1563,16 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error)
return folio_count;
}
-/*
- * Ioend completion routine for merged bios. This can only be called from task
- * contexts as merged ioends can be of unbound length. Hence we have to break up
- * the writeback completions into manageable chunks to avoid long scheduler
- * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
- * good batch processing throughput without creating adverse scheduler latency
- * conditions.
- */
-void
-iomap_finish_ioends(struct iomap_ioend *ioend, int error)
-{
- struct list_head tmp;
- u32 completions;
-
- might_sleep();
-
- list_replace_init(&ioend->io_list, &tmp);
- completions = iomap_finish_ioend(ioend, error);
-
- while (!list_empty(&tmp)) {
- if (completions > IOEND_BATCH_SIZE * 8) {
- cond_resched();
- completions = 0;
- }
- ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
- list_del_init(&ioend->io_list);
- completions += iomap_finish_ioend(ioend, error);
- }
-}
-EXPORT_SYMBOL_GPL(iomap_finish_ioends);
-
-/*
- * We can merge two adjacent ioends if they have the same set of work to do.
- */
-static bool
-iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
-{
- if (ioend->io_bio.bi_status != next->io_bio.bi_status)
- return false;
- if (next->io_flags & IOMAP_F_BOUNDARY)
- return false;
- if ((ioend->io_flags & IOMAP_F_SHARED) ^
- (next->io_flags & IOMAP_F_SHARED))
- return false;
- if ((ioend->io_type == IOMAP_UNWRITTEN) ^
- (next->io_type == IOMAP_UNWRITTEN))
- return false;
- if (ioend->io_offset + ioend->io_size != next->io_offset)
- return false;
- /*
- * Do not merge physically discontiguous ioends. The filesystem
- * completion functions will have to iterate the physical
- * discontiguities even if we merge the ioends at a logical level, so
- * we don't gain anything by merging physical discontiguities here.
- *
- * We cannot use bio->bi_iter.bi_sector here as it is modified during
- * submission so does not point to the start sector of the bio at
- * completion.
- */
- if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector)
- return false;
- return true;
-}
-
-void
-iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends)
-{
- struct iomap_ioend *next;
-
- INIT_LIST_HEAD(&ioend->io_list);
-
- while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
- io_list))) {
- if (!iomap_ioend_can_merge(ioend, next))
- break;
- list_move_tail(&next->io_list, &ioend->io_list);
- ioend->io_size += next->io_size;
- }
-}
-EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
-
-static int
-iomap_ioend_compare(void *priv, const struct list_head *a,
- const struct list_head *b)
-{
- struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
- struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
-
- if (ia->io_offset < ib->io_offset)
- return -1;
- if (ia->io_offset > ib->io_offset)
- return 1;
- return 0;
-}
-
-void
-iomap_sort_ioends(struct list_head *ioend_list)
-{
- list_sort(NULL, ioend_list, iomap_ioend_compare);
-}
-EXPORT_SYMBOL_GPL(iomap_sort_ioends);
-
static void iomap_writepage_end_bio(struct bio *bio)
{
- iomap_finish_ioend(iomap_ioend_from_bio(bio),
- blk_status_to_errno(bio->bi_status));
+ struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
+
+ ioend->io_error = blk_status_to_errno(bio->bi_status);
+ iomap_finish_ioend_buffered(ioend);
}
/*
- * Submit the final bio for an ioend.
+ * Submit an ioend.
*
* If @error is non-zero, it means that we have a situation where some part of
* the submission process has failed after we've marked pages for writeback.
@@ -1694,14 +1591,18 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
* failure happened so that the file system end I/O handler gets called
* to clean up.
*/
- if (wpc->ops->prepare_ioend)
- error = wpc->ops->prepare_ioend(wpc->ioend, error);
+ if (wpc->ops->submit_ioend) {
+ error = wpc->ops->submit_ioend(wpc, error);
+ } else {
+ if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE))
+ error = -EIO;
+ if (!error)
+ submit_bio(&wpc->ioend->io_bio);
+ }
if (error) {
wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
bio_endio(&wpc->ioend->io_bio);
- } else {
- submit_bio(&wpc->ioend->io_bio);
}
wpc->ioend = NULL;
@@ -1709,9 +1610,9 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
}
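
->submit_ioend takes over submission entirely (rather than just preparing the
bio as ->prepare_ioend did), which is what allows IOMAP_F_ANON_WRITE mappings
that have no meaningful sector until the filesystem assigns one at submit
time. A hedged sketch of an implementation:

    static int myfs_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
    {
            if (!error)
                    error = myfs_assign_blocks(wpc->ioend);  /* made up */
            if (error)
                    return error;   /* iomap ends the bio with this error */
            submit_bio(&wpc->ioend->io_bio);
            return 0;
    }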
static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
- struct writeback_control *wbc, struct inode *inode, loff_t pos)
+ struct writeback_control *wbc, struct inode *inode, loff_t pos,
+ u16 ioend_flags)
{
- struct iomap_ioend *ioend;
struct bio *bio;
bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
@@ -1719,36 +1620,24 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
GFP_NOFS, &iomap_ioend_bioset);
bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
bio->bi_end_io = iomap_writepage_end_bio;
- wbc_init_bio(wbc, bio);
bio->bi_write_hint = inode->i_write_hint;
-
- ioend = iomap_ioend_from_bio(bio);
- INIT_LIST_HEAD(&ioend->io_list);
- ioend->io_type = wpc->iomap.type;
- ioend->io_flags = wpc->iomap.flags;
- if (pos > wpc->iomap.offset)
- wpc->iomap.flags &= ~IOMAP_F_BOUNDARY;
- ioend->io_inode = inode;
- ioend->io_size = 0;
- ioend->io_offset = pos;
- ioend->io_sector = bio->bi_iter.bi_sector;
-
+ wbc_init_bio(wbc, bio);
wpc->nr_folios = 0;
- return ioend;
+ return iomap_init_ioend(inode, bio, pos, ioend_flags);
}
-static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
+static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
+ u16 ioend_flags)
{
- if (wpc->iomap.offset == pos && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
- return false;
- if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
- (wpc->ioend->io_flags & IOMAP_F_SHARED))
+ if (ioend_flags & IOMAP_IOEND_BOUNDARY)
return false;
- if (wpc->iomap.type != wpc->ioend->io_type)
+ if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
+ (wpc->ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
return false;
if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
return false;
- if (iomap_sector(&wpc->iomap, pos) !=
+ if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) &&
+ iomap_sector(&wpc->iomap, pos) !=
bio_end_sector(&wpc->ioend->io_bio))
return false;
/*
@@ -1779,14 +1668,23 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
{
struct iomap_folio_state *ifs = folio->private;
size_t poff = offset_in_folio(folio, pos);
+ unsigned int ioend_flags = 0;
int error;
- if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) {
+ if (wpc->iomap.type == IOMAP_UNWRITTEN)
+ ioend_flags |= IOMAP_IOEND_UNWRITTEN;
+ if (wpc->iomap.flags & IOMAP_F_SHARED)
+ ioend_flags |= IOMAP_IOEND_SHARED;
+ if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
+ ioend_flags |= IOMAP_IOEND_BOUNDARY;
+
+ if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) {
new_ioend:
error = iomap_submit_ioend(wpc, 0);
if (error)
return error;
- wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos);
+ wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos,
+ ioend_flags);
}
if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
@@ -2062,11 +1960,3 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
return iomap_submit_ioend(wpc, error);
}
EXPORT_SYMBOL_GPL(iomap_writepages);
-
-static int __init iomap_buffered_init(void)
-{
- return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
- offsetof(struct iomap_ioend, io_bio),
- BIOSET_NEED_BVECS);
-}
-fs_initcall(iomap_buffered_init);
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 0e47da82b0c2..844261a31156 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2010 Red Hat, Inc.
- * Copyright (c) 2016-2021 Christoph Hellwig.
+ * Copyright (c) 2016-2025 Christoph Hellwig.
*/
#include <linux/module.h>
#include <linux/compiler.h>
@@ -12,6 +12,7 @@
#include <linux/backing-dev.h>
#include <linux/uio.h>
#include <linux/task_io_accounting_ops.h>
+#include "internal.h"
#include "trace.h"
#include "../internal.h"
@@ -20,6 +21,7 @@
* Private flags for iomap_dio, must not overlap with the public ones in
* iomap.h:
*/
+#define IOMAP_DIO_NO_INVALIDATE (1U << 25)
#define IOMAP_DIO_CALLER_COMP (1U << 26)
#define IOMAP_DIO_INLINE_COMP (1U << 27)
#define IOMAP_DIO_WRITE_THROUGH (1U << 28)
@@ -81,10 +83,12 @@ static void iomap_dio_submit_bio(const struct iomap_iter *iter,
WRITE_ONCE(iocb->private, bio);
}
- if (dio->dops && dio->dops->submit_io)
+ if (dio->dops && dio->dops->submit_io) {
dio->dops->submit_io(iter, bio, pos);
- else
+ } else {
+ WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_ANON_WRITE);
submit_bio(bio);
+ }
}
ssize_t iomap_dio_complete(struct iomap_dio *dio)
@@ -117,7 +121,8 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
* ->end_io() when necessary, otherwise a racing buffer read would cache
* zeros from unwritten extents.
*/
- if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE))
+ if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE) &&
+ !(dio->flags & IOMAP_DIO_NO_INVALIDATE))
kiocb_invalidate_post_direct_write(iocb, dio->size);
inode_dio_end(file_inode(iocb->ki_filp));
@@ -163,43 +168,31 @@ static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
cmpxchg(&dio->error, 0, ret);
}
-void iomap_dio_bio_end_io(struct bio *bio)
+/*
+ * Called when dio->ref reaches zero from an I/O completion.
+ */
+static void iomap_dio_done(struct iomap_dio *dio)
{
- struct iomap_dio *dio = bio->bi_private;
- bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
struct kiocb *iocb = dio->iocb;
- if (bio->bi_status)
- iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
- if (!atomic_dec_and_test(&dio->ref))
- goto release_bio;
-
- /*
- * Synchronous dio, task itself will handle any completion work
- * that needs after IO. All we need to do is wake the task.
- */
if (dio->wait_for_completion) {
+ /*
+ * Synchronous I/O: the task itself will handle any completion
+ * work needed after IO. All we need to do is wake the task.
+ */
struct task_struct *waiter = dio->submit.waiter;
WRITE_ONCE(dio->submit.waiter, NULL);
blk_wake_io_task(waiter);
- goto release_bio;
- }
-
- /*
- * Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline
- */
- if (dio->flags & IOMAP_DIO_INLINE_COMP) {
+ } else if (dio->flags & IOMAP_DIO_INLINE_COMP) {
WRITE_ONCE(iocb->private, NULL);
iomap_dio_complete_work(&dio->aio.work);
- goto release_bio;
- }
-
- /*
- * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule
- * our completion that way to avoid an async punt to a workqueue.
- */
- if (dio->flags & IOMAP_DIO_CALLER_COMP) {
+ } else if (dio->flags & IOMAP_DIO_CALLER_COMP) {
+ /*
+ * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then
+ * schedule our completion that way to avoid an async punt to a
+ * workqueue.
+ */
/* only polled IO cares about private cleared */
iocb->private = dio;
iocb->dio_complete = iomap_dio_deferred_complete;
@@ -217,19 +210,31 @@ void iomap_dio_bio_end_io(struct bio *bio)
* issuer.
*/
iocb->ki_complete(iocb, 0);
- goto release_bio;
+ } else {
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ /*
+ * Async DIO completion that requires filesystem level
+ * completion work gets punted to a work queue to complete as
+ * the operation may require more IO to be issued to finalise
+ * filesystem metadata changes or guarantee data integrity.
+ */
+ INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
+ queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
}
+}
+
+void iomap_dio_bio_end_io(struct bio *bio)
+{
+ struct iomap_dio *dio = bio->bi_private;
+ bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
+
+ if (bio->bi_status)
+ iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
+
+ if (atomic_dec_and_test(&dio->ref))
+ iomap_dio_done(dio);
- /*
- * Async DIO completion that requires filesystem level completion work
- * gets punted to a work queue to complete as the operation may require
- * more IO to be issued to finalise filesystem metadata changes or
- * guarantee data integrity.
- */
- INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
- queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq,
- &dio->aio.work);
-release_bio:
if (should_dirty) {
bio_check_pages_dirty(bio);
} else {
@@ -239,6 +244,47 @@ release_bio:
}
EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io);
+u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend)
+{
+ struct iomap_dio *dio = ioend->io_bio.bi_private;
+ bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
+ u32 vec_count = ioend->io_bio.bi_vcnt;
+
+ if (ioend->io_error)
+ iomap_dio_set_error(dio, ioend->io_error);
+
+ if (atomic_dec_and_test(&dio->ref)) {
+ /*
+ * Try to avoid another context switch for the completion given
+ * that we are already called from the ioend completion
+ * workqueue, but never invalidate pages from this thread to
+ * avoid deadlocks with buffered I/O completions. Tough luck if
+ * you hit the tiny race with someone dirtying the range now
+ * between this check and the actual completion.
+ */
+ if (!dio->iocb->ki_filp->f_mapping->nrpages) {
+ dio->flags |= IOMAP_DIO_INLINE_COMP;
+ dio->flags |= IOMAP_DIO_NO_INVALIDATE;
+ }
+ dio->flags &= ~IOMAP_DIO_CALLER_COMP;
+ iomap_dio_done(dio);
+ }
+
+ if (should_dirty) {
+ bio_check_pages_dirty(&ioend->io_bio);
+ } else {
+ bio_release_pages(&ioend->io_bio, false);
+ bio_put(&ioend->io_bio);
+ }
+
+ /*
+ * Return the number of bvecs completed as even direct I/O completions
+ * do significant per-folio work and we'll still want to give up the
+ * CPU after a lot of completions.
+ */
+ return vec_count;
+}
+
static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
loff_t pos, unsigned len)
{
@@ -266,81 +312,85 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
}
/*
- * Figure out the bio's operation flags from the dio request, the
- * mapping, and whether or not we want FUA. Note that we can end up
- * clearing the WRITE_THROUGH flag in the dio request.
+ * Use a FUA write if we need datasync semantics and this is a pure data I/O
+ * that doesn't require any metadata updates (including after I/O completion
+ * such as unwritten extent conversion) and the underlying device either
+ * doesn't have a volatile write cache or supports FUA.
+ * This allows us to avoid cache flushes on I/O completion.
*/
-static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
- const struct iomap *iomap, bool use_fua, bool atomic)
+static inline bool iomap_dio_can_use_fua(const struct iomap *iomap,
+ struct iomap_dio *dio)
{
- blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
-
- if (!(dio->flags & IOMAP_DIO_WRITE))
- return REQ_OP_READ;
-
- opflags |= REQ_OP_WRITE;
- if (use_fua)
- opflags |= REQ_FUA;
- else
- dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
- if (atomic)
- opflags |= REQ_ATOMIC;
-
- return opflags;
+ if (iomap->flags & (IOMAP_F_SHARED | IOMAP_F_DIRTY))
+ return false;
+ if (!(dio->flags & IOMAP_DIO_WRITE_THROUGH))
+ return false;
+ return !bdev_write_cache(iomap->bdev) || bdev_fua(iomap->bdev);
}
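
From userspace, the configuration that benefits is a pure O_DIRECT overwrite
with datasync semantics on a device that has no volatile cache or supports
FUA; the write then completes without a separate flush. Illustrative example
(assumes testfile already has allocated blocks; error handling trimmed):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/uio.h>
    #include <unistd.h>

    int main(void)
    {
            void *buf;
            int fd = open("testfile", O_RDWR | O_DIRECT);

            if (fd < 0 || posix_memalign(&buf, 4096, 4096))
                    return 1;
            memset(buf, 0xab, 4096);

            struct iovec iov = { .iov_base = buf, .iov_len = 4096 };
            /* overwrite of allocated blocks: eligible for the FUA path */
            return pwritev2(fd, &iov, 1, 0, RWF_DSYNC) == 4096 ? 0 : 1;
    }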
-static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
- struct iomap_dio *dio)
+static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
{
const struct iomap *iomap = &iter->iomap;
struct inode *inode = iter->inode;
unsigned int fs_block_size = i_blocksize(inode), pad;
const loff_t length = iomap_length(iter);
- bool atomic = iter->flags & IOMAP_ATOMIC;
loff_t pos = iter->pos;
- blk_opf_t bio_opf;
+ blk_opf_t bio_opf = REQ_SYNC | REQ_IDLE;
struct bio *bio;
bool need_zeroout = false;
- bool use_fua = false;
int nr_pages, ret = 0;
- size_t copied = 0;
+ u64 copied = 0;
size_t orig_count;
- if (atomic && length != fs_block_size)
- return -EINVAL;
-
if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
!bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
return -EINVAL;
- if (iomap->type == IOMAP_UNWRITTEN) {
- dio->flags |= IOMAP_DIO_UNWRITTEN;
- need_zeroout = true;
- }
+ if (dio->flags & IOMAP_DIO_WRITE) {
+ bio_opf |= REQ_OP_WRITE;
+
+ if (iomap->flags & IOMAP_F_ATOMIC_BIO) {
+ /*
+ * Ensure that the mapping covers the full write
+ * length, otherwise it won't be submitted as a single
+ * bio, which is required to use hardware atomics.
+ */
+ if (length != iter->len)
+ return -EINVAL;
+ bio_opf |= REQ_ATOMIC;
+ }
- if (iomap->flags & IOMAP_F_SHARED)
- dio->flags |= IOMAP_DIO_COW;
+ if (iomap->type == IOMAP_UNWRITTEN) {
+ dio->flags |= IOMAP_DIO_UNWRITTEN;
+ need_zeroout = true;
+ }
+
+ if (iomap->flags & IOMAP_F_SHARED)
+ dio->flags |= IOMAP_DIO_COW;
+
+ if (iomap->flags & IOMAP_F_NEW) {
+ need_zeroout = true;
+ } else if (iomap->type == IOMAP_MAPPED) {
+ if (iomap_dio_can_use_fua(iomap, dio))
+ bio_opf |= REQ_FUA;
+ else
+ dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
+ }
- if (iomap->flags & IOMAP_F_NEW) {
- need_zeroout = true;
- } else if (iomap->type == IOMAP_MAPPED) {
/*
- * Use a FUA write if we need datasync semantics, this is a pure
- * data IO that doesn't require any metadata updates (including
- * after IO completion such as unwritten extent conversion) and
- * the underlying device either supports FUA or doesn't have
- * a volatile write cache. This allows us to avoid cache flushes
- * on IO completion. If we can't use writethrough and need to
- * sync, disable in-task completions as dio completion will
- * need to call generic_write_sync() which will do a blocking
- * fsync / cache flush call.
+ * We can only do deferred completion for pure overwrites that
+ * don't require additional I/O at completion time.
+ *
+ * This rules out writes that need zeroing or extent conversion,
+ * extend the file size, or issue metadata I/O or cache flushes
+ * during completion processing.
*/
- if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
- (dio->flags & IOMAP_DIO_WRITE_THROUGH) &&
- (bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev)))
- use_fua = true;
- else if (dio->flags & IOMAP_DIO_NEED_SYNC)
+ if (need_zeroout || (pos >= i_size_read(inode)) ||
+ ((dio->flags & IOMAP_DIO_NEED_SYNC) &&
+ !(bio_opf & REQ_FUA)))
dio->flags &= ~IOMAP_DIO_CALLER_COMP;
+ } else {
+ bio_opf |= REQ_OP_READ;
}
/*
@@ -355,18 +405,6 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
goto out;
/*
- * We can only do deferred completion for pure overwrites that
- * don't require additional IO at completion. This rules out
- * writes that need zeroing or extent conversion, extend
- * the file size, or issue journal IO or cache flushes
- * during completion processing.
- */
- if (need_zeroout ||
- ((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) ||
- ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
- dio->flags &= ~IOMAP_DIO_CALLER_COMP;
-
- /*
* The rules for polled IO completions follow the guidelines as the
* ones we set for inline and deferred completions. If none of those
* are available for this IO, clear the polled flag.
@@ -383,8 +421,6 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
goto out;
}
- bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic);
-
nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
do {
size_t n;
@@ -416,9 +452,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
}
n = bio->bi_iter.bi_size;
- if (WARN_ON_ONCE(atomic && n != length)) {
+ if (WARN_ON_ONCE((bio_opf & REQ_ATOMIC) && n != length)) {
/*
- * This bio should have covered the complete length,
+ * An atomic write bio must cover the complete length,
* which it doesn't, so error. We may need to zero out
* the tail (complete FS block), similar to when
* bio_iov_iter_get_pages() returns an error, above.
@@ -465,30 +501,28 @@ out:
/* Undo iter limitation to current extent */
iov_iter_reexpand(dio->submit.iter, orig_count - copied);
if (copied)
- return copied;
+ return iomap_iter_advance(iter, &copied);
return ret;
}
-static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter,
- struct iomap_dio *dio)
+static int iomap_dio_hole_iter(struct iomap_iter *iter, struct iomap_dio *dio)
{
loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter);
dio->size += length;
if (!length)
return -EFAULT;
- return length;
+ return iomap_iter_advance(iter, &length);
}
-static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi,
- struct iomap_dio *dio)
+static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio)
{
const struct iomap *iomap = &iomi->iomap;
struct iov_iter *iter = dio->submit.iter;
void *inline_data = iomap_inline_data(iomap, iomi->pos);
loff_t length = iomap_length(iomi);
loff_t pos = iomi->pos;
- size_t copied;
+ u64 copied;
if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap)))
return -EIO;
@@ -510,11 +544,10 @@ static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi,
dio->size += copied;
if (!copied)
return -EFAULT;
- return copied;
+ return iomap_iter_advance(iomi, &copied);
}
-static loff_t iomap_dio_iter(const struct iomap_iter *iter,
- struct iomap_dio *dio)
+static int iomap_dio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
{
switch (iter->iomap.type) {
case IOMAP_HOLE:
@@ -608,9 +641,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iocb->ki_flags & IOCB_NOWAIT)
iomi.flags |= IOMAP_NOWAIT;
- if (iocb->ki_flags & IOCB_ATOMIC)
- iomi.flags |= IOMAP_ATOMIC;
-
if (iov_iter_rw(iter) == READ) {
/* reads can always complete inline */
dio->flags |= IOMAP_DIO_INLINE_COMP;
@@ -645,6 +675,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
iomi.flags |= IOMAP_OVERWRITE_ONLY;
}
+ if (iocb->ki_flags & IOCB_ATOMIC)
+ iomi.flags |= IOMAP_ATOMIC;
+
/* for data sync or sync, we need sync completion processing */
if (iocb_is_dsync(iocb)) {
dio->flags |= IOMAP_DIO_NEED_SYNC;
@@ -698,7 +731,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
blk_start_plug(&plug);
while ((ret = iomap_iter(&iomi, ops)) > 0) {
- iomi.processed = iomap_dio_iter(&iomi, dio);
+ iomi.status = iomap_dio_iter(&iomi, dio);
/*
* We can only poll for single bio I/Os.
diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c
index 610ca6f1ec9b..80675c42e94e 100644
--- a/fs/iomap/fiemap.c
+++ b/fs/iomap/fiemap.c
@@ -39,24 +39,23 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
iomap->length, flags);
}
-static loff_t iomap_fiemap_iter(const struct iomap_iter *iter,
+static int iomap_fiemap_iter(struct iomap_iter *iter,
struct fiemap_extent_info *fi, struct iomap *prev)
{
int ret;
if (iter->iomap.type == IOMAP_HOLE)
- return iomap_length(iter);
+ goto advance;
ret = iomap_to_fiemap(fi, prev, 0);
*prev = iter->iomap;
- switch (ret) {
- case 0: /* success */
- return iomap_length(iter);
- case 1: /* extent array full */
- return 0;
- default: /* error */
+ if (ret < 0)
return ret;
- }
+ if (ret == 1) /* extent array full */
+ return 0;
+
+advance:
+ return iomap_iter_advance_full(iter);
}
int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
@@ -78,7 +77,7 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
return ret;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_fiemap_iter(&iter, fi, &prev);
+ iter.status = iomap_fiemap_iter(&iter, fi, &prev);
if (prev.type != IOMAP_HOLE) {
ret = iomap_to_fiemap(fi, &prev, FIEMAP_EXTENT_LAST);
@@ -114,7 +113,7 @@ iomap_bmap(struct address_space *mapping, sector_t bno,
while ((ret = iomap_iter(&iter, ops)) > 0) {
if (iter.iomap.type == IOMAP_MAPPED)
bno = iomap_sector(&iter.iomap, iter.pos) >> blkshift;
- /* leave iter.processed unset to abort loop */
+ /* leave iter.status unset to abort loop */
}
if (ret)
return 0;
diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h
new file mode 100644
index 000000000000..f6992a3bf66a
--- /dev/null
+++ b/fs/iomap/internal.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _IOMAP_INTERNAL_H
+#define _IOMAP_INTERNAL_H 1
+
+#define IOEND_BATCH_SIZE 4096
+
+u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend);
+u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend);
+
+#endif /* _IOMAP_INTERNAL_H */
diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
new file mode 100644
index 000000000000..18894ebba6db
--- /dev/null
+++ b/fs/iomap/ioend.c
@@ -0,0 +1,216 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2024-2025 Christoph Hellwig.
+ */
+#include <linux/iomap.h>
+#include <linux/list_sort.h>
+#include "internal.h"
+
+struct bio_set iomap_ioend_bioset;
+EXPORT_SYMBOL_GPL(iomap_ioend_bioset);
+
+struct iomap_ioend *iomap_init_ioend(struct inode *inode,
+ struct bio *bio, loff_t file_offset, u16 ioend_flags)
+{
+ struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
+
+ atomic_set(&ioend->io_remaining, 1);
+ ioend->io_error = 0;
+ ioend->io_parent = NULL;
+ INIT_LIST_HEAD(&ioend->io_list);
+ ioend->io_flags = ioend_flags;
+ ioend->io_inode = inode;
+ ioend->io_offset = file_offset;
+ ioend->io_size = bio->bi_iter.bi_size;
+ ioend->io_sector = bio->bi_iter.bi_sector;
+ ioend->io_private = NULL;
+ return ioend;
+}
+EXPORT_SYMBOL_GPL(iomap_init_ioend);
+
+static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error)
+{
+ if (ioend->io_parent) {
+ struct bio *bio = &ioend->io_bio;
+
+ ioend = ioend->io_parent;
+ bio_put(bio);
+ }
+
+ if (error)
+ cmpxchg(&ioend->io_error, 0, error);
+
+ if (!atomic_dec_and_test(&ioend->io_remaining))
+ return 0;
+ if (ioend->io_flags & IOMAP_IOEND_DIRECT)
+ return iomap_finish_ioend_direct(ioend);
+ return iomap_finish_ioend_buffered(ioend);
+}
+
+/*
+ * Ioend completion routine for merged bios. This can only be called from task
+ * contexts as merged ioends can be of unbounded length. Hence we have to break up
+ * the writeback completions into manageable chunks to avoid long scheduler
+ * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
+ * good batch processing throughput without creating adverse scheduler latency
+ * conditions.
+ */
+void iomap_finish_ioends(struct iomap_ioend *ioend, int error)
+{
+ struct list_head tmp;
+ u32 completions;
+
+ might_sleep();
+
+ list_replace_init(&ioend->io_list, &tmp);
+ completions = iomap_finish_ioend(ioend, error);
+
+ while (!list_empty(&tmp)) {
+ if (completions > IOEND_BATCH_SIZE * 8) {
+ cond_resched();
+ completions = 0;
+ }
+ ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
+ list_del_init(&ioend->io_list);
+ completions += iomap_finish_ioend(ioend, error);
+ }
+}
+EXPORT_SYMBOL_GPL(iomap_finish_ioends);
+
+/*
+ * We can merge two adjacent ioends if they have the same set of work to do.
+ */
+static bool iomap_ioend_can_merge(struct iomap_ioend *ioend,
+ struct iomap_ioend *next)
+{
+ if (ioend->io_bio.bi_status != next->io_bio.bi_status)
+ return false;
+ if (next->io_flags & IOMAP_IOEND_BOUNDARY)
+ return false;
+ if ((ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
+ (next->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
+ return false;
+ if (ioend->io_offset + ioend->io_size != next->io_offset)
+ return false;
+ /*
+ * Do not merge physically discontiguous ioends. The filesystem
+ * completion functions will have to iterate the physical
+ * discontiguities even if we merge the ioends at a logical level, so
+ * we don't gain anything by merging physical discontiguities here.
+ *
+ * We cannot use bio->bi_iter.bi_sector here as it is modified during
+ * submission so does not point to the start sector of the bio at
+ * completion.
+ */
+ if (ioend->io_sector + (ioend->io_size >> SECTOR_SHIFT) !=
+ next->io_sector)
+ return false;
+ return true;
+}
+
+void iomap_ioend_try_merge(struct iomap_ioend *ioend,
+ struct list_head *more_ioends)
+{
+ struct iomap_ioend *next;
+
+ INIT_LIST_HEAD(&ioend->io_list);
+
+ while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
+ io_list))) {
+ if (!iomap_ioend_can_merge(ioend, next))
+ break;
+ list_move_tail(&next->io_list, &ioend->io_list);
+ ioend->io_size += next->io_size;
+ }
+}
+EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
+
+static int iomap_ioend_compare(void *priv, const struct list_head *a,
+ const struct list_head *b)
+{
+ struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
+ struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
+
+ if (ia->io_offset < ib->io_offset)
+ return -1;
+ if (ia->io_offset > ib->io_offset)
+ return 1;
+ return 0;
+}
+
+void iomap_sort_ioends(struct list_head *ioend_list)
+{
+ list_sort(NULL, ioend_list, iomap_ioend_compare);
+}
+EXPORT_SYMBOL_GPL(iomap_sort_ioends);
+
+/*
+ * Split up to the first @max_len bytes from @ioend if the ioend covers more
+ * than @max_len bytes.
+ *
+ * If @is_append is set, the split will be based on the hardware limits for
+ * REQ_OP_ZONE_APPEND commands and can be less than @max_len if the hardware
+ * limits don't allow the entire @max_len length.
+ *
+ * The bio embedded into @ioend must be a REQ_OP_WRITE because the block layer
+ * does not allow splitting REQ_OP_ZONE_APPEND bios. The filesystem has to
+ * switch the operation after this call, but before submitting the bio.
+ */
+struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend,
+ unsigned int max_len, bool is_append)
+{
+ struct bio *bio = &ioend->io_bio;
+ struct iomap_ioend *split_ioend;
+ unsigned int nr_segs;
+ int sector_offset;
+ struct bio *split;
+
+ if (is_append) {
+ struct queue_limits *lim = bdev_limits(bio->bi_bdev);
+
+ max_len = min(max_len,
+ lim->max_zone_append_sectors << SECTOR_SHIFT);
+
+ sector_offset = bio_split_rw_at(bio, lim, &nr_segs, max_len);
+ if (unlikely(sector_offset < 0))
+ return ERR_PTR(sector_offset);
+ if (!sector_offset)
+ return NULL;
+ } else {
+ if (bio->bi_iter.bi_size <= max_len)
+ return NULL;
+ sector_offset = max_len >> SECTOR_SHIFT;
+ }
+
+ /* ensure the split ioend is still block size aligned */
+ sector_offset = ALIGN_DOWN(sector_offset << SECTOR_SHIFT,
+ i_blocksize(ioend->io_inode)) >> SECTOR_SHIFT;
+
+ split = bio_split(bio, sector_offset, GFP_NOFS, &iomap_ioend_bioset);
+ if (IS_ERR(split))
+ return ERR_CAST(split);
+ split->bi_private = bio->bi_private;
+ split->bi_end_io = bio->bi_end_io;
+
+ split_ioend = iomap_init_ioend(ioend->io_inode, split, ioend->io_offset,
+ ioend->io_flags);
+ split_ioend->io_parent = ioend;
+
+ atomic_inc(&ioend->io_remaining);
+ ioend->io_offset += split_ioend->io_size;
+ ioend->io_size -= split_ioend->io_size;
+
+ split_ioend->io_sector = ioend->io_sector;
+ if (!is_append)
+ ioend->io_sector += (split_ioend->io_size >> SECTOR_SHIFT);
+ return split_ioend;
+}
+EXPORT_SYMBOL_GPL(iomap_split_ioend);
+
+static int __init iomap_ioend_init(void)
+{
+ return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
+ offsetof(struct iomap_ioend, io_bio),
+ BIOSET_NEED_BVECS);
+}
+fs_initcall(iomap_ioend_init);
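For context, a hedged sketch of how a filesystem completion path is expected
to drive the helpers above (this mirrors the usual XFS pattern; the list and
error variables are illustrative):

static void example_finish_ioends(struct list_head *completion_list, int error)
{
	struct iomap_ioend *ioend;

	iomap_sort_ioends(completion_list);
	while (!list_empty(completion_list)) {
		ioend = list_first_entry(completion_list,
				struct iomap_ioend, io_list);
		list_del_init(&ioend->io_list);
		iomap_ioend_try_merge(ioend, completion_list);
		iomap_finish_ioends(ioend, error);
	}
}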
diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c
index 3790918646af..6ffc6a7b9ba5 100644
--- a/fs/iomap/iter.c
+++ b/fs/iomap/iter.c
@@ -7,40 +7,25 @@
#include <linux/iomap.h>
#include "trace.h"
-/*
- * Advance to the next range we need to map.
- *
- * If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully
- * processed - it was aborted because the extent the iomap spanned may have been
- * changed during the operation. In this case, the iteration behaviour is to
- * remap the unprocessed range of the iter, and that means we may need to remap
- * even when we've made no progress (i.e. iter->processed = 0). Hence the
- * "finished iterating" case needs to distinguish between
- * (processed = 0) meaning we are done and (processed = 0 && stale) meaning we
- * need to remap the entire remaining range.
- */
-static inline int iomap_iter_advance(struct iomap_iter *iter)
+static inline void iomap_iter_reset_iomap(struct iomap_iter *iter)
{
- bool stale = iter->iomap.flags & IOMAP_F_STALE;
- int ret = 1;
-
- /* handle the previous iteration (if any) */
- if (iter->iomap.length) {
- if (iter->processed < 0)
- return iter->processed;
- if (WARN_ON_ONCE(iter->processed > iomap_length(iter)))
- return -EIO;
- iter->pos += iter->processed;
- iter->len -= iter->processed;
- if (!iter->len || (!iter->processed && !stale))
- ret = 0;
- }
-
- /* clear the per iteration state */
- iter->processed = 0;
+ iter->status = 0;
memset(&iter->iomap, 0, sizeof(iter->iomap));
memset(&iter->srcmap, 0, sizeof(iter->srcmap));
- return ret;
+}
+
+/*
+ * Advance the current iterator position and return, via @count, the length
+ * remaining in the current mapping.
+ */
+int iomap_iter_advance(struct iomap_iter *iter, u64 *count)
+{
+ if (WARN_ON_ONCE(*count > iomap_length(iter)))
+ return -EIO;
+ iter->pos += *count;
+ iter->len -= *count;
+ *count = iomap_length(iter);
+ return 0;
}
static inline void iomap_iter_done(struct iomap_iter *iter)
@@ -50,6 +35,8 @@ static inline void iomap_iter_done(struct iomap_iter *iter)
WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos);
WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE);
+ iter->iter_start_pos = iter->pos;
+
trace_iomap_iter_dstmap(iter->inode, &iter->iomap);
if (iter->srcmap.type != IOMAP_HOLE)
trace_iomap_iter_srcmap(iter->inode, &iter->srcmap);
@@ -67,26 +54,58 @@ static inline void iomap_iter_done(struct iomap_iter *iter)
* function must be called in a loop that continues as long as it returns a
* positive value. If 0 or a negative value is returned, the caller must not
* return to the loop body. Within a loop body, there are two ways to break out
- * of the loop body: leave @iter.processed unchanged, or set it to a negative
+ * of the loop body: leave @iter.status unchanged, or set it to a negative
* errno.
*/
int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops)
{
+ bool stale = iter->iomap.flags & IOMAP_F_STALE;
+ ssize_t advanced;
+ u64 olen;
int ret;
- if (iter->iomap.length && ops->iomap_end) {
- ret = ops->iomap_end(iter->inode, iter->pos, iomap_length(iter),
- iter->processed > 0 ? iter->processed : 0,
- iter->flags, &iter->iomap);
- if (ret < 0 && !iter->processed)
+ trace_iomap_iter(iter, ops, _RET_IP_);
+
+ if (!iter->iomap.length)
+ goto begin;
+
+ /*
+	 * Calculate how far the iter was advanced and the original length in bytes
+ * for ->iomap_end().
+ */
+ advanced = iter->pos - iter->iter_start_pos;
+ olen = iter->len + advanced;
+
+ if (ops->iomap_end) {
+ ret = ops->iomap_end(iter->inode, iter->iter_start_pos,
+ iomap_length_trim(iter, iter->iter_start_pos,
+ olen),
+ advanced, iter->flags, &iter->iomap);
+ if (ret < 0 && !advanced)
return ret;
}
- trace_iomap_iter(iter, ops, _RET_IP_);
- ret = iomap_iter_advance(iter);
+ /* detect old return semantics where this would advance */
+ if (WARN_ON_ONCE(iter->status > 0))
+ iter->status = -EIO;
+
+ /*
+ * Use iter->len to determine whether to continue onto the next mapping.
+ * Explicitly terminate on error status or if the current iter has not
+ * advanced at all (i.e. no work was done for some reason) unless the
+ * mapping has been marked stale and needs to be reprocessed.
+ */
+ if (iter->status < 0)
+ ret = iter->status;
+ else if (iter->len == 0 || (!advanced && !stale))
+ ret = 0;
+ else
+ ret = 1;
+ iomap_iter_reset_iomap(iter);
if (ret <= 0)
return ret;
+begin:
ret = ops->iomap_begin(iter->inode, iter->pos, iter->len, iter->flags,
&iter->iomap, &iter->srcmap);
if (ret < 0)
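Taken together, the new contract for iomap_iter() drivers reads roughly as
below; my_iter_helper and data are placeholders:

	struct iomap_iter iter = {
		.inode	= inode,
		.pos	= pos,
		.len	= length,
	};
	int ret;

	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.status = my_iter_helper(&iter, data);
	/*
	 * Helpers advance iter.pos/iter.len via iomap_iter_advance(); a helper
	 * that neither advances nor sets a negative iter.status ends the loop.
	 */
	return ret;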
diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c
index a845c012b50c..04d7919636c1 100644
--- a/fs/iomap/seek.c
+++ b/fs/iomap/seek.c
@@ -10,7 +10,7 @@
#include <linux/pagemap.h>
#include <linux/pagevec.h>
-static loff_t iomap_seek_hole_iter(const struct iomap_iter *iter,
+static int iomap_seek_hole_iter(struct iomap_iter *iter,
loff_t *hole_pos)
{
loff_t length = iomap_length(iter);
@@ -20,13 +20,13 @@ static loff_t iomap_seek_hole_iter(const struct iomap_iter *iter,
*hole_pos = mapping_seek_hole_data(iter->inode->i_mapping,
iter->pos, iter->pos + length, SEEK_HOLE);
if (*hole_pos == iter->pos + length)
- return length;
+ return iomap_iter_advance(iter, &length);
return 0;
case IOMAP_HOLE:
*hole_pos = iter->pos;
return 0;
default:
- return length;
+ return iomap_iter_advance(iter, &length);
}
}
@@ -47,7 +47,7 @@ iomap_seek_hole(struct inode *inode, loff_t pos, const struct iomap_ops *ops)
iter.len = size - pos;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_seek_hole_iter(&iter, &pos);
+ iter.status = iomap_seek_hole_iter(&iter, &pos);
if (ret < 0)
return ret;
if (iter.len) /* found hole before EOF */
@@ -56,19 +56,19 @@ iomap_seek_hole(struct inode *inode, loff_t pos, const struct iomap_ops *ops)
}
EXPORT_SYMBOL_GPL(iomap_seek_hole);
-static loff_t iomap_seek_data_iter(const struct iomap_iter *iter,
+static int iomap_seek_data_iter(struct iomap_iter *iter,
loff_t *hole_pos)
{
loff_t length = iomap_length(iter);
switch (iter->iomap.type) {
case IOMAP_HOLE:
- return length;
+ return iomap_iter_advance(iter, &length);
case IOMAP_UNWRITTEN:
*hole_pos = mapping_seek_hole_data(iter->inode->i_mapping,
iter->pos, iter->pos + length, SEEK_DATA);
if (*hole_pos < 0)
- return length;
+ return iomap_iter_advance(iter, &length);
return 0;
default:
*hole_pos = iter->pos;
@@ -93,7 +93,7 @@ iomap_seek_data(struct inode *inode, loff_t pos, const struct iomap_ops *ops)
iter.len = size - pos;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_seek_data_iter(&iter, &pos);
+ iter.status = iomap_seek_data_iter(&iter, &pos);
if (ret < 0)
return ret;
if (iter.len) /* found data before EOF */
diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c
index b90d0eda9e51..c1a762c10ce4 100644
--- a/fs/iomap/swapfile.c
+++ b/fs/iomap/swapfile.c
@@ -94,7 +94,7 @@ static int iomap_swapfile_fail(struct iomap_swapfile_info *isi, const char *str)
* swap only cares about contiguous page-aligned physical extents and makes no
* distinction between written and unwritten extents.
*/
-static loff_t iomap_swapfile_iter(const struct iomap_iter *iter,
+static int iomap_swapfile_iter(struct iomap_iter *iter,
struct iomap *iomap, struct iomap_swapfile_info *isi)
{
switch (iomap->type) {
@@ -132,7 +132,8 @@ static loff_t iomap_swapfile_iter(const struct iomap_iter *iter,
return error;
memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
}
- return iomap_length(iter);
+
+ return iomap_iter_advance_full(iter);
}
/*
@@ -166,7 +167,7 @@ int iomap_swapfile_activate(struct swap_info_struct *sis,
return ret;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_swapfile_iter(&iter, &iter.iomap, &isi);
+ iter.status = iomap_swapfile_iter(&iter, &iter.iomap, &isi);
if (ret < 0)
return ret;
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index 4118a42cdab0..9eab2c8ac3c5 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -207,7 +207,7 @@ TRACE_EVENT(iomap_iter,
__field(u64, ino)
__field(loff_t, pos)
__field(u64, length)
- __field(s64, processed)
+ __field(int, status)
__field(unsigned int, flags)
__field(const void *, ops)
__field(unsigned long, caller)
@@ -217,17 +217,17 @@ TRACE_EVENT(iomap_iter,
__entry->ino = iter->inode->i_ino;
__entry->pos = iter->pos;
__entry->length = iomap_length(iter);
- __entry->processed = iter->processed;
+ __entry->status = iter->status;
__entry->flags = iter->flags;
__entry->ops = ops;
__entry->caller = caller;
),
- TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx processed %lld flags %s (0x%x) ops %ps caller %pS",
+ TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx status %d flags %s (0x%x) ops %ps caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->pos,
__entry->length,
- __entry->processed,
+ __entry->status,
__print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS),
__entry->flags,
__entry->ops,
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 2b2938970da3..dd91f725ded6 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -32,8 +32,8 @@ static int jffs2_link (struct dentry *,struct inode *,struct dentry *);
static int jffs2_unlink (struct inode *,struct dentry *);
static int jffs2_symlink (struct mnt_idmap *, struct inode *,
struct dentry *, const char *);
-static int jffs2_mkdir (struct mnt_idmap *, struct inode *,struct dentry *,
- umode_t);
+static struct dentry *jffs2_mkdir (struct mnt_idmap *, struct inode *,struct dentry *,
+ umode_t);
static int jffs2_rmdir (struct inode *,struct dentry *);
static int jffs2_mknod (struct mnt_idmap *, struct inode *,struct dentry *,
umode_t,dev_t);
@@ -446,8 +446,8 @@ static int jffs2_symlink (struct mnt_idmap *idmap, struct inode *dir_i,
}
-static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i,
- struct dentry *dentry, umode_t mode)
+static struct dentry *jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i,
+ struct dentry *dentry, umode_t mode)
{
struct jffs2_inode_info *f, *dir_f;
struct jffs2_sb_info *c;
@@ -464,7 +464,7 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i,
ri = jffs2_alloc_raw_inode();
if (!ri)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
c = JFFS2_SB_INFO(dir_i->i_sb);
@@ -477,7 +477,7 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i,
if (ret) {
jffs2_free_raw_inode(ri);
- return ret;
+ return ERR_PTR(ret);
}
inode = jffs2_new_inode(dir_i, mode, ri);
@@ -485,7 +485,7 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i,
if (IS_ERR(inode)) {
jffs2_free_raw_inode(ri);
jffs2_complete_reservation(c);
- return PTR_ERR(inode);
+ return ERR_CAST(inode);
}
inode->i_op = &jffs2_dir_inode_operations;
@@ -584,11 +584,11 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i,
jffs2_complete_reservation(c);
d_instantiate_new(dentry, inode);
- return 0;
+ return NULL;
fail:
iget_failed(inode);
- return ret;
+ return ERR_PTR(ret);
}
static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
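The same conversion is applied to every ->mkdir instance in this series:
return NULL when the passed-in dentry was instantiated, an ERR_PTR() on
failure, or a different positive dentry if one was spliced in. A minimal
template, with example_new_inode() standing in for the filesystem's inode
allocation:

static struct dentry *example_mkdir(struct mnt_idmap *idmap, struct inode *dir,
				    struct dentry *dentry, umode_t mode)
{
	struct inode *inode = example_new_inode(dir, S_IFDIR | mode);

	if (IS_ERR(inode))
		return ERR_CAST(inode);
	d_instantiate(dentry, inode);
	return NULL;	/* NULL: @dentry was used as-is */
}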
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index fc8ede43afde..65a218eba8fa 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -187,13 +187,13 @@ static int jfs_create(struct mnt_idmap *idmap, struct inode *dip,
* dentry - dentry of child directory
* mode - create mode (rwxrwxrwx).
*
- * RETURN: Errors from subroutines
+ * RETURN: ERR_PTR() of errors from subroutines.
*
* note:
* EACCES: user needs search+write permission on the parent directory
*/
-static int jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip,
- struct dentry *dentry, umode_t mode)
+static struct dentry *jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip,
+ struct dentry *dentry, umode_t mode)
{
int rc = 0;
tid_t tid; /* transaction id */
@@ -308,7 +308,7 @@ static int jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip,
out1:
jfs_info("jfs_mkdir: rc:%d", rc);
- return rc;
+ return ERR_PTR(rc);
}
/*
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 5f0f8b95f44c..d296aad70800 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -1230,24 +1230,24 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
return d_splice_alias(inode, dentry);
}
-static int kernfs_iop_mkdir(struct mnt_idmap *idmap,
- struct inode *dir, struct dentry *dentry,
- umode_t mode)
+static struct dentry *kernfs_iop_mkdir(struct mnt_idmap *idmap,
+ struct inode *dir, struct dentry *dentry,
+ umode_t mode)
{
struct kernfs_node *parent = dir->i_private;
struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops;
int ret;
if (!scops || !scops->mkdir)
- return -EPERM;
+ return ERR_PTR(-EPERM);
if (!kernfs_get_active(parent))
- return -ENODEV;
+ return ERR_PTR(-ENODEV);
ret = scops->mkdir(parent, dentry->d_name.name, mode);
kernfs_put_active(parent);
- return ret;
+ return ERR_PTR(ret);
}
static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry)
diff --git a/fs/libfs.c b/fs/libfs.c
index 8444f5cc4064..6393d7c49ee6 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -496,7 +496,7 @@ offset_dir_lookup(struct dentry *parent, loff_t offset)
found = find_positive_dentry(parent, NULL, false);
else {
rcu_read_lock();
- child = mas_find(&mas, DIR_OFFSET_MAX);
+ child = mas_find_rev(&mas, DIR_OFFSET_MIN);
found = find_positive_dentry(parent, child, false);
rcu_read_unlock();
}
@@ -2113,7 +2113,7 @@ struct timespec64 simple_inode_init_ts(struct inode *inode)
}
EXPORT_SYMBOL(simple_inode_init_ts);
-static inline struct dentry *get_stashed_dentry(struct dentry **stashed)
+struct dentry *stashed_dentry_get(struct dentry **stashed)
{
struct dentry *dentry;
@@ -2215,7 +2215,7 @@ int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info;
/* See if dentry can be reused. */
- path->dentry = get_stashed_dentry(stashed);
+ path->dentry = stashed_dentry_get(stashed);
if (path->dentry) {
sops->put_data(data);
goto out_path;
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 5d9c1406fe27..8938536d8d3c 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -104,15 +104,15 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir,
return add_nondir(dentry, inode);
}
-static int minix_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *minix_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode * inode;
int err;
inode = minix_new_inode(dir, S_IFDIR | mode);
if (IS_ERR(inode))
- return PTR_ERR(inode);
+ return ERR_CAST(inode);
inode_inc_link_count(dir);
minix_set_inode(inode, 0);
@@ -128,7 +128,7 @@ static int minix_mkdir(struct mnt_idmap *idmap, struct inode *dir,
d_instantiate(dentry, inode);
out:
- return err;
+ return ERR_PTR(err);
out_fail:
inode_dec_link_count(inode);
diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c
index 7b1df8cc2821..a37991fdb194 100644
--- a/fs/mnt_idmapping.c
+++ b/fs/mnt_idmapping.c
@@ -6,6 +6,7 @@
#include <linux/mnt_idmapping.h>
#include <linux/slab.h>
#include <linux/user_namespace.h>
+#include <linux/seq_file.h>
#include "internal.h"
@@ -334,3 +335,53 @@ void mnt_idmap_put(struct mnt_idmap *idmap)
free_mnt_idmap(idmap);
}
EXPORT_SYMBOL_GPL(mnt_idmap_put);
+
+int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map)
+{
+ struct uid_gid_map *map, *map_up;
+ u32 idx, nr_mappings;
+
+ if (!is_valid_mnt_idmap(idmap))
+ return 0;
+
+ /*
+ * Idmappings are shown relative to the caller's idmapping.
+ * This is both the most intuitive and most useful solution.
+ */
+ if (uid_map) {
+ map = &idmap->uid_map;
+ map_up = &current_user_ns()->uid_map;
+ } else {
+ map = &idmap->gid_map;
+ map_up = &current_user_ns()->gid_map;
+ }
+
+ for (idx = 0, nr_mappings = 0; idx < map->nr_extents; idx++) {
+ uid_t lower;
+ struct uid_gid_extent *extent;
+
+ if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+ extent = &map->extent[idx];
+ else
+ extent = &map->forward[idx];
+
+ /*
+ * Verify that the whole range of the mapping can be
+ * resolved in the caller's idmapping. If it cannot be
+		 * resolved, skip the mapping.
+ */
+ lower = map_id_range_up(map_up, extent->lower_first, extent->count);
+ if (lower == (uid_t) -1)
+ continue;
+
+ seq_printf(seq, "%u %u %u", extent->first, lower, extent->count);
+
+ seq->count++; /* mappings are separated by \0 */
+ if (seq_has_overflowed(seq))
+ return -EAGAIN;
+
+ nr_mappings++;
+ }
+
+ return nr_mappings;
+}
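Each mapping is emitted as a "first lower count" triple, and the seq->count
bump above turns the string terminator into a '\0' record separator. A hedged
userspace sketch of walking such records (how the buffer is requested via
statmount() is outside this hunk and not shown):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

static void walk_idmap_records(const char *buf, size_t len, uint32_t nr)
{
	const char *p = buf, *end = buf + len;

	for (uint32_t i = 0; i < nr && p < end; i++) {
		unsigned int first, lower, count;

		if (sscanf(p, "%u %u %u", &first, &lower, &count) == 3)
			printf("%u -> %u (count %u)\n", first, lower, count);
		p += strlen(p) + 1;	/* skip the '\0' separator */
	}
}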
diff --git a/fs/mount.h b/fs/mount.h
index ffb613cdfeee..7aecf2a60472 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -5,6 +5,12 @@
#include <linux/ns_common.h>
#include <linux/fs_pin.h>
+extern struct list_head notify_list;
+
+typedef __u32 __bitwise mntns_flags_t;
+
+#define MNTNS_PROPAGATING ((__force mntns_flags_t)(1 << 0))
+
struct mnt_namespace {
struct ns_common ns;
struct mount * root;
@@ -20,12 +26,18 @@ struct mnt_namespace {
wait_queue_head_t poll;
struct rcu_head mnt_ns_rcu;
};
+ u64 seq_origin; /* Sequence number of origin mount namespace */
u64 event;
+#ifdef CONFIG_FSNOTIFY
+ __u32 n_fsnotify_mask;
+ struct fsnotify_mark_connector __rcu *n_fsnotify_marks;
+#endif
unsigned int nr_mounts; /* # of mounts in the namespace */
unsigned int pending_mounts;
struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */
refcount_t passive; /* number references not pinning @mounts */
+ mntns_flags_t mntns_flags;
} __randomize_layout;
struct mnt_pcp {
@@ -76,6 +88,8 @@ struct mount {
#ifdef CONFIG_FSNOTIFY
struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
__u32 mnt_fsnotify_mask;
+ struct list_head to_notify; /* need to queue notification */
+ struct mnt_namespace *prev_ns; /* previous namespace (NULL if none) */
#endif
int mnt_id; /* mount identifier, reused */
u64 mnt_id_unique; /* mount ID unique until reboot */
@@ -156,6 +170,11 @@ static inline bool mnt_ns_attached(const struct mount *mnt)
return !RB_EMPTY_NODE(&mnt->mnt_node);
}
+static inline bool mnt_ns_empty(const struct mnt_namespace *ns)
+{
+ return RB_EMPTY_ROOT(&ns->mounts);
+}
+
static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
{
struct mnt_namespace *ns = mnt->mnt_ns;
@@ -177,3 +196,21 @@ static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
{
return container_of(ns, struct mnt_namespace, ns);
}
+
+#ifdef CONFIG_FSNOTIFY
+static inline void mnt_notify_add(struct mount *m)
+{
+ /* Optimize the case where there are no watches */
+ if ((m->mnt_ns && m->mnt_ns->n_fsnotify_marks) ||
+ (m->prev_ns && m->prev_ns->n_fsnotify_marks))
+ list_add_tail(&m->to_notify, &notify_list);
+ else
+ m->prev_ns = m->mnt_ns;
+}
+#else
+static inline void mnt_notify_add(struct mount *m)
+{
+}
+#endif
+
+struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry);
diff --git a/fs/mpage.c b/fs/mpage.c
index 82aecf372743..ad7844de87c3 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -107,7 +107,7 @@ static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh,
* don't make any buffers if there is only one buffer on
* the folio and the folio just needs to be set up to date
*/
- if (inode->i_blkbits == PAGE_SHIFT &&
+ if (inode->i_blkbits == folio_shift(folio) &&
buffer_uptodate(bh)) {
folio_mark_uptodate(folio);
return;
@@ -153,7 +153,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
struct folio *folio = args->folio;
struct inode *inode = folio->mapping->host;
const unsigned blkbits = inode->i_blkbits;
- const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
+ const unsigned blocks_per_folio = folio_size(folio) >> blkbits;
const unsigned blocksize = 1 << blkbits;
struct buffer_head *map_bh = &args->map_bh;
sector_t block_in_file;
@@ -161,7 +161,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
sector_t last_block_in_file;
sector_t first_block;
unsigned page_block;
- unsigned first_hole = blocks_per_page;
+ unsigned first_hole = blocks_per_folio;
struct block_device *bdev = NULL;
int length;
int fully_mapped = 1;
@@ -170,9 +170,6 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
unsigned relative_block;
gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);
- /* MAX_BUF_PER_PAGE, for example */
- VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
-
if (args->is_readahead) {
opf |= REQ_RAHEAD;
gfp |= __GFP_NORETRY | __GFP_NOWARN;
@@ -181,8 +178,8 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
if (folio_buffers(folio))
goto confused;
- block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits);
- last_block = block_in_file + args->nr_pages * blocks_per_page;
+ block_in_file = folio_pos(folio) >> blkbits;
+ last_block = block_in_file + ((args->nr_pages * PAGE_SIZE) >> blkbits);
last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
if (last_block > last_block_in_file)
last_block = last_block_in_file;
@@ -204,7 +201,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
clear_buffer_mapped(map_bh);
break;
}
- if (page_block == blocks_per_page)
+ if (page_block == blocks_per_folio)
break;
page_block++;
block_in_file++;
@@ -216,7 +213,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
* Then do more get_blocks calls until we are done with this folio.
*/
map_bh->b_folio = folio;
- while (page_block < blocks_per_page) {
+ while (page_block < blocks_per_folio) {
map_bh->b_state = 0;
map_bh->b_size = 0;
@@ -229,7 +226,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
if (!buffer_mapped(map_bh)) {
fully_mapped = 0;
- if (first_hole == blocks_per_page)
+ if (first_hole == blocks_per_folio)
first_hole = page_block;
page_block++;
block_in_file++;
@@ -247,7 +244,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
goto confused;
}
- if (first_hole != blocks_per_page)
+ if (first_hole != blocks_per_folio)
goto confused; /* hole -> non-hole */
/* Contiguous blocks? */
@@ -260,7 +257,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
if (relative_block == nblocks) {
clear_buffer_mapped(map_bh);
break;
- } else if (page_block == blocks_per_page)
+ } else if (page_block == blocks_per_folio)
break;
page_block++;
block_in_file++;
@@ -268,8 +265,8 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
bdev = map_bh->b_bdev;
}
- if (first_hole != blocks_per_page) {
- folio_zero_segment(folio, first_hole << blkbits, PAGE_SIZE);
+ if (first_hole != blocks_per_folio) {
+ folio_zero_segment(folio, first_hole << blkbits, folio_size(folio));
if (first_hole == 0) {
folio_mark_uptodate(folio);
folio_unlock(folio);
@@ -303,10 +300,10 @@ alloc_new:
relative_block = block_in_file - args->first_logical_block;
nblocks = map_bh->b_size >> blkbits;
if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
- (first_hole != blocks_per_page))
+ (first_hole != blocks_per_folio))
args->bio = mpage_bio_submit_read(args->bio);
else
- args->last_block_in_bio = first_block + blocks_per_page - 1;
+ args->last_block_in_bio = first_block + blocks_per_folio - 1;
out:
return args->bio;
@@ -385,7 +382,7 @@ int mpage_read_folio(struct folio *folio, get_block_t get_block)
{
struct mpage_readpage_args args = {
.folio = folio,
- .nr_pages = 1,
+ .nr_pages = folio_nr_pages(folio),
.get_block = get_block,
};
@@ -456,12 +453,12 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
struct address_space *mapping = folio->mapping;
struct inode *inode = mapping->host;
const unsigned blkbits = inode->i_blkbits;
- const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
+ const unsigned blocks_per_folio = folio_size(folio) >> blkbits;
sector_t last_block;
sector_t block_in_file;
sector_t first_block;
unsigned page_block;
- unsigned first_unmapped = blocks_per_page;
+ unsigned first_unmapped = blocks_per_folio;
struct block_device *bdev = NULL;
int boundary = 0;
sector_t boundary_block = 0;
@@ -486,12 +483,12 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
*/
if (buffer_dirty(bh))
goto confused;
- if (first_unmapped == blocks_per_page)
+ if (first_unmapped == blocks_per_folio)
first_unmapped = page_block;
continue;
}
- if (first_unmapped != blocks_per_page)
+ if (first_unmapped != blocks_per_folio)
goto confused; /* hole -> non-hole */
if (!buffer_dirty(bh) || !buffer_uptodate(bh))
@@ -527,7 +524,7 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
* The page has no buffers: map it to disk
*/
BUG_ON(!folio_test_uptodate(folio));
- block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits);
+ block_in_file = folio_pos(folio) >> blkbits;
/*
* Whole page beyond EOF? Skip allocating blocks to avoid leaking
* space.
@@ -536,7 +533,7 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
goto page_is_mapped;
last_block = (i_size - 1) >> blkbits;
map_bh.b_folio = folio;
- for (page_block = 0; page_block < blocks_per_page; ) {
+ for (page_block = 0; page_block < blocks_per_folio; ) {
map_bh.b_state = 0;
map_bh.b_size = 1 << blkbits;
@@ -618,14 +615,14 @@ alloc_new:
BUG_ON(folio_test_writeback(folio));
folio_start_writeback(folio);
folio_unlock(folio);
- if (boundary || (first_unmapped != blocks_per_page)) {
+ if (boundary || (first_unmapped != blocks_per_folio)) {
bio = mpage_bio_submit_write(bio);
if (boundary_block) {
write_boundary_block(boundary_bdev,
boundary_block, 1 << blkbits);
}
} else {
- mpd->last_block_in_bio = first_block + blocks_per_page - 1;
+ mpd->last_block_in_bio = first_block + blocks_per_folio - 1;
}
goto out;
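The folio-based arithmetic above reduces to the old page-based values for
order-0 folios and scales naturally for large folios; e.g. with 1 KiB blocks
(blkbits == 10) a 16 KiB folio yields:

	const unsigned blkbits = inode->i_blkbits;	/* 10 */
	const unsigned blocks_per_folio = folio_size(folio) >> blkbits; /* 16 */
	sector_t block_in_file = folio_pos(folio) >> blkbits; /* file-relative */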
diff --git a/fs/namei.c b/fs/namei.c
index ecb7b95c2ca3..360a86ca1f02 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -125,6 +125,13 @@
#define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname))
+static inline void initname(struct filename *name)
+{
+ name->uptr = NULL;
+ name->aname = NULL;
+ atomic_set(&name->refcnt, 1);
+}
+
struct filename *
getname_flags(const char __user *filename, int flags)
{
@@ -203,10 +210,7 @@ getname_flags(const char __user *filename, int flags)
return ERR_PTR(-ENAMETOOLONG);
}
}
-
- atomic_set(&result->refcnt, 1);
- result->uptr = filename;
- result->aname = NULL;
+ initname(result);
audit_getname(result);
return result;
}
@@ -218,11 +222,6 @@ struct filename *getname_uflags(const char __user *filename, int uflags)
return getname_flags(filename, flags);
}
-struct filename *getname(const char __user * filename)
-{
- return getname_flags(filename, 0);
-}
-
struct filename *__getname_maybe_null(const char __user *pathname)
{
struct filename *name;
@@ -269,25 +268,27 @@ struct filename *getname_kernel(const char * filename)
return ERR_PTR(-ENAMETOOLONG);
}
memcpy((char *)result->name, filename, len);
- result->uptr = NULL;
- result->aname = NULL;
- atomic_set(&result->refcnt, 1);
+ initname(result);
audit_getname(result);
-
return result;
}
EXPORT_SYMBOL(getname_kernel);
void putname(struct filename *name)
{
+ int refcnt;
+
if (IS_ERR_OR_NULL(name))
return;
- if (WARN_ON_ONCE(!atomic_read(&name->refcnt)))
- return;
+ refcnt = atomic_read(&name->refcnt);
+ if (refcnt != 1) {
+ if (WARN_ON_ONCE(!refcnt))
+ return;
- if (!atomic_dec_and_test(&name->refcnt))
- return;
+ if (!atomic_dec_and_test(&name->refcnt))
+ return;
+ }
if (name->name != name->iname) {
__putname(name->name);
@@ -1670,6 +1671,8 @@ static struct dentry *lookup_dcache(const struct qstr *name,
* dentries - as a matter of fact, this only gets called
* when the directory is guaranteed to have no in-lookup children
* at all.
+ * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed.
+ * Will return -EEXIST if name is found and LOOKUP_EXCL was passed.
*/
struct dentry *lookup_one_qstr_excl(const struct qstr *name,
struct dentry *base,
@@ -1680,7 +1683,7 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name,
struct inode *dir = base->d_inode;
if (dentry)
- return dentry;
+ goto found;
/* Don't create child dentry for a dead directory. */
if (unlikely(IS_DEADDIR(dir)))
@@ -1695,6 +1698,17 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name,
dput(dentry);
dentry = old;
}
+found:
+ if (IS_ERR(dentry))
+ return dentry;
+ if (d_is_negative(dentry) && !(flags & LOOKUP_CREATE)) {
+ dput(dentry);
+ return ERR_PTR(-ENOENT);
+ }
+ if (d_is_positive(dentry) && (flags & LOOKUP_EXCL)) {
+ dput(dentry);
+ return ERR_PTR(-EEXIST);
+ }
return dentry;
}
EXPORT_SYMBOL(lookup_one_qstr_excl);
@@ -2863,15 +2877,14 @@ static int lookup_one_common(struct mnt_idmap *idmap,
* Note that this routine is purely a helper for filesystem usage and should
* not be called by generic code.
*
- * The caller must hold base->i_mutex.
+ * No locks need be held - only a counted reference to @base is needed.
+ *
*/
struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len)
{
struct qstr this;
int err;
- WARN_ON_ONCE(!inode_is_locked(base->d_inode));
-
err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this);
if (err)
return ERR_PTR(err);
@@ -3415,6 +3428,8 @@ static int may_open(struct mnt_idmap *idmap, const struct path *path,
if ((acc_mode & MAY_EXEC) && path_noexec(path))
return -EACCES;
break;
+ default:
+ VFS_BUG_ON_INODE(1, inode);
}
error = inode_permission(idmap, inode, MAY_OPEN | acc_mode);
@@ -3995,7 +4010,7 @@ static struct file *path_openat(struct nameidata *nd,
WARN_ON(1);
error = -EINVAL;
}
- fput(file);
+ fput_close(file);
if (error == -EOPENSTALE) {
if (flags & LOOKUP_RCU)
error = -ECHILD;
@@ -4078,27 +4093,13 @@ static struct dentry *filename_create(int dfd, struct filename *name,
* '/', and a directory wasn't requested.
*/
if (last.name[last.len] && !want_dir)
- create_flags = 0;
+ create_flags &= ~LOOKUP_CREATE;
inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
dentry = lookup_one_qstr_excl(&last, path->dentry,
reval_flag | create_flags);
if (IS_ERR(dentry))
goto unlock;
- error = -EEXIST;
- if (d_is_positive(dentry))
- goto fail;
-
- /*
- * Special case - lookup gave negative, but... we had foo/bar/
- * From the vfs_mknod() POV we just have a negative dentry -
- * all is fine. Let's be bastards - you had / on the end, you've
- * been asking for (non-existent) directory. -ENOENT for you.
- */
- if (unlikely(!create_flags)) {
- error = -ENOENT;
- goto fail;
- }
if (unlikely(err2)) {
error = err2;
goto fail;
@@ -4129,7 +4130,8 @@ EXPORT_SYMBOL(kern_path_create);
void done_path_create(struct path *path, struct dentry *dentry)
{
- dput(dentry);
+ if (!IS_ERR(dentry))
+ dput(dentry);
inode_unlock(path->dentry->d_inode);
mnt_drop_write(path->mnt);
path_put(path);
@@ -4275,7 +4277,7 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d
}
/**
- * vfs_mkdir - create directory
+ * vfs_mkdir - create directory returning correct dentry if possible
* @idmap: idmap of the mount the inode was found from
* @dir: inode of the parent directory
* @dentry: dentry of the child directory
@@ -4288,32 +4290,51 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d
* care to map the inode according to @idmap before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
* raw inode simply pass @nop_mnt_idmap.
+ *
+ * If the filesystem does not use @dentry but leaves it negative, or unhashes
+ * it and possibly splices in a different dentry which it returns, the
+ * original dentry is dput() and the alternate is returned.
+ *
+ * In case of an error the dentry is dput() and an ERR_PTR() is returned.
*/
-int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int error;
unsigned max_links = dir->i_sb->s_max_links;
+ struct dentry *de;
error = may_create(idmap, dir, dentry);
if (error)
- return error;
+ goto err;
+ error = -EPERM;
if (!dir->i_op->mkdir)
- return -EPERM;
+ goto err;
mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO | S_ISVTX, 0);
error = security_inode_mkdir(dir, dentry, mode);
if (error)
- return error;
+ goto err;
+ error = -EMLINK;
if (max_links && dir->i_nlink >= max_links)
- return -EMLINK;
+ goto err;
- error = dir->i_op->mkdir(idmap, dir, dentry, mode);
- if (!error)
- fsnotify_mkdir(dir, dentry);
- return error;
+ de = dir->i_op->mkdir(idmap, dir, dentry, mode);
+ error = PTR_ERR(de);
+ if (IS_ERR(de))
+ goto err;
+ if (de) {
+ dput(dentry);
+ dentry = de;
+ }
+ fsnotify_mkdir(dir, dentry);
+ return dentry;
+
+err:
+ dput(dentry);
+ return ERR_PTR(error);
}
EXPORT_SYMBOL(vfs_mkdir);
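Sketch of the resulting calling convention (as used by the converted callers
below): on failure the passed-in dentry has already been dput() inside
vfs_mkdir(), and on success the returned dentry may differ from the one that
was passed in.

	dentry = vfs_mkdir(idmap, dir, dentry, mode);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);	/* original dentry already dput() */
	/* use the returned (possibly different) positive dentry */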
@@ -4333,8 +4354,10 @@ retry:
error = security_path_mkdir(&path, dentry,
mode_strip_umask(path.dentry->d_inode, mode));
if (!error) {
- error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
+ dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
dentry, mode);
+ if (IS_ERR(dentry))
+ error = PTR_ERR(dentry);
}
done_path_create(&path, dentry);
if (retry_estale(error, lookup_flags)) {
@@ -4445,10 +4468,6 @@ retry:
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto exit3;
- if (!dentry->d_inode) {
- error = -ENOENT;
- goto exit4;
- }
error = security_path_rmdir(&path, dentry);
if (error)
goto exit4;
@@ -4579,7 +4598,7 @@ retry_deleg:
if (!IS_ERR(dentry)) {
/* Why not before? Because we want correct error value */
- if (last.name[last.len] || d_is_negative(dentry))
+ if (last.name[last.len])
goto slashes;
inode = dentry->d_inode;
ihold(inode);
@@ -4613,9 +4632,7 @@ exit1:
return error;
slashes:
- if (d_is_negative(dentry))
- error = -ENOENT;
- else if (d_is_dir(dentry))
+ if (d_is_dir(dentry))
error = -EISDIR;
else
error = -ENOTDIR;
@@ -5115,7 +5132,8 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd,
struct qstr old_last, new_last;
int old_type, new_type;
struct inode *delegated_inode = NULL;
- unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
+ unsigned int lookup_flags = 0, target_flags =
+ LOOKUP_RENAME_TARGET | LOOKUP_CREATE;
bool should_retry = false;
int error = -EINVAL;
@@ -5128,6 +5146,8 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd,
if (flags & RENAME_EXCHANGE)
target_flags = 0;
+ if (flags & RENAME_NOREPLACE)
+ target_flags |= LOOKUP_EXCL;
retry:
error = filename_parentat(olddfd, from, lookup_flags, &old_path,
@@ -5169,23 +5189,12 @@ retry_deleg:
error = PTR_ERR(old_dentry);
if (IS_ERR(old_dentry))
goto exit3;
- /* source must exist */
- error = -ENOENT;
- if (d_is_negative(old_dentry))
- goto exit4;
new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry,
lookup_flags | target_flags);
error = PTR_ERR(new_dentry);
if (IS_ERR(new_dentry))
goto exit4;
- error = -EEXIST;
- if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
- goto exit5;
if (flags & RENAME_EXCHANGE) {
- error = -ENOENT;
- if (d_is_negative(new_dentry))
- goto exit5;
-
if (!d_is_dir(new_dentry)) {
error = -ENOTDIR;
if (new_last.name[new_last.len])
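With -ENOENT and -EEXIST folded into lookup_one_qstr_excl(), the rmdir,
unlink and rename paths above drop their open-coded d_is_negative() and
d_is_positive() checks; a caller now reads roughly:

	dentry = lookup_one_qstr_excl(&last, parent, lookup_flags);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);	/* includes -ENOENT / -EEXIST */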
diff --git a/fs/namespace.c b/fs/namespace.c
index 8f1000f9f3df..6100e5b962a6 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -81,15 +81,23 @@ static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
static DEFINE_SEQLOCK(mnt_ns_tree_lock);
+#ifdef CONFIG_FSNOTIFY
+LIST_HEAD(notify_list); /* protected by namespace_sem */
+#endif
static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */
+enum mount_kattr_flags_t {
+ MOUNT_KATTR_RECURSE = (1 << 0),
+ MOUNT_KATTR_IDMAP_REPLACE = (1 << 1),
+};
+
struct mount_kattr {
unsigned int attr_set;
unsigned int attr_clr;
unsigned int propagation;
unsigned int lookup_flags;
- bool recurse;
+ enum mount_kattr_flags_t kflags;
struct user_namespace *mnt_userns;
struct mnt_idmap *mnt_idmap;
};
@@ -163,6 +171,7 @@ static void mnt_ns_release(struct mnt_namespace *ns)
{
/* keep alive for {list,stat}mount() */
if (refcount_dec_and_test(&ns->passive)) {
+ fsnotify_mntns_delete(ns);
put_user_ns(ns->user_ns);
kfree(ns);
}
@@ -998,6 +1007,17 @@ static inline int check_mnt(struct mount *mnt)
return mnt->mnt_ns == current->nsproxy->mnt_ns;
}
+static inline bool check_anonymous_mnt(struct mount *mnt)
+{
+ u64 seq;
+
+ if (!is_anon_ns(mnt->mnt_ns))
+ return false;
+
+ seq = mnt->mnt_ns->seq_origin;
+ return !seq || (seq == current->nsproxy->mnt_ns->seq);
+}
+
/*
* vfsmount lock must be held for write
*/
@@ -1176,6 +1196,8 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
ns->mnt_first_node = &mnt->mnt_node;
rb_link_node(&mnt->mnt_node, parent, link);
rb_insert_color(&mnt->mnt_node, &ns->mounts);
+
+ mnt_notify_add(mnt);
}
/*
@@ -1723,6 +1745,50 @@ int may_umount(struct vfsmount *mnt)
EXPORT_SYMBOL(may_umount);
+#ifdef CONFIG_FSNOTIFY
+static void mnt_notify(struct mount *p)
+{
+ if (!p->prev_ns && p->mnt_ns) {
+ fsnotify_mnt_attach(p->mnt_ns, &p->mnt);
+ } else if (p->prev_ns && !p->mnt_ns) {
+ fsnotify_mnt_detach(p->prev_ns, &p->mnt);
+ } else if (p->prev_ns == p->mnt_ns) {
+ fsnotify_mnt_move(p->mnt_ns, &p->mnt);
+ } else {
+ fsnotify_mnt_detach(p->prev_ns, &p->mnt);
+ fsnotify_mnt_attach(p->mnt_ns, &p->mnt);
+ }
+ p->prev_ns = p->mnt_ns;
+}
+
+static void notify_mnt_list(void)
+{
+ struct mount *m, *tmp;
+ /*
+	 * Notify about mounts that were added, reparented, or detached, or
+	 * that remain connected after unmount.
+ */
+ list_for_each_entry_safe(m, tmp, &notify_list, to_notify) {
+ mnt_notify(m);
+ list_del_init(&m->to_notify);
+ }
+}
+
+static bool need_notify_mnt_list(void)
+{
+ return !list_empty(&notify_list);
+}
+#else
+static void notify_mnt_list(void)
+{
+}
+
+static bool need_notify_mnt_list(void)
+{
+ return false;
+}
+#endif
+
static void namespace_unlock(void)
{
struct hlist_head head;
@@ -1733,7 +1799,18 @@ static void namespace_unlock(void)
hlist_move_list(&unmounted, &head);
list_splice_init(&ex_mountpoints, &list);
- up_write(&namespace_sem);
+ if (need_notify_mnt_list()) {
+ /*
+ * No point blocking out concurrent readers while notifications
+ * are sent. This will also allow statmount()/listmount() to run
+ * concurrently.
+ */
+ downgrade_write(&namespace_sem);
+ notify_mnt_list();
+ up_read(&namespace_sem);
+ } else {
+ up_write(&namespace_sem);
+ }
shrink_dentry_list(&list);
@@ -1846,6 +1923,19 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
change_mnt_propagation(p, MS_PRIVATE);
if (disconnect)
hlist_add_head(&p->mnt_umount, &unmounted);
+
+ /*
+		 * At this point p->mnt_ns is NULL, so a notification will be
+		 * queued only if
+ *
+ * - p->prev_ns is non-NULL *and*
+ * - p->prev_ns->n_fsnotify_marks is non-NULL
+ *
+ * This will preclude queuing the mount if this is a cleanup
+ * after a failed copy_tree() or destruction of an anonymous
+ * namespace, etc.
+ */
+ mnt_notify_add(p);
}
}
@@ -2026,6 +2116,7 @@ static void warn_mandlock(void)
static int can_umount(const struct path *path, int flags)
{
struct mount *mnt = real_mount(path->mnt);
+ struct super_block *sb = path->dentry->d_sb;
if (!may_mount())
return -EPERM;
@@ -2035,7 +2126,7 @@ static int can_umount(const struct path *path, int flags)
return -EINVAL;
if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */
return -EINVAL;
- if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
+ if (flags & MNT_FORCE && !ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
return -EPERM;
return 0;
}
@@ -2145,16 +2236,24 @@ struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool pr
}
}
+struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry)
+{
+ if (!is_mnt_ns_file(dentry))
+ return NULL;
+
+ return to_mnt_ns(get_proc_ns(dentry->d_inode));
+}
+
static bool mnt_ns_loop(struct dentry *dentry)
{
/* Could bind mounting the mount namespace inode cause a
* mount namespace loop?
*/
- struct mnt_namespace *mnt_ns;
- if (!is_mnt_ns_file(dentry))
+ struct mnt_namespace *mnt_ns = mnt_ns_from_dentry(dentry);
+
+ if (!mnt_ns)
return false;
- mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode));
return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
}
@@ -2246,22 +2345,75 @@ struct vfsmount *collect_mounts(const struct path *path)
static void free_mnt_ns(struct mnt_namespace *);
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);
+static inline bool must_dissolve(struct mnt_namespace *mnt_ns)
+{
+ /*
+ * This mount belonged to an anonymous mount namespace
+ * but was moved to a non-anonymous mount namespace and
+ * then unmounted.
+ */
+ if (unlikely(!mnt_ns))
+ return false;
+
+ /*
+ * This mount belongs to a non-anonymous mount namespace
+ * and we know that such a mount can never transition to
+ * an anonymous mount namespace again.
+ */
+ if (!is_anon_ns(mnt_ns)) {
+ /*
+ * A detached mount either belongs to an anonymous mount
+ * namespace or a non-anonymous mount namespace. It
+ * should never belong to something purely internal.
+ */
+ VFS_WARN_ON_ONCE(mnt_ns == MNT_NS_INTERNAL);
+ return false;
+ }
+
+ return true;
+}
+
void dissolve_on_fput(struct vfsmount *mnt)
{
struct mnt_namespace *ns;
- namespace_lock();
- lock_mount_hash();
- ns = real_mount(mnt)->mnt_ns;
- if (ns) {
- if (is_anon_ns(ns))
- umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
- else
- ns = NULL;
+ struct mount *m = real_mount(mnt);
+
+ scoped_guard(rcu) {
+ if (!must_dissolve(READ_ONCE(m->mnt_ns)))
+ return;
}
- unlock_mount_hash();
- namespace_unlock();
- if (ns)
- free_mnt_ns(ns);
+
+ scoped_guard(rwsem_write, &namespace_sem) {
+ ns = m->mnt_ns;
+ if (!must_dissolve(ns))
+ return;
+
+ /*
+ * After must_dissolve() we know that this is a detached
+ * mount in an anonymous mount namespace.
+ *
+ * Now when mnt_has_parent() reports that this mount
+ * tree has a parent, we know that this anonymous mount
+ * tree has been moved to another anonymous mount
+ * namespace.
+ *
+ * So when closing this file we cannot unmount the mount
+		 * tree. That will be done when the file referring to
+		 * the root of the anonymous mount namespace is
+		 * closed (it could already be closed, but it would then
+		 * sync on @namespace_sem and wait for us to finish).
+ */
+ if (mnt_has_parent(m))
+ return;
+
+ lock_mount_hash();
+ umount_tree(m, UMOUNT_CONNECTED);
+ unlock_mount_hash();
+ }
+
+ /* Make sure we notice when we leak mounts. */
+ VFS_WARN_ON_ONCE(!mnt_ns_empty(ns));
+ free_mnt_ns(ns);
}
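dissolve_on_fput() above relies on the <linux/cleanup.h> guards: a
scoped_guard() takes the lock on entry to the block and releases it on any
exit path, including early returns. A sketch (some_condition is illustrative):

	scoped_guard(rwsem_read, &namespace_sem) {
		if (!some_condition)
			return;	/* the rwsem is dropped automatically */
		/* ... critical section under namespace_sem ... */
	}			/* and dropped here on normal exit */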
void drop_collected_mounts(struct vfsmount *mnt)
@@ -2287,6 +2439,28 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry)
return false;
}
+/*
+ * Check that there aren't references to earlier/same mount namespaces in the
+ * specified subtree. Such references can act as pins for mount namespaces
+ * that aren't checked by the mount-cycle checking code, thereby allowing
+ * cycles to be made.
+ */
+static bool check_for_nsfs_mounts(struct mount *subtree)
+{
+ struct mount *p;
+ bool ret = false;
+
+ lock_mount_hash();
+ for (p = subtree; p; p = next_mnt(p, subtree))
+ if (mnt_ns_loop(p->mnt.mnt_root))
+ goto out;
+
+ ret = true;
+out:
+ unlock_mount_hash();
+ return ret;
+}
+
/**
* clone_private_mount - create a private clone of a path
* @path: path to clone
@@ -2295,6 +2469,8 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry)
* will not be attached anywhere in the namespace and will be private (i.e.
* changes to the originating mount won't be propagated into this).
*
+ * This assumes the caller has called or done the equivalent of may_mount().
+ *
* Release with mntput().
*/
struct vfsmount *clone_private_mount(const struct path *path)
@@ -2302,30 +2478,36 @@ struct vfsmount *clone_private_mount(const struct path *path)
struct mount *old_mnt = real_mount(path->mnt);
struct mount *new_mnt;
- down_read(&namespace_sem);
+ scoped_guard(rwsem_read, &namespace_sem)
if (IS_MNT_UNBINDABLE(old_mnt))
- goto invalid;
+ return ERR_PTR(-EINVAL);
- if (!check_mnt(old_mnt))
- goto invalid;
+ if (mnt_has_parent(old_mnt)) {
+ if (!check_mnt(old_mnt))
+ return ERR_PTR(-EINVAL);
+ } else {
+ if (!is_mounted(&old_mnt->mnt))
+ return ERR_PTR(-EINVAL);
+
+ /* Make sure this isn't something purely kernel internal. */
+ if (!is_anon_ns(old_mnt->mnt_ns))
+ return ERR_PTR(-EINVAL);
+
+ /* Make sure we don't create mount namespace loops. */
+ if (!check_for_nsfs_mounts(old_mnt))
+ return ERR_PTR(-EINVAL);
+ }
if (has_locked_children(old_mnt, path->dentry))
- goto invalid;
+ return ERR_PTR(-EINVAL);
new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
- up_read(&namespace_sem);
-
if (IS_ERR(new_mnt))
- return ERR_CAST(new_mnt);
+ return ERR_PTR(-EINVAL);
/* Longterm mount to be removed by kern_unmount*() */
new_mnt->mnt_ns = MNT_NS_INTERNAL;
-
return &new_mnt->mnt;
-
-invalid:
- up_read(&namespace_sem);
- return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL_GPL(clone_private_mount);
@@ -2424,6 +2606,7 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
enum mnt_tree_flags_t {
MNT_TREE_MOVE = BIT(0),
MNT_TREE_BENEATH = BIT(1),
+ MNT_TREE_PROPAGATION = BIT(2),
};
/**
@@ -2547,6 +2730,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
dest_mp = smp;
unhash_mnt(source_mnt);
attach_mnt(source_mnt, top_mnt, dest_mp, beneath);
+ mnt_notify_add(source_mnt);
touch_mnt_namespace(source_mnt->mnt_ns);
} else {
if (source_mnt->mnt_ns) {
@@ -2773,6 +2957,71 @@ static int do_change_type(struct path *path, int ms_flags)
return err;
}
+/* may_copy_tree() - check if a mount tree can be copied
+ * @path: path to the mount tree to be copied
+ *
+ * This helper checks if the caller may copy the mount tree starting
+ * from @path->mnt. The caller may copy the mount tree under the
+ * following circumstances:
+ *
+ * (1) The caller is located in the mount namespace of the mount tree.
+ * This also implies that the mount does not belong to an anonymous
+ * mount namespace.
+ * (2) The caller tries to copy an nsfs mount referring to a mount
+ * namespace, i.e., the caller is trying to copy a mount namespace
+ * entry from nsfs.
+ * (3) The caller tries to copy a pidfs mount referring to a pidfd.
+ * (4) The caller is trying to copy a mount tree that belongs to an
+ * anonymous mount namespace.
+ *
+ * For that to be safe, this helper enforces that the origin mount
+ * namespace the anonymous mount namespace was created from is the
+ * same as the caller's mount namespace by comparing the sequence
+ * numbers.
+ *
+ * This is not strictly necessary. The current semantics of the new
+ * mount api enforce that the caller must be located in the same
+ * mount namespace as the mount tree it interacts with. Using the
+ * origin sequence number preserves these semantics even for
+ * anonymous mount namespaces. However, one could envision extending
+ * the api to operate directly across mount namespaces if needed.
+ *
+ * The ownership of a non-anonymous mount namespace such as the
+ * caller's cannot change.
+ * => We know that the caller's mount namespace is stable.
+ *
+ * If the origin sequence number of the anonymous mount namespace is
+ * the same as the sequence number of the caller's mount namespace,
+ * => The owning namespaces are the same.
+ *
+ * ==> The earlier capability check on the owning namespace of the
+ * caller's mount namespace ensures that the caller has the
+ * ability to copy the mount tree.
+ *
+ * Returns true if the mount tree can be copied, false otherwise.
+ */
+static inline bool may_copy_tree(struct path *path)
+{
+ struct mount *mnt = real_mount(path->mnt);
+ const struct dentry_operations *d_op;
+
+ if (check_mnt(mnt))
+ return true;
+
+ d_op = path->dentry->d_op;
+ if (d_op == &ns_dentry_operations)
+ return true;
+
+ if (d_op == &pidfs_dentry_operations)
+ return true;
+
+ if (!is_mounted(path->mnt))
+ return false;
+
+ return check_anonymous_mnt(mnt);
+}
+
static struct mount *__do_loopback(struct path *old_path, int recurse)
{
struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);
@@ -2780,13 +3029,8 @@ static struct mount *__do_loopback(struct path *old_path, int recurse)
if (IS_MNT_UNBINDABLE(old))
return mnt;
- if (!check_mnt(old)) {
- const struct dentry_operations *d_op = old_path->dentry->d_op;
-
- if (d_op != &ns_dentry_operations &&
- d_op != &pidfs_dentry_operations)
- return mnt;
- }
+ if (!may_copy_tree(old_path))
+ return mnt;
if (!recurse && has_locked_children(old, old_path->dentry))
return mnt;
@@ -2853,15 +3097,30 @@ out:
static struct file *open_detached_copy(struct path *path, bool recursive)
{
- struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
- struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true);
+ struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns;
+ struct user_namespace *user_ns = mnt_ns->user_ns;
struct mount *mnt, *p;
struct file *file;
+ ns = alloc_mnt_ns(user_ns, true);
if (IS_ERR(ns))
return ERR_CAST(ns);
namespace_lock();
+
+ /*
+ * Record the sequence number of the source mount namespace.
+ * This needs to hold namespace_sem to ensure that the mount
+ * doesn't get attached.
+ */
+ if (is_mounted(path->mnt)) {
+ src_mnt_ns = real_mount(path->mnt)->mnt_ns;
+ if (is_anon_ns(src_mnt_ns))
+ ns->seq_origin = src_mnt_ns->seq_origin;
+ else
+ ns->seq_origin = src_mnt_ns->seq;
+ }
+
mnt = __do_loopback(path, recursive);
if (IS_ERR(mnt)) {
namespace_unlock();
@@ -2889,24 +3148,22 @@ static struct file *open_detached_copy(struct path *path, bool recursive)
return file;
}
-SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
+static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned int flags)
{
- struct file *file;
- struct path path;
+ int ret;
+ struct path path __free(path_put) = {};
int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
bool detached = flags & OPEN_TREE_CLONE;
- int error;
- int fd;
BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
OPEN_TREE_CLOEXEC))
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
if (flags & AT_NO_AUTOMOUNT)
lookup_flags &= ~LOOKUP_AUTOMOUNT;
@@ -2916,27 +3173,32 @@ SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, fl
lookup_flags |= LOOKUP_EMPTY;
if (detached && !may_mount())
- return -EPERM;
+ return ERR_PTR(-EPERM);
+
+ ret = user_path_at(dfd, filename, lookup_flags, &path);
+ if (unlikely(ret))
+ return ERR_PTR(ret);
+
+ if (detached)
+ return open_detached_copy(&path, flags & AT_RECURSIVE);
+
+ return dentry_open(&path, O_PATH, current_cred());
+}
+
+SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
+{
+ int fd;
+ struct file *file __free(fput) = NULL;
+
+ file = vfs_open_tree(dfd, filename, flags);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
fd = get_unused_fd_flags(flags & O_CLOEXEC);
if (fd < 0)
return fd;
- error = user_path_at(dfd, filename, lookup_flags, &path);
- if (unlikely(error)) {
- file = ERR_PTR(error);
- } else {
- if (detached)
- file = open_detached_copy(&path, flags & AT_RECURSIVE);
- else
- file = dentry_open(&path, O_PATH, current_cred());
- path_put(&path);
- }
- if (IS_ERR(file)) {
- put_unused_fd(fd);
- return PTR_ERR(file);
- }
- fd_install(fd, file);
+ fd_install(fd, no_free_ptr(file));
return fd;
}
@@ -3123,28 +3385,6 @@ static inline int tree_contains_unbindable(struct mount *mnt)
return 0;
}
-/*
- * Check that there aren't references to earlier/same mount namespaces in the
- * specified subtree. Such references can act as pins for mount namespaces
- * that aren't checked by the mount-cycle checking code, thereby allowing
- * cycles to be made.
- */
-static bool check_for_nsfs_mounts(struct mount *subtree)
-{
- struct mount *p;
- bool ret = false;
-
- lock_mount_hash();
- for (p = subtree; p; p = next_mnt(p, subtree))
- if (mnt_ns_loop(p->mnt.mnt_root))
- goto out;
-
- ret = true;
-out:
- unlock_mount_hash();
- return ret;
-}
-
static int do_set_group(struct path *from_path, struct path *to_path)
{
struct mount *from, *to;
@@ -3320,8 +3560,56 @@ static int can_move_mount_beneath(const struct path *from,
return 0;
}
-static int do_move_mount(struct path *old_path, struct path *new_path,
- bool beneath)
+/* may_use_mount() - check if a mount tree can be used
+ * @mnt: vfsmount to be used
+ *
+ * This helper checks if the caller may use the mount tree starting
+ * from @mnt. The caller may use the mount tree under the
+ * following circumstances:
+ *
+ * (1) The caller is located in the mount namespace of the mount tree.
+ * This also implies that the mount does not belong to an anonymous
+ * mount namespace.
+ * (2) The caller is trying to use a mount tree that belongs to an
+ * anonymous mount namespace.
+ *
+ * For that to be safe, this helper enforces that the origin mount
+ * namespace the anonymous mount namespace was created from is the
+ * same as the caller's mount namespace by comparing the sequence
+ * numbers.
+ *
+ * The ownership of a non-anonymous mount namespace such as the
+ * caller's cannot change.
+ * => We know that the caller's mount namespace is stable.
+ *
+ * If the origin sequence number of the anonymous mount namespace is
+ * the same as the sequence number of the caller's mount namespace,
+ * => The owning namespaces are the same.
+ *
+ * ==> The earlier capability check on the owning namespace of the
+ * caller's mount namespace ensures that the caller has the
+ * ability to use the mount tree.
+ *
+ * Returns true if the mount tree can be used, false otherwise.
+ */
+static inline bool may_use_mount(struct mount *mnt)
+{
+ if (check_mnt(mnt))
+ return true;
+
+ /*
+ * Make sure that no one unmounted the target path or somehow
+ * managed to get their hands on something purely kernel
+ * internal.
+ */
+ if (!is_mounted(&mnt->mnt))
+ return false;
+
+ return check_anonymous_mnt(mnt);
+}
+
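A hedged sketch of the flow may_use_mount() permits (illustration, not part of this patch): moving a detached tree into place via move_mount(2). The fallback flag value is an assumption from the uapi headers.

/* Illustration only: attach a detached tree at a target path. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef MOVE_MOUNT_F_EMPTY_PATH
#define MOVE_MOUNT_F_EMPTY_PATH	0x00000004
#endif

/* from_fd comes from fsmount() or open_tree(OPEN_TREE_CLONE); the empty
 * "" source path plus MOVE_MOUNT_F_EMPTY_PATH selects the fd itself. */
static int attach_tree(int from_fd, const char *where)
{
	return syscall(SYS_move_mount, from_fd, "",
		       AT_FDCWD, where, MOVE_MOUNT_F_EMPTY_PATH);
}

With this series, `where` may also resolve inside another detached tree owned by the caller, which the bare check_mnt() test used to reject.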
+static int do_move_mount(struct path *old_path,
+ struct path *new_path, enum mnt_tree_flags_t flags)
{
struct mnt_namespace *ns;
struct mount *p;
@@ -3329,8 +3617,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path,
struct mount *parent;
struct mountpoint *mp, *old_mp;
int err;
- bool attached;
- enum mnt_tree_flags_t flags = 0;
+ bool attached, beneath = flags & MNT_TREE_BENEATH;
mp = do_lock_mount(new_path, beneath);
if (IS_ERR(mp))
@@ -3346,8 +3633,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path,
ns = old->mnt_ns;
err = -EINVAL;
- /* The mountpoint must be in our namespace. */
- if (!check_mnt(p))
+ if (!may_use_mount(p))
goto out;
/* The thing moved must be mounted... */
@@ -3358,6 +3644,32 @@ static int do_move_mount(struct path *old_path, struct path *new_path,
if (!(attached ? check_mnt(old) : is_anon_ns(ns)))
goto out;
+ if (is_anon_ns(ns)) {
+ /*
+ * Ending up with two files referring to the root of the
+ * same anonymous mount namespace would cause an error
+ * as this would mean trying to move the same mount
+ * twice into the mount tree which would be rejected
+ * later. But be explicit about it right here.
+ */
+ if (is_anon_ns(p->mnt_ns) && ns == p->mnt_ns)
+ goto out;
+
+ /*
+ * If this is an anonymous mount tree ensure that mount
+ * propagation can detect mounts that were just
+ * propagated to the target mount tree so we don't
+ * propagate onto them.
+ */
+ ns->mntns_flags |= MNTNS_PROPAGATING;
+ } else if (is_anon_ns(p->mnt_ns)) {
+ /*
+ * Don't allow moving an attached mount tree to an
+ * anonymous mount tree.
+ */
+ goto out;
+ }
+
if (old->mnt.mnt_flags & MNT_LOCKED)
goto out;
@@ -3400,6 +3712,9 @@ static int do_move_mount(struct path *old_path, struct path *new_path,
if (err)
goto out;
+ if (is_anon_ns(ns))
+ ns->mntns_flags &= ~MNTNS_PROPAGATING;
+
/* if the mount is moved, it should no longer expire
* automatically */
list_del_init(&old->mnt_expire);
@@ -3408,10 +3723,13 @@ static int do_move_mount(struct path *old_path, struct path *new_path,
out:
unlock_mount(mp);
if (!err) {
- if (attached)
+ if (attached) {
mntput_no_expire(parent);
- else
+ } else {
+ /* Make sure we notice when we leak mounts. */
+ VFS_WARN_ON_ONCE(!mnt_ns_empty(ns));
free_mnt_ns(ns);
+ }
}
return err;
}
@@ -3428,7 +3746,7 @@ static int do_move_mount_old(struct path *path, const char *old_name)
if (err)
return err;
- err = do_move_mount(&old_path, path, false);
+ err = do_move_mount(&old_path, path, 0);
path_put(&old_path);
return err;
}
@@ -4269,6 +4587,21 @@ err_unlock:
return ret;
}
+static inline int vfs_move_mount(struct path *from_path, struct path *to_path,
+ enum mnt_tree_flags_t mflags)
+{
+ int ret;
+
+ ret = security_move_mount(from_path, to_path);
+ if (ret)
+ return ret;
+
+ if (mflags & MNT_TREE_PROPAGATION)
+ return do_set_group(from_path, to_path);
+
+ return do_move_mount(from_path, to_path, mflags);
+}
+
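vfs_move_mount() above dispatches MNT_TREE_PROPAGATION to do_set_group(); from userspace that corresponds to MOVE_MOUNT_SET_GROUP, sketched here (illustration only; the fallback flag value is an assumption from the uapi headers):

/* Join /mnt/b to /mnt/a's propagation (peer) group; nothing is moved. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef MOVE_MOUNT_SET_GROUP
#define MOVE_MOUNT_SET_GROUP	0x00000100
#endif

int main(void)
{
	return syscall(SYS_move_mount, AT_FDCWD, "/mnt/a",
		       AT_FDCWD, "/mnt/b", MOVE_MOUNT_SET_GROUP) != 0;
}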
/*
* Move a mount from one place to another. In combination with
* fsopen()/fsmount() this is used to install a new mount and in combination
@@ -4282,8 +4615,12 @@ SYSCALL_DEFINE5(move_mount,
int, to_dfd, const char __user *, to_pathname,
unsigned int, flags)
{
- struct path from_path, to_path;
- unsigned int lflags;
+ struct path to_path __free(path_put) = {};
+ struct path from_path __free(path_put) = {};
+ struct filename *to_name __free(putname) = NULL;
+ struct filename *from_name __free(putname) = NULL;
+ unsigned int lflags, uflags;
+ enum mnt_tree_flags_t mflags = 0;
int ret = 0;
if (!may_mount())
@@ -4296,43 +4633,53 @@ SYSCALL_DEFINE5(move_mount,
(MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP))
return -EINVAL;
- /* If someone gives a pathname, they aren't permitted to move
- * from an fd that requires unmount as we can't get at the flag
- * to clear it afterwards.
- */
+ if (flags & MOVE_MOUNT_SET_GROUP) mflags |= MNT_TREE_PROPAGATION;
+ if (flags & MOVE_MOUNT_BENEATH) mflags |= MNT_TREE_BENEATH;
+
lflags = 0;
if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW;
if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT;
- if (flags & MOVE_MOUNT_F_EMPTY_PATH) lflags |= LOOKUP_EMPTY;
-
- ret = user_path_at(from_dfd, from_pathname, lflags, &from_path);
- if (ret < 0)
- return ret;
+ uflags = 0;
+ if (flags & MOVE_MOUNT_F_EMPTY_PATH) uflags = AT_EMPTY_PATH;
+ from_name = getname_maybe_null(from_pathname, uflags);
+ if (IS_ERR(from_name))
+ return PTR_ERR(from_name);
lflags = 0;
if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW;
if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT;
- if (flags & MOVE_MOUNT_T_EMPTY_PATH) lflags |= LOOKUP_EMPTY;
+ uflags = 0;
+ if (flags & MOVE_MOUNT_T_EMPTY_PATH) uflags = AT_EMPTY_PATH;
+ to_name = getname_maybe_null(to_pathname, uflags);
+ if (IS_ERR(to_name))
+ return PTR_ERR(to_name);
+
+ if (!to_name && to_dfd >= 0) {
+ CLASS(fd_raw, f_to)(to_dfd);
+ if (fd_empty(f_to))
+ return -EBADF;
+
+ to_path = fd_file(f_to)->f_path;
+ path_get(&to_path);
+ } else {
+ ret = filename_lookup(to_dfd, to_name, lflags, &to_path, NULL);
+ if (ret)
+ return ret;
+ }
- ret = user_path_at(to_dfd, to_pathname, lflags, &to_path);
- if (ret < 0)
- goto out_from;
+ if (!from_name && from_dfd >= 0) {
+ CLASS(fd_raw, f_from)(from_dfd);
+ if (fd_empty(f_from))
+ return -EBADF;
- ret = security_move_mount(&from_path, &to_path);
- if (ret < 0)
- goto out_to;
+ return vfs_move_mount(&fd_file(f_from)->f_path, &to_path, mflags);
+ }
- if (flags & MOVE_MOUNT_SET_GROUP)
- ret = do_set_group(&from_path, &to_path);
- else
- ret = do_move_mount(&from_path, &to_path,
- (flags & MOVE_MOUNT_BENEATH));
+ ret = filename_lookup(from_dfd, from_name, lflags, &from_path, NULL);
+ if (ret)
+ return ret;
-out_to:
- path_put(&to_path);
-out_from:
- path_put(&from_path);
- return ret;
+ return vfs_move_mount(&from_path, &to_path, mflags);
}
/*
@@ -4468,6 +4815,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
list_del_init(&new_mnt->mnt_expire);
put_mountpoint(root_mp);
unlock_mount_hash();
+ mnt_notify_add(root_mnt);
+ mnt_notify_add(new_mnt);
chroot_fs_refs(&root, &new);
error = 0;
out4:
@@ -4512,11 +4861,10 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
return -EINVAL;
/*
- * Once a mount has been idmapped we don't allow it to change its
- * mapping. It makes things simpler and callers can just create
- * another bind-mount they can idmap if they want to.
+ * We only allow a mount to change its idmapping if it has
+ * never been accessible to userspace.
*/
- if (is_idmapped_mnt(m))
+ if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE) && is_idmapped_mnt(m))
return -EPERM;
/* The underlying filesystem doesn't support idmapped mounts yet. */
@@ -4576,7 +4924,7 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
break;
}
- if (!kattr->recurse)
+ if (!(kattr->kflags & MOUNT_KATTR_RECURSE))
return 0;
}
@@ -4606,18 +4954,16 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
{
+ struct mnt_idmap *old_idmap;
+
if (!kattr->mnt_idmap)
return;
- /*
- * Pairs with smp_load_acquire() in mnt_idmap().
- *
- * Since we only allow a mount to change the idmapping once and
- * verified this in can_idmap_mount() we know that the mount has
- * @nop_mnt_idmap attached to it. So there's no need to drop any
- * references.
- */
+ old_idmap = mnt_idmap(&mnt->mnt);
+
+ /* Pairs with smp_load_acquire() in mnt_idmap(). */
smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap));
+ mnt_idmap_put(old_idmap);
}
static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
@@ -4637,7 +4983,7 @@ static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
if (kattr->propagation)
change_mnt_propagation(m, kattr->propagation);
- if (!kattr->recurse)
+ if (!(kattr->kflags & MOUNT_KATTR_RECURSE))
break;
}
touch_mnt_namespace(mnt->mnt_ns);
@@ -4667,7 +5013,7 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
*/
namespace_lock();
if (kattr->propagation == MS_SHARED) {
- err = invent_group_ids(mnt, kattr->recurse);
+ err = invent_group_ids(mnt, kattr->kflags & MOUNT_KATTR_RECURSE);
if (err) {
namespace_unlock();
return err;
@@ -4718,7 +5064,7 @@ out:
}
static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
- struct mount_kattr *kattr, unsigned int flags)
+ struct mount_kattr *kattr)
{
struct ns_common *ns;
struct user_namespace *mnt_userns;
@@ -4726,13 +5072,23 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP))
return 0;
- /*
- * We currently do not support clearing an idmapped mount. If this ever
- * is a use-case we can revisit this but for now let's keep it simple
- * and not allow it.
- */
- if (attr->attr_clr & MOUNT_ATTR_IDMAP)
- return -EINVAL;
+ if (attr->attr_clr & MOUNT_ATTR_IDMAP) {
+ /*
+ * We can only remove an idmapping if it's never been
+ * exposed to userspace.
+ */
+ if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE))
+ return -EINVAL;
+
+ /*
+ * Removal of idmappings is equivalent to setting
+ * nop_mnt_idmap.
+ */
+ if (!(attr->attr_set & MOUNT_ATTR_IDMAP)) {
+ kattr->mnt_idmap = &nop_mnt_idmap;
+ return 0;
+ }
+ }
if (attr->userns_fd > INT_MAX)
return -EINVAL;
@@ -4769,22 +5125,8 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
}
static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
- struct mount_kattr *kattr, unsigned int flags)
+ struct mount_kattr *kattr)
{
- unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
-
- if (flags & AT_NO_AUTOMOUNT)
- lookup_flags &= ~LOOKUP_AUTOMOUNT;
- if (flags & AT_SYMLINK_NOFOLLOW)
- lookup_flags &= ~LOOKUP_FOLLOW;
- if (flags & AT_EMPTY_PATH)
- lookup_flags |= LOOKUP_EMPTY;
-
- *kattr = (struct mount_kattr) {
- .lookup_flags = lookup_flags,
- .recurse = !!(flags & AT_RECURSIVE),
- };
-
if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS)
return -EINVAL;
if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1)
@@ -4832,35 +5174,28 @@ static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
return -EINVAL;
}
- return build_mount_idmapped(attr, usize, kattr, flags);
+ return build_mount_idmapped(attr, usize, kattr);
}
static void finish_mount_kattr(struct mount_kattr *kattr)
{
- put_user_ns(kattr->mnt_userns);
- kattr->mnt_userns = NULL;
+ if (kattr->mnt_userns) {
+ put_user_ns(kattr->mnt_userns);
+ kattr->mnt_userns = NULL;
+ }
if (kattr->mnt_idmap)
mnt_idmap_put(kattr->mnt_idmap);
}
-SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
- unsigned int, flags, struct mount_attr __user *, uattr,
- size_t, usize)
+static int copy_mount_setattr(struct mount_attr __user *uattr, size_t usize,
+ struct mount_kattr *kattr)
{
- int err;
- struct path target;
+ int ret;
struct mount_attr attr;
- struct mount_kattr kattr;
BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0);
- if (flags & ~(AT_EMPTY_PATH |
- AT_RECURSIVE |
- AT_SYMLINK_NOFOLLOW |
- AT_NO_AUTOMOUNT))
- return -EINVAL;
-
if (unlikely(usize > PAGE_SIZE))
return -E2BIG;
if (unlikely(usize < MOUNT_ATTR_SIZE_VER0))
@@ -4869,9 +5204,9 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
if (!may_mount())
return -EPERM;
- err = copy_struct_from_user(&attr, sizeof(attr), uattr, usize);
- if (err)
- return err;
+ ret = copy_struct_from_user(&attr, sizeof(attr), uattr, usize);
+ if (ret)
+ return ret;
/* Don't bother walking through the mounts if this is a nop. */
if (attr.attr_set == 0 &&
@@ -4879,7 +5214,39 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
attr.propagation == 0)
return 0;
- err = build_mount_kattr(&attr, usize, &kattr, flags);
+ return build_mount_kattr(&attr, usize, kattr);
+}
+
+SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
+ unsigned int, flags, struct mount_attr __user *, uattr,
+ size_t, usize)
+{
+ int err;
+ struct path target;
+ struct mount_kattr kattr;
+ unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
+
+ if (flags & ~(AT_EMPTY_PATH |
+ AT_RECURSIVE |
+ AT_SYMLINK_NOFOLLOW |
+ AT_NO_AUTOMOUNT))
+ return -EINVAL;
+
+ if (flags & AT_NO_AUTOMOUNT)
+ lookup_flags &= ~LOOKUP_AUTOMOUNT;
+ if (flags & AT_SYMLINK_NOFOLLOW)
+ lookup_flags &= ~LOOKUP_FOLLOW;
+ if (flags & AT_EMPTY_PATH)
+ lookup_flags |= LOOKUP_EMPTY;
+
+ kattr = (struct mount_kattr) {
+ .lookup_flags = lookup_flags,
+ };
+
+ if (flags & AT_RECURSIVE)
+ kattr.kflags |= MOUNT_KATTR_RECURSE;
+
+ err = copy_mount_setattr(uattr, usize, &kattr);
if (err)
return err;
@@ -4892,6 +5259,47 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
return err;
}
+SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename,
+ unsigned, flags, struct mount_attr __user *, uattr,
+ size_t, usize)
+{
+ struct file __free(fput) *file = NULL;
+ int fd;
+
+ if (!uattr && usize)
+ return -EINVAL;
+
+ file = vfs_open_tree(dfd, filename, flags);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ if (uattr) {
+ int ret;
+ struct mount_kattr kattr = {};
+
+ kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE;
+ if (flags & AT_RECURSIVE)
+ kattr.kflags |= MOUNT_KATTR_RECURSE;
+
+ ret = copy_mount_setattr(uattr, usize, &kattr);
+ if (ret)
+ return ret;
+
+ ret = do_mount_setattr(&file->f_path, &kattr);
+ if (ret)
+ return ret;
+
+ finish_mount_kattr(&kattr);
+ }
+
+ fd = get_unused_fd_flags(flags & O_CLOEXEC);
+ if (fd < 0)
+ return fd;
+
+ fd_install(fd, no_free_ptr(file));
+ return fd;
+}
+
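A hedged userspace sketch of the new syscall (the number 467 matches this series on x86-64 but is an assumption; check your unistd headers). It clones a tree detached and applies an idmapping in one step, relying on the MOUNT_KATTR_IDMAP_REPLACE semantics set up above:

/* Illustration only: detached, idmapped clone via open_tree_attr(). */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/mount.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_open_tree_attr
#define __NR_open_tree_attr	467	/* assumed: x86-64, this series */
#endif
#ifndef AT_RECURSIVE
#define AT_RECURSIVE		0x8000
#endif

static int open_idmapped_tree(const char *path, int userns_fd)
{
	struct mount_attr attr = {
		.attr_set	= MOUNT_ATTR_IDMAP,
		.userns_fd	= userns_fd,
	};

	return syscall(__NR_open_tree_attr, AT_FDCWD, path,
		       OPEN_TREE_CLONE | AT_RECURSIVE,
		       &attr, sizeof(attr));
}

Because the resulting tree has never been visible to userspace, a later call on the same fd may replace or clear the idmapping (attr_clr = MOUNT_ATTR_IDMAP) before the tree is attached with move_mount().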
int show_path(struct seq_file *m, struct dentry *root)
{
if (root->d_sb->s_op->show_path)
@@ -4915,6 +5323,7 @@ struct kstatmount {
struct statmount __user *buf;
size_t bufsize;
struct vfsmount *mnt;
+ struct mnt_idmap *idmap;
u64 mask;
struct path root;
struct statmount sm;
@@ -5184,6 +5593,46 @@ static int statmount_opt_sec_array(struct kstatmount *s, struct seq_file *seq)
return 0;
}
+static inline int statmount_mnt_uidmap(struct kstatmount *s, struct seq_file *seq)
+{
+ int ret;
+
+ ret = statmount_mnt_idmap(s->idmap, seq, true);
+ if (ret < 0)
+ return ret;
+
+ s->sm.mnt_uidmap_num = ret;
+ /*
+ * Always raise STATMOUNT_MNT_UIDMAP even if there are no valid
+ * mappings. This allows userspace to distinguish between a
+ * non-idmapped mount and an idmapped mount where none of the
+ * individual mappings are valid in the caller's idmapping.
+ */
+ if (is_valid_mnt_idmap(s->idmap))
+ s->sm.mask |= STATMOUNT_MNT_UIDMAP;
+ return 0;
+}
+
+static inline int statmount_mnt_gidmap(struct kstatmount *s, struct seq_file *seq)
+{
+ int ret;
+
+ ret = statmount_mnt_idmap(s->idmap, seq, false);
+ if (ret < 0)
+ return ret;
+
+ s->sm.mnt_gidmap_num = ret;
+ /*
+ * Always raise STATMOUNT_MNT_GIDMAP even if there are no valid
+ * mappings. This allows userspace to distinguish between a
+ * non-idmapped mount and an idmapped mount where none of the
+ * individual mappings are valid in the caller's idmapping.
+ */
+ if (is_valid_mnt_idmap(s->idmap))
+ s->sm.mask |= STATMOUNT_MNT_GIDMAP;
+ return 0;
+}
+
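A hedged sketch of consuming one of the two new fields (illustration only; it needs uapi headers from this series for the mnt_uidmap members, the fallback mask value is an assumption, and obtaining mnt_id, e.g. via statx() with STATX_MNT_ID_UNIQUE, is out of scope):

/* Illustration only: print a mount's uid map strings via statmount(). */
#define _GNU_SOURCE
#include <linux/mount.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef STATMOUNT_MNT_UIDMAP
#define STATMOUNT_MNT_UIDMAP	0x00002000ULL	/* assumed from this series */
#endif

static void print_uidmap(uint64_t mnt_id)
{
	char buf[4096] __attribute__((aligned(8)));
	struct statmount *sm = (struct statmount *)buf;
	struct mnt_id_req req = {
		.size	= MNT_ID_REQ_SIZE_VER0,
		.mnt_id	= mnt_id,
		.param	= STATMOUNT_MNT_UIDMAP,
	};

	if (syscall(SYS_statmount, &req, sm, sizeof(buf), 0) < 0) {
		perror("statmount");
		return;
	}

	/* mnt_uidmap points into the string area; mnt_uidmap_num counts
	 * the NUL-terminated mapping strings that follow each other. */
	if (sm->mask & STATMOUNT_MNT_UIDMAP) {
		const char *s = sm->str + sm->mnt_uidmap;

		for (uint32_t i = 0; i < sm->mnt_uidmap_num; i++) {
			printf("uidmap: %s\n", s);
			s += strlen(s) + 1;
		}
	}
}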
static int statmount_string(struct kstatmount *s, u64 flag)
{
int ret = 0;
@@ -5231,6 +5680,14 @@ static int statmount_string(struct kstatmount *s, u64 flag)
offp = &sm->sb_source;
ret = statmount_sb_source(s, seq);
break;
+ case STATMOUNT_MNT_UIDMAP:
+ sm->mnt_uidmap = start;
+ ret = statmount_mnt_uidmap(s, seq);
+ break;
+ case STATMOUNT_MNT_GIDMAP:
+ sm->mnt_gidmap = start;
+ ret = statmount_mnt_gidmap(s, seq);
+ break;
default:
WARN_ON_ONCE(true);
return -EINVAL;
@@ -5306,7 +5763,7 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
* We have to find the first mount in our ns and use that, however it
* may not exist, so handle that properly.
*/
- if (RB_EMPTY_ROOT(&ns->mounts))
+ if (mnt_ns_empty(ns))
return -ENOENT;
first = child = ns->root;
@@ -5323,6 +5780,21 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
return 0;
}
+/* This must be updated whenever a new flag is added */
+#define STATMOUNT_SUPPORTED (STATMOUNT_SB_BASIC | \
+ STATMOUNT_MNT_BASIC | \
+ STATMOUNT_PROPAGATE_FROM | \
+ STATMOUNT_MNT_ROOT | \
+ STATMOUNT_MNT_POINT | \
+ STATMOUNT_FS_TYPE | \
+ STATMOUNT_MNT_NS_ID | \
+ STATMOUNT_MNT_OPTS | \
+ STATMOUNT_FS_SUBTYPE | \
+ STATMOUNT_SB_SOURCE | \
+ STATMOUNT_OPT_ARRAY | \
+ STATMOUNT_OPT_SEC_ARRAY | \
+ STATMOUNT_SUPPORTED_MASK)
+
static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
struct mnt_namespace *ns)
{
@@ -5331,7 +5803,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
int err;
/* Has the namespace already been emptied? */
- if (mnt_ns_id && RB_EMPTY_ROOT(&ns->mounts))
+ if (mnt_ns_id && mnt_ns_empty(ns))
return -ENOENT;
s->mnt = lookup_mnt_in_ns(mnt_id, ns);
@@ -5356,6 +5828,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
return err;
s->root = root;
+ s->idmap = mnt_idmap(s->mnt);
if (s->mask & STATMOUNT_SB_BASIC)
statmount_sb_basic(s);
@@ -5389,12 +5862,26 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
if (!err && s->mask & STATMOUNT_SB_SOURCE)
err = statmount_string(s, STATMOUNT_SB_SOURCE);
+ if (!err && s->mask & STATMOUNT_MNT_UIDMAP)
+ err = statmount_string(s, STATMOUNT_MNT_UIDMAP);
+
+ if (!err && s->mask & STATMOUNT_MNT_GIDMAP)
+ err = statmount_string(s, STATMOUNT_MNT_GIDMAP);
+
if (!err && s->mask & STATMOUNT_MNT_NS_ID)
statmount_mnt_ns_id(s, ns);
+ if (!err && s->mask & STATMOUNT_SUPPORTED_MASK) {
+ s->sm.mask |= STATMOUNT_SUPPORTED_MASK;
+ s->sm.supported_mask = STATMOUNT_SUPPORTED;
+ }
+
if (err)
return err;
+ /* Are there bits in the return mask not present in STATMOUNT_SUPPORTED? */
+ WARN_ON_ONCE(~STATMOUNT_SUPPORTED & s->sm.mask);
+
return 0;
}
@@ -5412,7 +5899,8 @@ static inline bool retry_statmount(const long ret, size_t *seq_size)
#define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \
STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \
STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \
- STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY)
+ STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY | \
+ STATMOUNT_MNT_UIDMAP | STATMOUNT_MNT_GIDMAP)
static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
struct statmount __user *buf, size_t bufsize,
diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c
index 0bf3c2f5a710..5e3f0aeb51f3 100644
--- a/fs/netfs/direct_read.c
+++ b/fs/netfs/direct_read.c
@@ -125,9 +125,9 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq)
* Perform a read to an application buffer, bypassing the pagecache and the
* local disk cache.
*/
-static int netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync)
+static ssize_t netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync)
{
- int ret;
+ ssize_t ret;
_enter("R=%x %llx-%llx",
rreq->debug_id, rreq->start, rreq->start + rreq->len - 1);
@@ -155,7 +155,7 @@ static int netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync)
else
ret = -EIOCBQUEUED;
out:
- _leave(" = %d", ret);
+ _leave(" = %zd", ret);
return ret;
}
diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c
index 636cc5a98ef5..23c75755ad4e 100644
--- a/fs/netfs/read_collect.c
+++ b/fs/netfs/read_collect.c
@@ -682,14 +682,16 @@ void netfs_wait_for_pause(struct netfs_io_request *rreq)
trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue);
prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE);
- subreq = list_first_entry_or_null(&stream->subrequests,
- struct netfs_io_subrequest, rreq_link);
- if (subreq &&
- (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags) ||
- test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) {
- __set_current_state(TASK_RUNNING);
- netfs_read_collection(rreq);
- continue;
+ if (!test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) {
+ subreq = list_first_entry_or_null(&stream->subrequests,
+ struct netfs_io_subrequest, rreq_link);
+ if (subreq &&
+ (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags) ||
+ test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) {
+ __set_current_state(TASK_RUNNING);
+ netfs_read_collection(rreq);
+ continue;
+ }
}
if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags) ||
diff --git a/fs/netfs/rolling_buffer.c b/fs/netfs/rolling_buffer.c
index 75d97af14b4a..207b6a326651 100644
--- a/fs/netfs/rolling_buffer.c
+++ b/fs/netfs/rolling_buffer.c
@@ -146,10 +146,6 @@ ssize_t rolling_buffer_load_from_ra(struct rolling_buffer *roll,
/* Store the counter after setting the slot. */
smp_store_release(&roll->next_head_slot, to);
-
- for (; ix < folioq_nr_slots(fq); ix++)
- folioq_clear(fq, ix);
-
return size;
}
diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c
index 294f67795f79..3fca59e6475d 100644
--- a/fs/netfs/write_collect.c
+++ b/fs/netfs/write_collect.c
@@ -400,7 +400,8 @@ void netfs_write_collection_worker(struct work_struct *work)
trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);
if (wreq->io_streams[1].active &&
- wreq->io_streams[1].failed) {
+ wreq->io_streams[1].failed &&
+ ictx->ops->invalidate_cache) {
/* Cache write failure doesn't prevent writeback completion
* unless we're in disconnected mode.
*/
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 2b04038b0e40..bc957487f6ec 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1532,7 +1532,8 @@ static int nfs_is_exclusive_create(struct inode *dir, unsigned int flags)
{
if (NFS_PROTO(dir)->version == 2)
return 0;
- return flags & LOOKUP_EXCL;
+ return (flags & (LOOKUP_CREATE | LOOKUP_EXCL)) ==
+ (LOOKUP_CREATE | LOOKUP_EXCL);
}
/*
@@ -2421,11 +2422,11 @@ EXPORT_SYMBOL_GPL(nfs_mknod);
/*
* See comments for nfs_proc_create regarding failed operations.
*/
-int nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+struct dentry *nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct iattr attr;
- int error;
+ struct dentry *ret;
dfprintk(VFS, "NFS: mkdir(%s/%lu), %pd\n",
dir->i_sb->s_id, dir->i_ino, dentry);
@@ -2434,14 +2435,9 @@ int nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
attr.ia_mode = mode | S_IFDIR;
trace_nfs_mkdir_enter(dir, dentry);
- error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr);
- trace_nfs_mkdir_exit(dir, dentry, error);
- if (error != 0)
- goto out_err;
- return 0;
-out_err:
- d_drop(dentry);
- return error;
+ ret = NFS_PROTO(dir)->mkdir(dir, dentry, &attr);
+ trace_nfs_mkdir_exit(dir, dentry, PTR_ERR_OR_ZERO(ret));
+ return ret;
}
EXPORT_SYMBOL_GPL(nfs_mkdir);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index fae2c7ae4acc..1ac1d3eec517 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -400,8 +400,8 @@ struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int);
void nfs_d_prune_case_insensitive_aliases(struct inode *inode);
int nfs_create(struct mnt_idmap *, struct inode *, struct dentry *,
umode_t, bool);
-int nfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *,
- umode_t);
+struct dentry *nfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *,
+ umode_t);
int nfs_rmdir(struct inode *, struct dentry *);
int nfs_unlink(struct inode *, struct dentry *);
int nfs_symlink(struct mnt_idmap *, struct inode *, struct dentry *,
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 0c3bc98cd999..755ed3c37051 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -578,13 +578,13 @@ out:
return status;
}
-static int
+static struct dentry *
nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
{
struct posix_acl *default_acl, *acl;
struct nfs3_createdata *data;
- struct dentry *d_alias;
- int status = -ENOMEM;
+ struct dentry *ret = ERR_PTR(-ENOMEM);
+ int status;
dprintk("NFS call mkdir %pd\n", dentry);
@@ -592,8 +592,9 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
if (data == NULL)
goto out;
- status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl);
- if (status)
+ ret = ERR_PTR(posix_acl_create(dir, &sattr->ia_mode,
+ &default_acl, &acl));
+ if (IS_ERR(ret))
goto out;
data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR];
@@ -602,25 +603,27 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
data->arg.mkdir.len = dentry->d_name.len;
data->arg.mkdir.sattr = sattr;
- d_alias = nfs3_do_create(dir, dentry, data);
- status = PTR_ERR_OR_ZERO(d_alias);
+ ret = nfs3_do_create(dir, dentry, data);
- if (status != 0)
+ if (IS_ERR(ret))
goto out_release_acls;
- if (d_alias)
- dentry = d_alias;
+ if (ret)
+ dentry = ret;
status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl);
+ if (status) {
+ dput(ret);
+ ret = ERR_PTR(status);
+ }
- dput(d_alias);
out_release_acls:
posix_acl_release(acl);
posix_acl_release(default_acl);
out:
nfs3_free_createdata(data);
- dprintk("NFS reply mkdir: %d\n", status);
- return status;
+ dprintk("NFS reply mkdir: %d\n", PTR_ERR_OR_ZERO(ret));
+ return ret;
}
static int
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 6e95db6c17e9..70c8ea943019 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3154,9 +3154,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
if (d_really_is_negative(dentry)) {
struct dentry *alias;
d_drop(dentry);
- alias = d_exact_alias(dentry, state->inode);
- if (!alias)
- alias = d_splice_alias(igrab(state->inode), dentry);
+ alias = d_splice_alias(igrab(state->inode), dentry);
/* d_splice_alias() can't fail here - it's a non-directory */
if (alias) {
dput(ctx->dentry);
@@ -5139,9 +5137,6 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_
&data->arg.seq_args, &data->res.seq_res, 1);
if (status == 0) {
spin_lock(&dir->i_lock);
- /* Creating a directory bumps nlink in the parent */
- if (data->arg.ftype == NF4DIR)
- nfs4_inc_nlink_locked(dir);
nfs4_update_changeattr_locked(dir, &data->res.dir_cinfo,
data->res.fattr->time_start,
NFS_INO_INVALID_DATA);
@@ -5151,6 +5146,25 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_
return status;
}
+static struct dentry *nfs4_do_mkdir(struct inode *dir, struct dentry *dentry,
+ struct nfs4_createdata *data)
+{
+ int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg,
+ &data->arg.seq_args, &data->res.seq_res, 1);
+
+ if (status)
+ return ERR_PTR(status);
+
+ spin_lock(&dir->i_lock);
+ /* Creating a directory bumps nlink in the parent */
+ nfs4_inc_nlink_locked(dir);
+ nfs4_update_changeattr_locked(dir, &data->res.dir_cinfo,
+ data->res.fattr->time_start,
+ NFS_INO_INVALID_DATA);
+ spin_unlock(&dir->i_lock);
+ return nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr);
+}
+
static void nfs4_free_createdata(struct nfs4_createdata *data)
{
nfs4_label_free(data->fattr.label);
@@ -5207,32 +5221,34 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
return err;
}
-static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
- struct iattr *sattr, struct nfs4_label *label)
+static struct dentry *_nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
+ struct iattr *sattr,
+ struct nfs4_label *label)
{
struct nfs4_createdata *data;
- int status = -ENOMEM;
+ struct dentry *ret = ERR_PTR(-ENOMEM);
data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR);
if (data == NULL)
goto out;
data->arg.label = label;
- status = nfs4_do_create(dir, dentry, data);
+ ret = nfs4_do_mkdir(dir, dentry, data);
nfs4_free_createdata(data);
out:
- return status;
+ return ret;
}
-static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
- struct iattr *sattr)
+static struct dentry *nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
+ struct iattr *sattr)
{
struct nfs_server *server = NFS_SERVER(dir);
struct nfs4_exception exception = {
.interruptible = true,
};
struct nfs4_label l, *label;
+ struct dentry *alias;
int err;
label = nfs4_label_init_security(dir, dentry, sattr, &l);
@@ -5240,14 +5256,15 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK))
sattr->ia_mode &= ~current_umask();
do {
- err = _nfs4_proc_mkdir(dir, dentry, sattr, label);
+ alias = _nfs4_proc_mkdir(dir, dentry, sattr, label);
+ err = PTR_ERR_OR_ZERO(alias);
trace_nfs4_mkdir(dir, &dentry->d_name, err);
err = nfs4_handle_exception(NFS_SERVER(dir), err,
&exception);
} while (exception.retry);
nfs4_label_release_security(label);
- return err;
+ return alias;
}
static int _nfs4_proc_readdir(struct nfs_readdir_arg *nr_arg,
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 77920a2e3cef..63e71310b9f6 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -446,13 +446,14 @@ out:
return status;
}
-static int
+static struct dentry *
nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
{
struct nfs_createdata *data;
struct rpc_message msg = {
.rpc_proc = &nfs_procedures[NFSPROC_MKDIR],
};
+ struct dentry *alias = NULL;
int status = -ENOMEM;
dprintk("NFS call mkdir %pd\n", dentry);
@@ -464,12 +465,15 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
nfs_mark_for_revalidate(dir);
- if (status == 0)
- status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ if (status == 0) {
+ alias = nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr);
+ status = PTR_ERR_OR_ZERO(alias);
+ } else {
+ alias = ERR_PTR(status);
+ }
nfs_free_createdata(data);
out:
dprintk("NFS reply mkdir: %d\n", status);
- return status;
+ return alias;
}
static int
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 28f4d5311c40..c1d9bd07285f 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -233,9 +233,12 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
* as well be forgiving and just succeed silently.
*/
goto out_put;
- status = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, S_IRWXU);
+ dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, S_IRWXU);
+ if (IS_ERR(dentry))
+ status = PTR_ERR(dentry);
out_put:
- dput(dentry);
+ if (!status)
+ dput(dentry);
out_unlock:
inode_unlock(d_inode(dir));
if (status == 0) {
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 29cb7b812d71..34d7aa531662 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1461,7 +1461,7 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct inode *dirp;
struct iattr *iap = attrs->na_iattr;
__be32 err;
- int host_err;
+ int host_err = 0;
dentry = fhp->fh_dentry;
dirp = d_inode(dentry);
@@ -1488,28 +1488,15 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
nfsd_check_ignore_resizing(iap);
break;
case S_IFDIR:
- host_err = vfs_mkdir(&nop_mnt_idmap, dirp, dchild, iap->ia_mode);
- if (!host_err && unlikely(d_unhashed(dchild))) {
- struct dentry *d;
- d = lookup_one_len(dchild->d_name.name,
- dchild->d_parent,
- dchild->d_name.len);
- if (IS_ERR(d)) {
- host_err = PTR_ERR(d);
- break;
- }
- if (unlikely(d_is_negative(d))) {
- dput(d);
- err = nfserr_serverfault;
- goto out;
- }
+ dchild = vfs_mkdir(&nop_mnt_idmap, dirp, dchild, iap->ia_mode);
+ if (IS_ERR(dchild)) {
+ host_err = PTR_ERR(dchild);
+ } else if (d_is_negative(dchild)) {
+ err = nfserr_serverfault;
+ goto out;
+ } else if (unlikely(dchild != resfhp->fh_dentry)) {
dput(resfhp->fh_dentry);
- resfhp->fh_dentry = dget(d);
- err = fh_update(resfhp);
- dput(dchild);
- dchild = d;
- if (err)
- goto out;
+ resfhp->fh_dentry = dget(dchild);
}
break;
case S_IFCHR:
@@ -1530,7 +1517,8 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
err = nfsd_create_setattr(rqstp, fhp, resfhp, attrs);
out:
- dput(dchild);
+ if (!IS_ERR(dchild))
+ dput(dchild);
return err;
out_nfserr:
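For orientation, the converted vfs_mkdir() contract that the nfsd and nfs hunks above rely on can be summarized with a hedged sketch (the caller name is hypothetical; this is not the fs/namei.c source): the dentry passed in is consumed, and the returned dentry, which may be a different alias chosen by the filesystem, is the caller's reference to drop.

/* Sketch only: how a caller now uses the dentry-returning vfs_mkdir(). */
static int example_mkdir_caller(struct mnt_idmap *idmap, struct inode *dir,
				struct dentry *dentry, umode_t mode)
{
	struct dentry *de;

	de = vfs_mkdir(idmap, dir, dentry, mode);
	if (IS_ERR(de))
		return PTR_ERR(de);	/* no dput() needed on error */

	/* ... work with d_inode(de), which may differ from dentry ... */
	dput(de);
	return 0;
}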
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 953fbd5f0851..40f4b1a28705 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -218,8 +218,8 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
return err;
}
-static int nilfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *nilfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
struct nilfs_transaction_info ti;
@@ -227,7 +227,7 @@ static int nilfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
if (err)
- return err;
+ return ERR_PTR(err);
inc_nlink(dir);
@@ -258,7 +258,7 @@ out:
else
nilfs_transaction_abort(dir->i_sb);
- return err;
+ return ERR_PTR(err);
out_fail:
drop_nlink(inode);
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 95646f7c46ca..6d386080faf2 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -166,6 +166,8 @@ static bool fanotify_should_merge(struct fanotify_event *old,
case FANOTIFY_EVENT_TYPE_FS_ERROR:
return fanotify_error_event_equal(FANOTIFY_EE(old),
FANOTIFY_EE(new));
+ case FANOTIFY_EVENT_TYPE_MNT:
+ return false;
default:
WARN_ON_ONCE(1);
}
@@ -312,7 +314,10 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n",
__func__, iter_info->report_mask, event_mask, data, data_type);
- if (!fid_mode) {
+ if (FAN_GROUP_FLAG(group, FAN_REPORT_MNT)) {
+ if (data_type != FSNOTIFY_EVENT_MNT)
+ return 0;
+ } else if (!fid_mode) {
/* Do we have path to open a file descriptor? */
if (!path)
return 0;
@@ -557,6 +562,20 @@ static struct fanotify_event *fanotify_alloc_path_event(const struct path *path,
return &pevent->fae;
}
+static struct fanotify_event *fanotify_alloc_mnt_event(u64 mnt_id, gfp_t gfp)
+{
+ struct fanotify_mnt_event *pevent;
+
+ pevent = kmem_cache_alloc(fanotify_mnt_event_cachep, gfp);
+ if (!pevent)
+ return NULL;
+
+ pevent->fae.type = FANOTIFY_EVENT_TYPE_MNT;
+ pevent->mnt_id = mnt_id;
+
+ return &pevent->fae;
+}
+
static struct fanotify_event *fanotify_alloc_perm_event(const void *data,
int data_type,
gfp_t gfp)
@@ -731,6 +750,7 @@ static struct fanotify_event *fanotify_alloc_event(
fid_mode);
struct inode *dirid = fanotify_dfid_inode(mask, data, data_type, dir);
const struct path *path = fsnotify_data_path(data, data_type);
+ u64 mnt_id = fsnotify_data_mnt_id(data, data_type);
struct mem_cgroup *old_memcg;
struct dentry *moved = NULL;
struct inode *child = NULL;
@@ -826,8 +846,12 @@ static struct fanotify_event *fanotify_alloc_event(
moved, &hash, gfp);
} else if (fid_mode) {
event = fanotify_alloc_fid_event(id, fsid, &hash, gfp);
- } else {
+ } else if (path) {
event = fanotify_alloc_path_event(path, &hash, gfp);
+ } else if (mnt_id) {
+ event = fanotify_alloc_mnt_event(mnt_id, gfp);
+ } else {
+ WARN_ON_ONCE(1);
}
if (!event)
@@ -927,7 +951,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
BUILD_BUG_ON(FAN_RENAME != FS_RENAME);
BUILD_BUG_ON(FAN_PRE_ACCESS != FS_PRE_ACCESS);
- BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 22);
+ BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 24);
mask = fanotify_group_event_mask(group, iter_info, &match_mask,
mask, data, data_type, dir);
@@ -1028,6 +1052,11 @@ static void fanotify_free_error_event(struct fsnotify_group *group,
mempool_free(fee, &group->fanotify_data.error_events_pool);
}
+static void fanotify_free_mnt_event(struct fanotify_event *event)
+{
+ kmem_cache_free(fanotify_mnt_event_cachep, FANOTIFY_ME(event));
+}
+
static void fanotify_free_event(struct fsnotify_group *group,
struct fsnotify_event *fsn_event)
{
@@ -1054,6 +1083,9 @@ static void fanotify_free_event(struct fsnotify_group *group,
case FANOTIFY_EVENT_TYPE_FS_ERROR:
fanotify_free_error_event(group, event);
break;
+ case FANOTIFY_EVENT_TYPE_MNT:
+ fanotify_free_mnt_event(event);
+ break;
default:
WARN_ON_ONCE(1);
}
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index c12cbc270539..b44e70e44be6 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -9,6 +9,7 @@ extern struct kmem_cache *fanotify_mark_cache;
extern struct kmem_cache *fanotify_fid_event_cachep;
extern struct kmem_cache *fanotify_path_event_cachep;
extern struct kmem_cache *fanotify_perm_event_cachep;
+extern struct kmem_cache *fanotify_mnt_event_cachep;
/* Possible states of the permission event */
enum {
@@ -244,6 +245,7 @@ enum fanotify_event_type {
FANOTIFY_EVENT_TYPE_PATH_PERM,
FANOTIFY_EVENT_TYPE_OVERFLOW, /* struct fanotify_event */
FANOTIFY_EVENT_TYPE_FS_ERROR, /* struct fanotify_error_event */
+ FANOTIFY_EVENT_TYPE_MNT,
__FANOTIFY_EVENT_TYPE_NUM
};
@@ -409,12 +411,23 @@ struct fanotify_path_event {
struct path path;
};
+struct fanotify_mnt_event {
+ struct fanotify_event fae;
+ u64 mnt_id;
+};
+
static inline struct fanotify_path_event *
FANOTIFY_PE(struct fanotify_event *event)
{
return container_of(event, struct fanotify_path_event, fae);
}
+static inline struct fanotify_mnt_event *
+FANOTIFY_ME(struct fanotify_event *event)
+{
+ return container_of(event, struct fanotify_mnt_event, fae);
+}
+
/*
* Structure for permission fanotify events. It gets allocated and freed in
* fanotify_handle_event() since we wait there for user response. When the
@@ -466,6 +479,11 @@ static inline bool fanotify_is_error_event(u32 mask)
return mask & FAN_FS_ERROR;
}
+static inline bool fanotify_is_mnt_event(u32 mask)
+{
+ return mask & (FAN_MNT_ATTACH | FAN_MNT_DETACH);
+}
+
static inline const struct path *fanotify_event_path(struct fanotify_event *event)
{
if (event->type == FANOTIFY_EVENT_TYPE_PATH)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index ba3e2d09eb44..f2d840ae4ded 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -113,6 +113,7 @@ struct kmem_cache *fanotify_mark_cache __ro_after_init;
struct kmem_cache *fanotify_fid_event_cachep __ro_after_init;
struct kmem_cache *fanotify_path_event_cachep __ro_after_init;
struct kmem_cache *fanotify_perm_event_cachep __ro_after_init;
+struct kmem_cache *fanotify_mnt_event_cachep __ro_after_init;
#define FANOTIFY_EVENT_ALIGN 4
#define FANOTIFY_FID_INFO_HDR_LEN \
@@ -123,6 +124,8 @@ struct kmem_cache *fanotify_perm_event_cachep __ro_after_init;
(sizeof(struct fanotify_event_info_error))
#define FANOTIFY_RANGE_INFO_LEN \
(sizeof(struct fanotify_event_info_range))
+#define FANOTIFY_MNT_INFO_LEN \
+ (sizeof(struct fanotify_event_info_mnt))
static int fanotify_fid_info_len(int fh_len, int name_len)
{
@@ -178,6 +181,8 @@ static size_t fanotify_event_len(unsigned int info_mode,
fh_len = fanotify_event_object_fh_len(event);
event_len += fanotify_fid_info_len(fh_len, dot_len);
}
+ if (fanotify_is_mnt_event(event->mask))
+ event_len += FANOTIFY_MNT_INFO_LEN;
if (info_mode & FAN_REPORT_PIDFD)
event_len += FANOTIFY_PIDFD_INFO_LEN;
@@ -405,6 +410,25 @@ static int process_access_response(struct fsnotify_group *group,
return -ENOENT;
}
+static size_t copy_mnt_info_to_user(struct fanotify_event *event,
+ char __user *buf, int count)
+{
+ struct fanotify_event_info_mnt info = { };
+
+ info.hdr.info_type = FAN_EVENT_INFO_TYPE_MNT;
+ info.hdr.len = FANOTIFY_MNT_INFO_LEN;
+
+ if (WARN_ON(count < info.hdr.len))
+ return -EFAULT;
+
+ info.mnt_id = FANOTIFY_ME(event)->mnt_id;
+
+ if (copy_to_user(buf, &info, sizeof(info)))
+ return -EFAULT;
+
+ return info.hdr.len;
+}
+
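A hedged userspace sketch of the new event class (illustration only; the fallback constant values are assumptions taken from this series' uapi headers):

/* Illustration only: watch this mount namespace for attach/detach. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/fanotify.h>
#include <unistd.h>

#ifndef FAN_REPORT_MNT
#define FAN_REPORT_MNT	0x00004000
#define FAN_MARK_MNTNS	0x00000110
#define FAN_MNT_ATTACH	0x01000000
#define FAN_MNT_DETACH	0x02000000
#endif

int main(void)
{
	int fan = fanotify_init(FAN_REPORT_MNT | FAN_CLASS_NOTIF, 0);
	int mntns = open("/proc/self/ns/mnt", O_RDONLY);

	if (fan < 0 || mntns < 0) {
		perror("setup");
		return 1;
	}

	/* A NULL pathname marks the object the fd itself refers to. */
	if (fanotify_mark(fan, FAN_MARK_ADD | FAN_MARK_MNTNS,
			  FAN_MNT_ATTACH | FAN_MNT_DETACH, mntns, NULL)) {
		perror("fanotify_mark");
		return 1;
	}

	/* Each queued event is a fanotify_event_metadata record followed
	 * by a fanotify_event_info_mnt carrying the 64-bit mnt_id. */
	pause();
	return 0;
}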
static size_t copy_error_info_to_user(struct fanotify_event *event,
char __user *buf, int count)
{
@@ -700,6 +724,15 @@ static int copy_info_records_to_user(struct fanotify_event *event,
total_bytes += ret;
}
+ if (fanotify_is_mnt_event(event->mask)) {
+ ret = copy_mnt_info_to_user(event, buf, count);
+ if (ret < 0)
+ return ret;
+ buf += ret;
+ count -= ret;
+ total_bytes += ret;
+ }
+
return total_bytes;
}
@@ -1508,6 +1541,14 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID))
return -EINVAL;
+ /* Don't allow mixing mnt events with inode events for now */
+ if (flags & FAN_REPORT_MNT) {
+ if (class != FAN_CLASS_NOTIF)
+ return -EINVAL;
+ if (flags & (FANOTIFY_FID_BITS | FAN_REPORT_FD_ERROR))
+ return -EINVAL;
+ }
+
if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
return -EINVAL;
@@ -1767,7 +1808,6 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
int dfd, const char __user *pathname)
{
struct inode *inode = NULL;
- struct vfsmount *mnt = NULL;
struct fsnotify_group *group;
struct path path;
struct fan_fsid __fsid, *fsid = NULL;
@@ -1776,7 +1816,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS;
unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS;
unsigned int obj_type, fid_mode;
- void *obj;
+ void *obj = NULL;
u32 umask = 0;
int ret;
@@ -1800,6 +1840,9 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
case FAN_MARK_FILESYSTEM:
obj_type = FSNOTIFY_OBJ_TYPE_SB;
break;
+ case FAN_MARK_MNTNS:
+ obj_type = FSNOTIFY_OBJ_TYPE_MNTNS;
+ break;
default:
return -EINVAL;
}
@@ -1847,6 +1890,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
return -EINVAL;
group = fd_file(f)->private_data;
+ /* Only report mount events on mnt namespace */
+ if (FAN_GROUP_FLAG(group, FAN_REPORT_MNT)) {
+ if (mask & ~FANOTIFY_MOUNT_EVENTS)
+ return -EINVAL;
+ if (mark_type != FAN_MARK_MNTNS)
+ return -EINVAL;
+ } else {
+ if (mask & FANOTIFY_MOUNT_EVENTS)
+ return -EINVAL;
+ if (mark_type == FAN_MARK_MNTNS)
+ return -EINVAL;
+ }
+
/*
* An unprivileged user is not allowed to setup mount nor filesystem
* marks. This also includes setting up such marks by a group that
@@ -1888,7 +1944,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
* point.
*/
fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
- if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_EVENT_FLAGS) &&
+ if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_MOUNT_EVENTS|FANOTIFY_EVENT_FLAGS) &&
(!fid_mode || mark_type == FAN_MARK_MOUNT))
return -EINVAL;
@@ -1938,17 +1994,21 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
}
/* inode held in place by reference to path; group by fget on fd */
- if (mark_type == FAN_MARK_INODE) {
+ if (obj_type == FSNOTIFY_OBJ_TYPE_INODE) {
inode = path.dentry->d_inode;
obj = inode;
- } else {
- mnt = path.mnt;
- if (mark_type == FAN_MARK_MOUNT)
- obj = mnt;
- else
- obj = mnt->mnt_sb;
+ } else if (obj_type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
+ obj = path.mnt;
+ } else if (obj_type == FSNOTIFY_OBJ_TYPE_SB) {
+ obj = path.mnt->mnt_sb;
+ } else if (obj_type == FSNOTIFY_OBJ_TYPE_MNTNS) {
+ obj = mnt_ns_from_dentry(path.dentry);
}
+ ret = -EINVAL;
+ if (!obj)
+ goto path_put_and_out;
+
/*
* If some other task has this inode open for write we should not add
* an ignore mask, unless that ignore mask is supposed to survive
@@ -1956,10 +2016,10 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
*/
if (mark_cmd == FAN_MARK_ADD && (flags & FANOTIFY_MARK_IGNORE_BITS) &&
!(flags & FAN_MARK_IGNORED_SURV_MODIFY)) {
- ret = mnt ? -EINVAL : -EISDIR;
+ ret = !inode ? -EINVAL : -EISDIR;
/* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */
if (ignore == FAN_MARK_IGNORE &&
- (mnt || S_ISDIR(inode->i_mode)))
+ (!inode || S_ISDIR(inode->i_mode)))
goto path_put_and_out;
ret = 0;
@@ -1968,7 +2028,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
}
/* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
- if (mnt || !S_ISDIR(inode->i_mode)) {
+ if (!inode || !S_ISDIR(inode->i_mode)) {
mask &= ~FAN_EVENT_ON_CHILD;
umask = FAN_EVENT_ON_CHILD;
/*
@@ -2042,7 +2102,7 @@ static int __init fanotify_user_setup(void)
FANOTIFY_DEFAULT_MAX_USER_MARKS);
BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
- BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 13);
+ BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 14);
BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11);
fanotify_mark_cache = KMEM_CACHE(fanotify_mark,
@@ -2055,6 +2115,7 @@ static int __init fanotify_user_setup(void)
fanotify_perm_event_cachep =
KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
}
+ fanotify_mnt_event_cachep = KMEM_CACHE(fanotify_mnt_event, SLAB_PANIC);
fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS;
init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] =
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index e933f9c65d90..1161eabf11ee 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -121,6 +121,11 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
seq_printf(m, "fanotify sdev:%x mflags:%x mask:%x ignored_mask:%x\n",
sb->s_dev, mflags, mark->mask, mark->ignore_mask);
+ } else if (mark->connector->type == FSNOTIFY_OBJ_TYPE_MNTNS) {
+ struct mnt_namespace *mnt_ns = fsnotify_conn_mntns(mark->connector);
+
+ seq_printf(m, "fanotify mnt_ns:%u mflags:%x mask:%x ignored_mask:%x\n",
+ mnt_ns->ns.inum, mflags, mark->mask, mark->ignore_mask);
}
}
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index fae1b6d397ea..e2b4f17a48bb 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -28,6 +28,11 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
fsnotify_clear_marks_by_mount(mnt);
}
+void __fsnotify_mntns_delete(struct mnt_namespace *mntns)
+{
+ fsnotify_clear_marks_by_mntns(mntns);
+}
+
/**
* fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
* @sb: superblock being unmounted.
@@ -420,7 +425,7 @@ static int send_to_group(__u32 mask, const void *data, int data_type,
file_name, cookie, iter_info);
}
-static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector **connp)
+static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector *const *connp)
{
struct fsnotify_mark_connector *conn;
struct hlist_node *node = NULL;
@@ -538,14 +543,15 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
{
const struct path *path = fsnotify_data_path(data, data_type);
struct super_block *sb = fsnotify_data_sb(data, data_type);
- struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);
+ const struct fsnotify_mnt *mnt_data = fsnotify_data_mnt(data, data_type);
+ struct fsnotify_sb_info *sbinfo = sb ? fsnotify_sb_info(sb) : NULL;
struct fsnotify_iter_info iter_info = {};
struct mount *mnt = NULL;
struct inode *inode2 = NULL;
struct dentry *moved;
int inode2_type;
int ret = 0;
- __u32 test_mask, marks_mask;
+ __u32 test_mask, marks_mask = 0;
if (path)
mnt = real_mount(path->mnt);
@@ -578,17 +584,20 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
if ((!sbinfo || !sbinfo->sb_marks) &&
(!mnt || !mnt->mnt_fsnotify_marks) &&
(!inode || !inode->i_fsnotify_marks) &&
- (!inode2 || !inode2->i_fsnotify_marks))
+ (!inode2 || !inode2->i_fsnotify_marks) &&
+ (!mnt_data || !mnt_data->ns->n_fsnotify_marks))
return 0;
- marks_mask = READ_ONCE(sb->s_fsnotify_mask);
+ if (sb)
+ marks_mask |= READ_ONCE(sb->s_fsnotify_mask);
if (mnt)
marks_mask |= READ_ONCE(mnt->mnt_fsnotify_mask);
if (inode)
marks_mask |= READ_ONCE(inode->i_fsnotify_mask);
if (inode2)
marks_mask |= READ_ONCE(inode2->i_fsnotify_mask);
-
+ if (mnt_data)
+ marks_mask |= READ_ONCE(mnt_data->ns->n_fsnotify_mask);
/*
* If this is a modify event we may need to clear some ignore masks.
@@ -618,6 +627,10 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
iter_info.marks[inode2_type] =
fsnotify_first_mark(&inode2->i_fsnotify_marks);
}
+ if (mnt_data) {
+ iter_info.marks[FSNOTIFY_ITER_TYPE_MNTNS] =
+ fsnotify_first_mark(&mnt_data->ns->n_fsnotify_marks);
+ }
/*
* We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark
@@ -708,11 +721,31 @@ void file_set_fsnotify_mode_from_watchers(struct file *file)
}
#endif
+void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt)
+{
+ struct fsnotify_mnt data = {
+ .ns = ns,
+ .mnt_id = real_mount(mnt)->mnt_id_unique,
+ };
+
+ if (WARN_ON_ONCE(!ns))
+ return;
+
+ /*
+ * This is an optimization as well as making sure fsnotify_init() has
+ * been called.
+ */
+ if (!ns->n_fsnotify_marks)
+ return;
+
+ fsnotify(mask, &data, FSNOTIFY_EVENT_MNT, NULL, NULL, NULL, 0);
+}
+
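The hooks that feed fsnotify_mnt() live in include/linux/fsnotify.h and are not part of this hunk; their assumed shape, for orientation (helper and flag names are taken from the rest of this series):

/* Sketch of the companion helpers called from the mount code. */
static inline void fsnotify_mnt_attach(struct mnt_namespace *ns,
				       struct vfsmount *mnt)
{
	fsnotify_mnt(FS_MNT_ATTACH, ns, mnt);
}

static inline void fsnotify_mnt_detach(struct mnt_namespace *ns,
				       struct vfsmount *mnt)
{
	fsnotify_mnt(FS_MNT_DETACH, ns, mnt);
}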
static __init int fsnotify_init(void)
{
int ret;
- BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 24);
+ BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 26);
ret = init_srcu_struct(&fsnotify_mark_srcu);
if (ret)
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 663759ed6fbc..5950c7a67f41 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -33,6 +33,12 @@ static inline struct super_block *fsnotify_conn_sb(
return conn->obj;
}
+static inline struct mnt_namespace *fsnotify_conn_mntns(
+ struct fsnotify_mark_connector *conn)
+{
+ return conn->obj;
+}
+
static inline struct super_block *fsnotify_object_sb(void *obj,
enum fsnotify_obj_type obj_type)
{
@@ -89,6 +95,11 @@ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb)
fsnotify_destroy_marks(fsnotify_sb_marks(sb));
}
+static inline void fsnotify_clear_marks_by_mntns(struct mnt_namespace *mntns)
+{
+ fsnotify_destroy_marks(&mntns->n_fsnotify_marks);
+}
+
/*
* update the dentry->d_flags of all of inode's children to indicate if inode cares
* about events that happen to its children.
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 4981439e6209..798340db69d7 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -107,6 +107,8 @@ static fsnotify_connp_t *fsnotify_object_connp(void *obj,
return &real_mount(obj)->mnt_fsnotify_marks;
case FSNOTIFY_OBJ_TYPE_SB:
return fsnotify_sb_marks(obj);
+ case FSNOTIFY_OBJ_TYPE_MNTNS:
+ return &((struct mnt_namespace *)obj)->n_fsnotify_marks;
default:
return NULL;
}
@@ -120,6 +122,8 @@ static __u32 *fsnotify_conn_mask_p(struct fsnotify_mark_connector *conn)
return &fsnotify_conn_mount(conn)->mnt_fsnotify_mask;
else if (conn->type == FSNOTIFY_OBJ_TYPE_SB)
return &fsnotify_conn_sb(conn)->s_fsnotify_mask;
+ else if (conn->type == FSNOTIFY_OBJ_TYPE_MNTNS)
+ return &fsnotify_conn_mntns(conn)->n_fsnotify_mask;
return NULL;
}
@@ -346,12 +350,15 @@ static void *fsnotify_detach_connector_from_object(
fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0;
} else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) {
fsnotify_conn_sb(conn)->s_fsnotify_mask = 0;
+ } else if (conn->type == FSNOTIFY_OBJ_TYPE_MNTNS) {
+ fsnotify_conn_mntns(conn)->n_fsnotify_mask = 0;
}
rcu_assign_pointer(*connp, NULL);
conn->obj = NULL;
conn->type = FSNOTIFY_OBJ_TYPE_DETACHED;
- fsnotify_update_sb_watchers(sb, conn);
+ if (sb)
+ fsnotify_update_sb_watchers(sb, conn);
return inode;
}
@@ -724,7 +731,7 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, void *obj,
* Attach the sb info before attaching a connector to any object on sb.
* The sb info will remain attached as long as sb lives.
*/
- if (!fsnotify_sb_info(sb)) {
+ if (sb && !fsnotify_sb_info(sb)) {
err = fsnotify_attach_info_to_sb(sb);
if (err)
return err;
@@ -770,7 +777,8 @@ restart:
/* mark should be the last entry. last is the current last entry */
hlist_add_behind_rcu(&mark->obj_list, &last->obj_list);
added:
- fsnotify_update_sb_watchers(sb, conn);
+ if (sb)
+ fsnotify_update_sb_watchers(sb, conn);
/*
* Since connector is attached to object using cmpxchg() we are
* guaranteed that connector initialization is fully visible by anyone
diff --git a/fs/nsfs.c b/fs/nsfs.c
index f7fddf8ecf73..59aa801347a7 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -151,19 +151,49 @@ static int copy_ns_info_to_user(const struct mnt_namespace *mnt_ns,
return 0;
}
+static bool nsfs_ioctl_valid(unsigned int cmd)
+{
+ switch (cmd) {
+ case NS_GET_USERNS:
+ case NS_GET_PARENT:
+ case NS_GET_NSTYPE:
+ case NS_GET_OWNER_UID:
+ case NS_GET_MNTNS_ID:
+ case NS_GET_PID_FROM_PIDNS:
+ case NS_GET_TGID_FROM_PIDNS:
+ case NS_GET_PID_IN_PIDNS:
+ case NS_GET_TGID_IN_PIDNS:
+ return true;
+ }
+
+ /* Extensible ioctls require some extra handling. */
+ switch (_IOC_NR(cmd)) {
+ case _IOC_NR(NS_MNT_GET_INFO):
+ case _IOC_NR(NS_MNT_GET_NEXT):
+ case _IOC_NR(NS_MNT_GET_PREV):
+ return (_IOC_TYPE(cmd) == _IOC_TYPE(NS_MNT_GET_INFO));
+ }
+
+ return false;
+}
+
static long ns_ioctl(struct file *filp, unsigned int ioctl,
unsigned long arg)
{
struct user_namespace *user_ns;
struct pid_namespace *pid_ns;
struct task_struct *tsk;
- struct ns_common *ns = get_proc_ns(file_inode(filp));
+ struct ns_common *ns;
struct mnt_namespace *mnt_ns;
bool previous = false;
uid_t __user *argp;
uid_t uid;
int ret;
+ if (!nsfs_ioctl_valid(ioctl))
+ return -ENOIOCTLCMD;
+
+ ns = get_proc_ns(file_inode(filp));
switch (ioctl) {
case NS_GET_USERNS:
return open_related_ns(ns, ns_get_owner);
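
With the validation above in place, a command not on the list fails up
front with -ENOIOCTLCMD (seen as ENOTTY in userspace) before any
per-namespace handling runs. A small sketch of the observable behaviour:

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/nsfs.h>

	/* Query the type of a namespace file, e.g. "/proc/self/ns/mnt". */
	int ns_type(const char *path)
	{
		int fd = open(path, O_RDONLY);
		int type;

		if (fd < 0)
			return -1;
		/* Unknown commands now fail with ENOTTY before dispatch. */
		type = ioctl(fd, NS_GET_NSTYPE);
		close(fd);
		return type;	/* a CLONE_NEW* constant on success */
	}
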
diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
index abf7e81584a9..652735a0b0c4 100644
--- a/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@ -201,11 +201,11 @@ static int ntfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
/*
* ntfs_mkdir- inode_operations::mkdir
*/
-static int ntfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *ntfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
- return ntfs_create_inode(idmap, dir, dentry, NULL, S_IFDIR | mode, 0,
- NULL, 0, NULL);
+ return ERR_PTR(ntfs_create_inode(idmap, dir, dentry, NULL, S_IFDIR | mode, 0,
+ NULL, 0, NULL));
}
/*
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 2a7f36643895..5130ec44e5e1 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -402,10 +402,10 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
* File creation. Allocate an inode, and we're done..
*/
/* SMP-safe */
-static int dlmfs_mkdir(struct mnt_idmap * idmap,
- struct inode * dir,
- struct dentry * dentry,
- umode_t mode)
+static struct dentry *dlmfs_mkdir(struct mnt_idmap * idmap,
+ struct inode * dir,
+ struct dentry * dentry,
+ umode_t mode)
{
int status;
struct inode *inode = NULL;
@@ -448,7 +448,7 @@ static int dlmfs_mkdir(struct mnt_idmap * idmap,
bail:
if (status < 0)
iput(inode);
- return status;
+ return ERR_PTR(status);
}
static int dlmfs_create(struct mnt_idmap *idmap,
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 0ec63a1a94b8..99278c8f0e24 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -644,10 +644,10 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
suballoc_loc, suballoc_bit);
}
-static int ocfs2_mkdir(struct mnt_idmap *idmap,
- struct inode *dir,
- struct dentry *dentry,
- umode_t mode)
+static struct dentry *ocfs2_mkdir(struct mnt_idmap *idmap,
+ struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode)
{
int ret;
@@ -657,7 +657,7 @@ static int ocfs2_mkdir(struct mnt_idmap *idmap,
if (ret)
mlog_errno(ret);
- return ret;
+ return ERR_PTR(ret);
}
static int ocfs2_create(struct mnt_idmap *idmap,
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index 6bda275826d6..2ed541fccf33 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -279,10 +279,10 @@ out_free_inode:
return err;
}
-static int omfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *omfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
- return omfs_add_node(dir, dentry, mode | S_IFDIR);
+ return ERR_PTR(omfs_add_node(dir, dentry, mode | S_IFDIR));
}
static int omfs_create(struct mnt_idmap *idmap, struct inode *dir,
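
The ntfs3, dlmfs, ocfs2 and omfs hunks above (and the orangefs and
overlayfs ones below) are all instances of one mechanical conversion:
->mkdir now returns a struct dentry * instead of an int. A sketch of
the pattern, with a hypothetical examplefs_do_mkdir() standing in for
each filesystem's int-returning helper:

	static struct dentry *examplefs_mkdir(struct mnt_idmap *idmap,
					      struct inode *dir,
					      struct dentry *dentry,
					      umode_t mode)
	{
		int err = examplefs_do_mkdir(dir, dentry, mode);

		/*
		 * ERR_PTR(0) == NULL, so wrapping the old return value is
		 * enough: NULL tells the VFS the passed-in dentry was used,
		 * an ERR_PTR() reports the failure, and a filesystem that
		 * re-looks the name up may return a different, hashed
		 * dentry instead (see the overlayfs hunks below).
		 */
		return ERR_PTR(err);
	}
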
diff --git a/fs/open.c b/fs/open.c
index 1be20de9f283..a9063cca9911 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -67,11 +67,11 @@ int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry,
return ret;
}
-long vfs_truncate(const struct path *path, loff_t length)
+int vfs_truncate(const struct path *path, loff_t length)
{
struct mnt_idmap *idmap;
struct inode *inode;
- long error;
+ int error;
inode = path->dentry->d_inode;
@@ -123,7 +123,7 @@ mnt_drop_write_and_out:
}
EXPORT_SYMBOL_GPL(vfs_truncate);
-long do_sys_truncate(const char __user *pathname, loff_t length)
+int do_sys_truncate(const char __user *pathname, loff_t length)
{
unsigned int lookup_flags = LOOKUP_FOLLOW;
struct path path;
@@ -157,7 +157,7 @@ COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length
}
#endif
-long do_ftruncate(struct file *file, loff_t length, int small)
+int do_ftruncate(struct file *file, loff_t length, int small)
{
struct inode *inode;
struct dentry *dentry;
@@ -196,7 +196,7 @@ long do_ftruncate(struct file *file, loff_t length, int small)
return error;
}
-long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
+int do_sys_ftruncate(unsigned int fd, loff_t length, int small)
{
if (length < 0)
return -EINVAL;
@@ -251,7 +251,7 @@ COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd,
int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
- long ret;
+ int ret;
loff_t sum;
if (offset < 0 || len <= 0)
@@ -460,7 +460,7 @@ static const struct cred *access_override_creds(void)
return override_creds(override_cred);
}
-static long do_faccessat(int dfd, const char __user *filename, int mode, int flags)
+static int do_faccessat(int dfd, const char __user *filename, int mode, int flags)
{
struct path path;
struct inode *inode;
@@ -1409,22 +1409,23 @@ struct file *file_open_root(const struct path *root,
}
EXPORT_SYMBOL(file_open_root);
-static long do_sys_openat2(int dfd, const char __user *filename,
- struct open_how *how)
+static int do_sys_openat2(int dfd, const char __user *filename,
+ struct open_how *how)
{
struct open_flags op;
- int fd = build_open_flags(how, &op);
struct filename *tmp;
+ int err, fd;
- if (fd)
- return fd;
+ err = build_open_flags(how, &op);
+ if (unlikely(err))
+ return err;
tmp = getname(filename);
if (IS_ERR(tmp))
return PTR_ERR(tmp);
fd = get_unused_fd_flags(how->flags);
- if (fd >= 0) {
+ if (likely(fd >= 0)) {
struct file *f = do_filp_open(dfd, tmp, &op);
if (IS_ERR(f)) {
put_unused_fd(fd);
@@ -1437,7 +1438,7 @@ static long do_sys_openat2(int dfd, const char __user *filename,
return fd;
}
-long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
+int do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
struct open_how how = build_open_how(flags, mode);
return do_sys_openat2(dfd, filename, &how);
@@ -1551,7 +1552,7 @@ int filp_close(struct file *filp, fl_owner_t id)
int retval;
retval = filp_flush(filp, id);
- fput(filp);
+ fput_close(filp);
return retval;
}
@@ -1577,13 +1578,16 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
* We're returning to user space. Don't bother
* with any delayed fput() cases.
*/
- __fput_sync(file);
+ fput_close_sync(file);
+
+ if (likely(retval == 0))
+ return 0;
/* can't restart close syscall because file table entry was cleared */
- if (unlikely(retval == -ERESTARTSYS ||
- retval == -ERESTARTNOINTR ||
- retval == -ERESTARTNOHAND ||
- retval == -ERESTART_RESTARTBLOCK))
+ if (retval == -ERESTARTSYS ||
+ retval == -ERESTARTNOINTR ||
+ retval == -ERESTARTNOHAND ||
+ retval == -ERESTART_RESTARTBLOCK)
retval = -EINTR;
return retval;
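
The reordered close() tail above also restates a long-standing uapi
rule: once filp_flush() has run, the file table entry is gone, so the
syscall must never be restarted. A userspace sketch of the consequence:

	#include <errno.h>
	#include <unistd.h>

	static void close_once(int fd)
	{
		if (close(fd) < 0 && errno == EINTR) {
			/*
			 * Do not retry: the descriptor was released even
			 * though the flush was interrupted, and fd may
			 * already refer to an unrelated file.
			 */
		}
	}
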
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index d68372241b30..90c49c0de243 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -57,8 +57,8 @@ ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,
int buffer_index;
ssize_t ret;
size_t copy_amount;
- int open_for_read;
- int open_for_write;
+ bool open_for_read;
+ bool open_for_write;
new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO);
if (!new_op)
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index aae6d2b8767d..5ac743c6bc2e 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -16,22 +16,22 @@
#include "orangefs-kernel.h"
#include "orangefs-bufmap.h"
-static int orangefs_writepage_locked(struct page *page,
- struct writeback_control *wbc)
+static int orangefs_writepage_locked(struct folio *folio,
+ struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct orangefs_write_range *wr = NULL;
struct iov_iter iter;
struct bio_vec bv;
- size_t len, wlen;
+ size_t wlen;
ssize_t ret;
- loff_t off;
+ loff_t len, off;
- set_page_writeback(page);
+ folio_start_writeback(folio);
len = i_size_read(inode);
- if (PagePrivate(page)) {
- wr = (struct orangefs_write_range *)page_private(page);
+ if (folio->private) {
+ wr = folio->private;
WARN_ON(wr->pos >= len);
off = wr->pos;
if (off + wr->len > len)
@@ -40,36 +40,27 @@ static int orangefs_writepage_locked(struct page *page,
wlen = wr->len;
} else {
WARN_ON(1);
- off = page_offset(page);
- if (off + PAGE_SIZE > len)
+ off = folio_pos(folio);
+ wlen = folio_size(folio);
+
+ if (wlen > len - off)
wlen = len - off;
- else
- wlen = PAGE_SIZE;
}
/* Should've been handled in orangefs_invalidate_folio. */
WARN_ON(off == len || off + wlen > len);
WARN_ON(wlen == 0);
- bvec_set_page(&bv, page, wlen, off % PAGE_SIZE);
+ bvec_set_folio(&bv, folio, wlen, offset_in_folio(folio, off));
iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, wlen);
ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen,
len, wr, NULL, NULL);
if (ret < 0) {
- mapping_set_error(page->mapping, ret);
+ mapping_set_error(folio->mapping, ret);
} else {
ret = 0;
}
- kfree(detach_page_private(page));
- return ret;
-}
-
-static int orangefs_writepage(struct page *page, struct writeback_control *wbc)
-{
- int ret;
- ret = orangefs_writepage_locked(page, wbc);
- unlock_page(page);
- end_page_writeback(page);
+ kfree(folio_detach_private(folio));
return ret;
}
@@ -79,33 +70,33 @@ struct orangefs_writepages {
kuid_t uid;
kgid_t gid;
int maxpages;
- int npages;
- struct page **pages;
+ int nfolios;
+ struct address_space *mapping;
+ struct folio **folios;
struct bio_vec *bv;
};
static int orangefs_writepages_work(struct orangefs_writepages *ow,
- struct writeback_control *wbc)
+ struct writeback_control *wbc)
{
- struct inode *inode = ow->pages[0]->mapping->host;
+ struct inode *inode = ow->mapping->host;
struct orangefs_write_range *wrp, wr;
struct iov_iter iter;
ssize_t ret;
- size_t len;
- loff_t off;
+ size_t start;
+ loff_t len, off;
int i;
len = i_size_read(inode);
- for (i = 0; i < ow->npages; i++) {
- set_page_writeback(ow->pages[i]);
- bvec_set_page(&ow->bv[i], ow->pages[i],
- min(page_offset(ow->pages[i]) + PAGE_SIZE,
- ow->off + ow->len) -
- max(ow->off, page_offset(ow->pages[i])),
- i == 0 ? ow->off - page_offset(ow->pages[i]) : 0);
+ start = offset_in_folio(ow->folios[0], ow->off);
+ for (i = 0; i < ow->nfolios; i++) {
+ folio_start_writeback(ow->folios[i]);
+ bvec_set_folio(&ow->bv[i], ow->folios[i],
+ folio_size(ow->folios[i]) - start, start);
+ start = 0;
}
- iov_iter_bvec(&iter, ITER_SOURCE, ow->bv, ow->npages, ow->len);
+ iov_iter_bvec(&iter, ITER_SOURCE, ow->bv, ow->nfolios, ow->len);
WARN_ON(ow->off >= len);
if (ow->off + ow->len > len)
@@ -116,40 +107,24 @@ static int orangefs_writepages_work(struct orangefs_writepages *ow,
wr.gid = ow->gid;
ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, ow->len,
0, &wr, NULL, NULL);
- if (ret < 0) {
- for (i = 0; i < ow->npages; i++) {
- mapping_set_error(ow->pages[i]->mapping, ret);
- if (PagePrivate(ow->pages[i])) {
- wrp = (struct orangefs_write_range *)
- page_private(ow->pages[i]);
- ClearPagePrivate(ow->pages[i]);
- put_page(ow->pages[i]);
- kfree(wrp);
- }
- end_page_writeback(ow->pages[i]);
- unlock_page(ow->pages[i]);
- }
- } else {
+ if (ret < 0)
+ mapping_set_error(ow->mapping, ret);
+ else
ret = 0;
- for (i = 0; i < ow->npages; i++) {
- if (PagePrivate(ow->pages[i])) {
- wrp = (struct orangefs_write_range *)
- page_private(ow->pages[i]);
- ClearPagePrivate(ow->pages[i]);
- put_page(ow->pages[i]);
- kfree(wrp);
- }
- end_page_writeback(ow->pages[i]);
- unlock_page(ow->pages[i]);
- }
+
+ for (i = 0; i < ow->nfolios; i++) {
+ wrp = folio_detach_private(ow->folios[i]);
+ kfree(wrp);
+ folio_end_writeback(ow->folios[i]);
+ folio_unlock(ow->folios[i]);
}
+
return ret;
}
static int orangefs_writepages_callback(struct folio *folio,
- struct writeback_control *wbc, void *data)
+ struct writeback_control *wbc, struct orangefs_writepages *ow)
{
- struct orangefs_writepages *ow = data;
struct orangefs_write_range *wr = folio->private;
int ret;
@@ -162,41 +137,41 @@ static int orangefs_writepages_callback(struct folio *folio,
}
ret = -1;
- if (ow->npages == 0) {
+ if (ow->nfolios == 0) {
ow->off = wr->pos;
ow->len = wr->len;
ow->uid = wr->uid;
ow->gid = wr->gid;
- ow->pages[ow->npages++] = &folio->page;
+ ow->folios[ow->nfolios++] = folio;
ret = 0;
goto done;
}
if (!uid_eq(ow->uid, wr->uid) || !gid_eq(ow->gid, wr->gid)) {
orangefs_writepages_work(ow, wbc);
- ow->npages = 0;
+ ow->nfolios = 0;
ret = -1;
goto done;
}
if (ow->off + ow->len == wr->pos) {
ow->len += wr->len;
- ow->pages[ow->npages++] = &folio->page;
+ ow->folios[ow->nfolios++] = folio;
ret = 0;
goto done;
}
done:
if (ret == -1) {
- if (ow->npages) {
+ if (ow->nfolios) {
orangefs_writepages_work(ow, wbc);
- ow->npages = 0;
+ ow->nfolios = 0;
}
- ret = orangefs_writepage_locked(&folio->page, wbc);
+ ret = orangefs_writepage_locked(folio, wbc);
mapping_set_error(folio->mapping, ret);
folio_unlock(folio);
folio_end_writeback(folio);
} else {
- if (ow->npages == ow->maxpages) {
+ if (ow->nfolios == ow->maxpages) {
orangefs_writepages_work(ow, wbc);
- ow->npages = 0;
+ ow->nfolios = 0;
}
}
return ret;
@@ -207,31 +182,35 @@ static int orangefs_writepages(struct address_space *mapping,
{
struct orangefs_writepages *ow;
struct blk_plug plug;
- int ret;
+ int error;
+ struct folio *folio = NULL;
+
ow = kzalloc(sizeof(struct orangefs_writepages), GFP_KERNEL);
if (!ow)
return -ENOMEM;
ow->maxpages = orangefs_bufmap_size_query()/PAGE_SIZE;
- ow->pages = kcalloc(ow->maxpages, sizeof(struct page *), GFP_KERNEL);
- if (!ow->pages) {
+ ow->folios = kcalloc(ow->maxpages, sizeof(struct folio *), GFP_KERNEL);
+ if (!ow->folios) {
kfree(ow);
return -ENOMEM;
}
ow->bv = kcalloc(ow->maxpages, sizeof(struct bio_vec), GFP_KERNEL);
if (!ow->bv) {
- kfree(ow->pages);
+ kfree(ow->folios);
kfree(ow);
return -ENOMEM;
}
+ ow->mapping = mapping;
blk_start_plug(&plug);
- ret = write_cache_pages(mapping, wbc, orangefs_writepages_callback, ow);
- if (ow->npages)
- ret = orangefs_writepages_work(ow, wbc);
+ while ((folio = writeback_iter(mapping, wbc, folio, &error)))
+ error = orangefs_writepages_callback(folio, wbc, ow);
+ if (ow->nfolios)
+ error = orangefs_writepages_work(ow, wbc);
blk_finish_plug(&plug);
- kfree(ow->pages);
+ kfree(ow->folios);
kfree(ow->bv);
kfree(ow);
- return ret;
+ return error;
}
static int orangefs_launder_folio(struct folio *);
@@ -484,7 +463,7 @@ static int orangefs_launder_folio(struct folio *folio)
};
folio_wait_writeback(folio);
if (folio_clear_dirty_for_io(folio)) {
- r = orangefs_writepage_locked(&folio->page, &wbc);
+ r = orangefs_writepage_locked(folio, &wbc);
folio_end_writeback(folio);
}
return r;
@@ -606,7 +585,6 @@ out:
/** ORANGEFS2 implementation of address space operations */
static const struct address_space_operations orangefs_address_operations = {
- .writepage = orangefs_writepage,
.readahead = orangefs_readahead,
.read_folio = orangefs_read_folio,
.writepages = orangefs_writepages,
@@ -616,6 +594,7 @@ static const struct address_space_operations orangefs_address_operations = {
.invalidate_folio = orangefs_invalidate_folio,
.release_folio = orangefs_release_folio,
.free_folio = orangefs_free_folio,
+ .migrate_folio = filemap_migrate_folio,
.launder_folio = orangefs_launder_folio,
.direct_IO = orangefs_direct_IO,
};
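
The orangefs conversion above replaces the write_cache_pages() callback
with a writeback_iter() loop. A minimal sketch of that iteration
pattern, with examplefs_write_folio() as a hypothetical per-folio
writer:

	static int examplefs_writepages(struct address_space *mapping,
					struct writeback_control *wbc)
	{
		struct folio *folio = NULL;
		int error = 0;

		/*
		 * writeback_iter() hands back locked dirty folios one at a
		 * time; the caller writes each folio, unlocks it, and feeds
		 * the error back in through the last argument.
		 */
		while ((folio = writeback_iter(mapping, wbc, folio, &error))) {
			error = examplefs_write_folio(folio, wbc);
			folio_unlock(folio);
		}
		return error;
	}
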
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 200558ec72f0..82395fe2b956 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -300,8 +300,8 @@ out:
return ret;
}
-static int orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct orangefs_inode_s *parent = ORANGEFS_I(dir);
struct orangefs_kernel_op_s *new_op;
@@ -312,7 +312,7 @@ static int orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
new_op = op_alloc(ORANGEFS_VFS_OP_MKDIR);
if (!new_op)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
new_op->upcall.req.mkdir.parent_refn = parent->refn;
@@ -366,7 +366,7 @@ static int orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
__orangefs_setattr(dir, &iattr);
out:
op_release(new_op);
- return ret;
+ return ERR_PTR(ret);
}
static int orangefs_rename(struct mnt_idmap *idmap,
diff --git a/fs/orangefs/orangefs-debug.h b/fs/orangefs/orangefs-debug.h
index 6e079d4230d0..d4463534cec6 100644
--- a/fs/orangefs/orangefs-debug.h
+++ b/fs/orangefs/orangefs-debug.h
@@ -43,47 +43,4 @@
#define GOSSIP_MAX_NR 16
#define GOSSIP_MAX_DEBUG (((__u64)1 << GOSSIP_MAX_NR) - 1)
-/* a private internal type */
-struct __keyword_mask_s {
- const char *keyword;
- __u64 mask_val;
-};
-
-/*
- * Map all kmod keywords to kmod debug masks here. Keep this
- * structure "packed":
- *
- * "all" is always last...
- *
- * keyword mask_val index
- * foo 1 0
- * bar 2 1
- * baz 4 2
- * qux 8 3
- * . . .
- */
-static struct __keyword_mask_s s_kmod_keyword_mask_map[] = {
- {"super", GOSSIP_SUPER_DEBUG},
- {"inode", GOSSIP_INODE_DEBUG},
- {"file", GOSSIP_FILE_DEBUG},
- {"dir", GOSSIP_DIR_DEBUG},
- {"utils", GOSSIP_UTILS_DEBUG},
- {"wait", GOSSIP_WAIT_DEBUG},
- {"acl", GOSSIP_ACL_DEBUG},
- {"dcache", GOSSIP_DCACHE_DEBUG},
- {"dev", GOSSIP_DEV_DEBUG},
- {"name", GOSSIP_NAME_DEBUG},
- {"bufmap", GOSSIP_BUFMAP_DEBUG},
- {"cache", GOSSIP_CACHE_DEBUG},
- {"debugfs", GOSSIP_DEBUGFS_DEBUG},
- {"xattr", GOSSIP_XATTR_DEBUG},
- {"init", GOSSIP_INIT_DEBUG},
- {"sysfs", GOSSIP_SYSFS_DEBUG},
- {"none", GOSSIP_NO_DEBUG},
- {"all", GOSSIP_MAX_DEBUG}
-};
-
-static const int num_kmod_keyword_mask_map = (int)
- (ARRAY_SIZE(s_kmod_keyword_mask_map));
-
#endif /* __ORANGEFS_DEBUG_H */
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index f52073022fae..f7095c91660c 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -44,6 +44,49 @@
#include "protocol.h"
#include "orangefs-kernel.h"
+/* a private internal type */
+struct __keyword_mask_s {
+ const char *keyword;
+ __u64 mask_val;
+};
+
+/*
+ * Map all kmod keywords to kmod debug masks here. Keep this
+ * structure "packed":
+ *
+ * "all" is always last...
+ *
+ * keyword mask_val index
+ * foo 1 0
+ * bar 2 1
+ * baz 4 2
+ * qux 8 3
+ * . . .
+ */
+static struct __keyword_mask_s s_kmod_keyword_mask_map[] = {
+ {"super", GOSSIP_SUPER_DEBUG},
+ {"inode", GOSSIP_INODE_DEBUG},
+ {"file", GOSSIP_FILE_DEBUG},
+ {"dir", GOSSIP_DIR_DEBUG},
+ {"utils", GOSSIP_UTILS_DEBUG},
+ {"wait", GOSSIP_WAIT_DEBUG},
+ {"acl", GOSSIP_ACL_DEBUG},
+ {"dcache", GOSSIP_DCACHE_DEBUG},
+ {"dev", GOSSIP_DEV_DEBUG},
+ {"name", GOSSIP_NAME_DEBUG},
+ {"bufmap", GOSSIP_BUFMAP_DEBUG},
+ {"cache", GOSSIP_CACHE_DEBUG},
+ {"debugfs", GOSSIP_DEBUGFS_DEBUG},
+ {"xattr", GOSSIP_XATTR_DEBUG},
+ {"init", GOSSIP_INIT_DEBUG},
+ {"sysfs", GOSSIP_SYSFS_DEBUG},
+ {"none", GOSSIP_NO_DEBUG},
+ {"all", GOSSIP_MAX_DEBUG}
+};
+
+static const int num_kmod_keyword_mask_map = (int)
+ (ARRAY_SIZE(s_kmod_keyword_mask_map));
+
#define DEBUG_HELP_STRING_SIZE 4096
#define HELP_STRING_UNINITIALIZED \
"Client Debug Keywords are unknown until the first time\n" \
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index c9993ff66fc2..fe493f3ed6b6 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -138,37 +138,6 @@ kill_whiteout:
goto out;
}
-int ovl_mkdir_real(struct ovl_fs *ofs, struct inode *dir,
- struct dentry **newdentry, umode_t mode)
-{
- int err;
- struct dentry *d, *dentry = *newdentry;
-
- err = ovl_do_mkdir(ofs, dir, dentry, mode);
- if (err)
- return err;
-
- if (likely(!d_unhashed(dentry)))
- return 0;
-
- /*
- * vfs_mkdir() may succeed and leave the dentry passed
- * to it unhashed and negative. If that happens, try to
- * lookup a new hashed and positive dentry.
- */
- d = ovl_lookup_upper(ofs, dentry->d_name.name, dentry->d_parent,
- dentry->d_name.len);
- if (IS_ERR(d)) {
- pr_warn("failed lookup after mkdir (%pd2, err=%i).\n",
- dentry, err);
- return PTR_ERR(d);
- }
- dput(dentry);
- *newdentry = d;
-
- return 0;
-}
-
struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir,
struct dentry *newdentry, struct ovl_cattr *attr)
{
@@ -191,7 +160,8 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir,
case S_IFDIR:
/* mkdir is special... */
- err = ovl_mkdir_real(ofs, dir, &newdentry, attr->mode);
+ newdentry = ovl_do_mkdir(ofs, dir, newdentry, attr->mode);
+ err = PTR_ERR_OR_ZERO(newdentry);
break;
case S_IFCHR:
@@ -219,7 +189,8 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir,
}
out:
if (err) {
- dput(newdentry);
+ if (!IS_ERR(newdentry))
+ dput(newdentry);
return ERR_PTR(err);
}
return newdentry;
@@ -282,7 +253,8 @@ static int ovl_instantiate(struct dentry *dentry, struct inode *inode,
* XXX: if we ever use ovl_obtain_alias() to decode directory
* file handles, need to use ovl_get_inode_locked() and
* d_instantiate_new() here to prevent from creating two
- * hashed directory inode aliases.
+ * hashed directory inode aliases. We then need to return
+ * the obtained alias to ovl_mkdir().
*/
inode = ovl_get_inode(dentry->d_sb, &oip);
if (IS_ERR(inode))
@@ -687,10 +659,10 @@ static int ovl_create(struct mnt_idmap *idmap, struct inode *dir,
return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
}
-static int ovl_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *ovl_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
- return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
+ return ERR_PTR(ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL));
}
static int ovl_mknod(struct mnt_idmap *idmap, struct inode *dir,
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 0021e2025020..6f2f8f4cfbbc 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -241,13 +241,14 @@ static inline int ovl_do_create(struct ovl_fs *ofs,
return err;
}
-static inline int ovl_do_mkdir(struct ovl_fs *ofs,
- struct inode *dir, struct dentry *dentry,
- umode_t mode)
+static inline struct dentry *ovl_do_mkdir(struct ovl_fs *ofs,
+ struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode)
{
- int err = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode);
- pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
- return err;
+ dentry = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode);
+ pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, PTR_ERR_OR_ZERO(dentry));
+ return dentry;
}
static inline int ovl_do_mknod(struct ovl_fs *ofs,
@@ -838,8 +839,6 @@ struct ovl_cattr {
#define OVL_CATTR(m) (&(struct ovl_cattr) { .mode = (m) })
-int ovl_mkdir_real(struct ovl_fs *ofs, struct inode *dir,
- struct dentry **newdentry, umode_t mode);
struct dentry *ovl_create_real(struct ovl_fs *ofs,
struct inode *dir, struct dentry *newdentry,
struct ovl_cattr *attr);
diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
index 1115c22deca0..6759f7d040c8 100644
--- a/fs/overlayfs/params.c
+++ b/fs/overlayfs/params.c
@@ -59,6 +59,7 @@ enum ovl_opt {
Opt_metacopy,
Opt_verity,
Opt_volatile,
+ Opt_override_creds,
};
static const struct constant_table ovl_parameter_bool[] = {
@@ -155,6 +156,7 @@ const struct fs_parameter_spec ovl_parameter_spec[] = {
fsparam_enum("metacopy", Opt_metacopy, ovl_parameter_bool),
fsparam_enum("verity", Opt_verity, ovl_parameter_verity),
fsparam_flag("volatile", Opt_volatile),
+ fsparam_flag_no("override_creds", Opt_override_creds),
{}
};
@@ -662,6 +664,29 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_userxattr:
config->userxattr = true;
break;
+ case Opt_override_creds: {
+ const struct cred *cred = NULL;
+
+ if (result.negated) {
+ swap(cred, ofs->creator_cred);
+ put_cred(cred);
+ break;
+ }
+
+ if (!current_in_userns(fc->user_ns)) {
+ err = -EINVAL;
+ break;
+ }
+
+ cred = prepare_creds();
+ if (cred)
+ swap(cred, ofs->creator_cred);
+ else
+ err = -ENOMEM;
+
+ put_cred(cred);
+ break;
+ }
default:
pr_err("unrecognized mount option \"%s\" or missing value\n",
param->key);
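
The new option is a flag pair on the new mount api. A hedged userspace
sketch (fsopen()/fsconfig() need a recent libc or raw syscall wrappers):

	#include <sys/mount.h>

	int ovl_setup(void)
	{
		int fsfd = fsopen("overlay", FSOPEN_CLOEXEC);

		if (fsfd < 0)
			return -1;
		fsconfig(fsfd, FSCONFIG_SET_STRING, "lowerdir", "/lower", 0);
		/*
		 * Record the calling task's credentials at fsconfig() time;
		 * "nooverride_creds" would drop a previously recorded set.
		 */
		fsconfig(fsfd, FSCONFIG_SET_FLAG, "override_creds", NULL, 0);
		fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
		return fsfd;	/* fsmount() turns this into a mount fd */
	}
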
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 86ae6f6da36b..b63474d1b064 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -327,9 +327,10 @@ retry:
goto retry;
}
- err = ovl_mkdir_real(ofs, dir, &work, attr.ia_mode);
- if (err)
- goto out_dput;
+ work = ovl_do_mkdir(ofs, dir, work, attr.ia_mode);
+ err = PTR_ERR(work);
+ if (IS_ERR(work))
+ goto out_err;
/* Weird filesystem returning with hashed negative (kernfs)? */
err = -EINVAL;
@@ -1305,6 +1306,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct ovl_fs *ofs = sb->s_fs_info;
struct ovl_fs_context *ctx = fc->fs_private;
+ const struct cred *old_cred = NULL;
struct dentry *root_dentry;
struct ovl_entry *oe;
struct ovl_layer *layers;
@@ -1318,10 +1320,15 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_d_op = &ovl_dentry_operations;
err = -ENOMEM;
- ofs->creator_cred = cred = prepare_creds();
+ if (!ofs->creator_cred)
+ ofs->creator_cred = cred = prepare_creds();
+ else
+ cred = (struct cred *)ofs->creator_cred;
if (!cred)
goto out_err;
+ old_cred = ovl_override_creds(sb);
+
err = ovl_fs_params_verify(ctx, &ofs->config);
if (err)
goto out_err;
@@ -1481,11 +1488,19 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_root = root_dentry;
+ ovl_revert_creds(old_cred);
return 0;
out_free_oe:
ovl_free_entry(oe);
out_err:
+ /*
+ * Revert creds before calling ovl_free_fs(), which calls
+ * put_cred(); put_cred() requires that the creds being put are
+ * not the caller's current creds, i.e., current->cred.
+ */
+ if (old_cred)
+ ovl_revert_creds(old_cred);
ovl_free_fs(ofs);
sb->s_fs_info = NULL;
return err;
diff --git a/fs/pidfs.c b/fs/pidfs.c
index c0478b3c55d9..d64a4cbeb0da 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -24,6 +24,28 @@
#include "internal.h"
#include "mount.h"
+static struct kmem_cache *pidfs_cachep __ro_after_init;
+
+/*
+ * Stashes information that userspace needs to access even after the
+ * process has been reaped.
+ */
+struct pidfs_exit_info {
+ __u64 cgroupid;
+ __s32 exit_code;
+};
+
+struct pidfs_inode {
+ struct pidfs_exit_info __pei;
+ struct pidfs_exit_info *exit_info;
+ struct inode vfs_inode;
+};
+
+static inline struct pidfs_inode *pidfs_i(struct inode *inode)
+{
+ return container_of(inode, struct pidfs_inode, vfs_inode);
+}
+
static struct rb_root pidfs_ino_tree = RB_ROOT;
#if BITS_PER_LONG == 32
@@ -188,36 +210,48 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
{
struct pid *pid = pidfd_pid(file);
- bool thread = file->f_flags & PIDFD_THREAD;
struct task_struct *task;
__poll_t poll_flags = 0;
poll_wait(file, &pid->wait_pidfd, pts);
/*
- * Depending on PIDFD_THREAD, inform pollers when the thread
- * or the whole thread-group exits.
+ * Don't wake waiters if the thread-group leader exited
+ * prematurely. They either get notified when the last subthread
+ * exits or not at all if one of the remaining subthreads execs
+ * and assumes the struct pid of the old thread-group leader.
*/
guard(rcu)();
task = pid_task(pid, PIDTYPE_PID);
if (!task)
poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP;
- else if (task->exit_state && (thread || thread_group_empty(task)))
+ else if (task->exit_state && !delay_group_leader(task))
poll_flags = EPOLLIN | EPOLLRDNORM;
return poll_flags;
}
-static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg)
+static inline bool pid_in_current_pidns(const struct pid *pid)
+{
+ const struct pid_namespace *ns = task_active_pid_ns(current);
+
+ if (ns->level <= pid->level)
+ return pid->numbers[ns->level].ns == ns;
+
+ return false;
+}
+
+static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
{
struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
+ struct inode *inode = file_inode(file);
+ struct pid *pid = pidfd_pid(file);
size_t usize = _IOC_SIZE(cmd);
struct pidfd_info kinfo = {};
+ struct pidfs_exit_info *exit_info;
struct user_namespace *user_ns;
+ struct task_struct *task;
const struct cred *c;
__u64 mask;
-#ifdef CONFIG_CGROUPS
- struct cgroup *cgrp;
-#endif
if (!uinfo)
return -EINVAL;
@@ -227,6 +261,37 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
if (copy_from_user(&mask, &uinfo->mask, sizeof(mask)))
return -EFAULT;
+ /*
+ * Restrict information retrieval to tasks within the caller's pid
+ * namespace hierarchy.
+ */
+ if (!pid_in_current_pidns(pid))
+ return -ESRCH;
+
+ if (mask & PIDFD_INFO_EXIT) {
+ exit_info = READ_ONCE(pidfs_i(inode)->exit_info);
+ if (exit_info) {
+ kinfo.mask |= PIDFD_INFO_EXIT;
+#ifdef CONFIG_CGROUPS
+ kinfo.cgroupid = exit_info->cgroupid;
+ kinfo.mask |= PIDFD_INFO_CGROUPID;
+#endif
+ kinfo.exit_code = exit_info->exit_code;
+ }
+ }
+
+ task = get_pid_task(pid, PIDTYPE_PID);
+ if (!task) {
+ /*
+ * If the task has already been reaped, only exit
+ * information is available.
+ */
+ if (!(mask & PIDFD_INFO_EXIT))
+ return -ESRCH;
+
+ goto copy_out;
+ }
+
c = get_task_cred(task);
if (!c)
return -ESRCH;
@@ -246,11 +311,15 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
put_cred(c);
#ifdef CONFIG_CGROUPS
- rcu_read_lock();
- cgrp = task_dfl_cgroup(task);
- kinfo.cgroupid = cgroup_id(cgrp);
- kinfo.mask |= PIDFD_INFO_CGROUPID;
- rcu_read_unlock();
+ if (!kinfo.cgroupid) {
+ struct cgroup *cgrp;
+
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(task);
+ kinfo.cgroupid = cgroup_id(cgrp);
+ kinfo.mask |= PIDFD_INFO_CGROUPID;
+ rcu_read_unlock();
+ }
#endif
/*
@@ -270,16 +339,14 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1))
return -ESRCH;
+copy_out:
/*
* If userspace and the kernel have the same struct size it can just
* be copied. If userspace provides an older struct, only the bits that
* userspace knows about will be copied. If userspace provides a new
* struct, only the bits that the kernel knows about will be copied.
*/
- if (copy_to_user(uinfo, &kinfo, min(usize, sizeof(kinfo))))
- return -EFAULT;
-
- return 0;
+ return copy_struct_to_user(uinfo, usize, &kinfo, sizeof(kinfo), NULL);
}
static bool pidfs_ioctl_valid(unsigned int cmd)
@@ -317,7 +384,6 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct task_struct *task __free(put_task) = NULL;
struct nsproxy *nsp __free(put_nsproxy) = NULL;
- struct pid *pid = pidfd_pid(file);
struct ns_common *ns_common = NULL;
struct pid_namespace *pid_ns;
@@ -332,13 +398,13 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return put_user(file_inode(file)->i_generation, argp);
}
- task = get_pid_task(pid, PIDTYPE_PID);
- if (!task)
- return -ESRCH;
-
/* Extensible IOCTL that does not open namespace FDs, take a shortcut */
if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO))
- return pidfd_info(task, cmd, arg);
+ return pidfd_info(file, cmd, arg);
+
+ task = get_pid_task(pidfd_pid(file), PIDTYPE_PID);
+ if (!task)
+ return -ESRCH;
if (arg)
return -EINVAL;
@@ -450,6 +516,49 @@ struct pid *pidfd_pid(const struct file *file)
return file_inode(file)->i_private;
}
+/*
+ * We're called from release_task(). We know there's at least one
+ * reference to struct pid being held that won't be released until the
+ * task has been reaped which cannot happen until we're out of
+ * release_task().
+ *
+ * If this struct pid is referred to by a pidfd then
+ * stashed_dentry_get() will return the dentry and inode for that struct
+ * pid. Since we've taken a reference on it there's now an additional
+ * reference from the exit path on it. Which is fine. We're going to put
+ * it again in a second and we know that the pid is kept alive anyway.
+ *
+ * Worst case is that we've filled in the info and immediately free the
+ * dentry and inode afterwards since the pidfd has been closed. Since
+ * pidfs_exit() currently is placed after exit_task_work() we know that
+ * it cannot be us aka the exiting task holding a pidfd to ourselves.
+ */
+void pidfs_exit(struct task_struct *tsk)
+{
+ struct dentry *dentry;
+
+ might_sleep();
+
+ dentry = stashed_dentry_get(&task_pid(tsk)->stashed);
+ if (dentry) {
+ struct inode *inode = d_inode(dentry);
+ struct pidfs_exit_info *exit_info = &pidfs_i(inode)->__pei;
+#ifdef CONFIG_CGROUPS
+ struct cgroup *cgrp;
+
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(tsk);
+ exit_info->cgroupid = cgroup_id(cgrp);
+ rcu_read_unlock();
+#endif
+ exit_info->exit_code = tsk->exit_code;
+
+ /* Ensure that PIDFD_GET_INFO sees either all or nothing. */
+ smp_store_release(&pidfs_i(inode)->exit_info, &pidfs_i(inode)->__pei);
+ dput(dentry);
+ }
+}
+
static struct vfsmount *pidfs_mnt __ro_after_init;
/*
@@ -505,9 +614,30 @@ static void pidfs_evict_inode(struct inode *inode)
put_pid(pid);
}
+static struct inode *pidfs_alloc_inode(struct super_block *sb)
+{
+ struct pidfs_inode *pi;
+
+ pi = alloc_inode_sb(sb, pidfs_cachep, GFP_KERNEL);
+ if (!pi)
+ return NULL;
+
+ memset(&pi->__pei, 0, sizeof(pi->__pei));
+ pi->exit_info = NULL;
+
+ return &pi->vfs_inode;
+}
+
+static void pidfs_free_inode(struct inode *inode)
+{
+ kmem_cache_free(pidfs_cachep, pidfs_i(inode));
+}
+
static const struct super_operations pidfs_sops = {
+ .alloc_inode = pidfs_alloc_inode,
.drop_inode = generic_delete_inode,
.evict_inode = pidfs_evict_inode,
+ .free_inode = pidfs_free_inode,
.statfs = simple_statfs,
};
@@ -633,8 +763,49 @@ static int pidfs_export_permission(struct handle_to_path_ctx *ctx,
return 0;
}
+static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path,
+ unsigned int flags)
+{
+ enum pid_type type;
+
+ if (flags & PIDFD_CLONE)
+ return true;
+
+ /*
+ * Make sure that if a pidfd is created, PIDFD_INFO_EXIT
+ * information will be available. So after an inode for the
+ * pidfd has been allocated, perform another check that the pid
+ * is still alive. If it is, exit information is available even
+ * if the task gets reaped before the pidfd is returned to
+ * userspace. The only exception is PIDFD_CLONE where no task
+ * linkage has been established for @pid yet and the kernel is
+ * in the middle of process creation so there's nothing for
+ * pidfs to miss.
+ */
+ if (flags & PIDFD_THREAD)
+ type = PIDTYPE_PID;
+ else
+ type = PIDTYPE_TGID;
+
+ /*
+ * Since pidfs_exit() is called before struct pid's task linkage
+ * is removed, the case where the task got reaped but a dentry
+ * was already attached to struct pid and exit information was
+ * recorded and published can be handled correctly.
+ */
+ if (unlikely(!pid_has_task(pid, type))) {
+ struct inode *inode = d_inode(path->dentry);
+ return !!READ_ONCE(pidfs_i(inode)->exit_info);
+ }
+
+ return true;
+}
+
static struct file *pidfs_export_open(struct path *path, unsigned int oflags)
{
+ if (!pidfs_pid_valid(d_inode(path->dentry)->i_private, path, oflags))
+ return ERR_PTR(-ESRCH);
+
/*
* Clear O_LARGEFILE as open_by_handle_at() forces it and raise
* O_RDWR as pidfds always are.
@@ -698,22 +869,46 @@ static struct file_system_type pidfs_type = {
struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
{
-
struct file *pidfd_file;
- struct path path;
+ struct path path __free(path_put) = {};
int ret;
+ /*
+ * Ensure that PIDFD_CLONE can be passed as a flag without
+ * overloading other uapi pidfd flags.
+ */
+ BUILD_BUG_ON(PIDFD_CLONE == PIDFD_THREAD);
+ BUILD_BUG_ON(PIDFD_CLONE == PIDFD_NONBLOCK);
+
ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
if (ret < 0)
return ERR_PTR(ret);
+ if (!pidfs_pid_valid(pid, &path, flags))
+ return ERR_PTR(-ESRCH);
+
+ flags &= ~PIDFD_CLONE;
pidfd_file = dentry_open(&path, flags, current_cred());
- path_put(&path);
+ /* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. */
+ if (!IS_ERR(pidfd_file))
+ pidfd_file->f_flags |= (flags & PIDFD_THREAD);
+
return pidfd_file;
}
+static void pidfs_inode_init_once(void *data)
+{
+ struct pidfs_inode *pi = data;
+
+ inode_init_once(&pi->vfs_inode);
+}
+
void __init pidfs_init(void)
{
+ pidfs_cachep = kmem_cache_create("pidfs_cache", sizeof(struct pidfs_inode), 0,
+ (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
+ SLAB_ACCOUNT | SLAB_PANIC),
+ pidfs_inode_init_once);
pidfs_mnt = kern_mount(&pidfs_type);
if (IS_ERR(pidfs_mnt))
panic("Failed to mount pidfs pseudo filesystem");
diff --git a/fs/pipe.c b/fs/pipe.c
index 4d0799e4e719..da45edd68c41 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -112,20 +112,40 @@ void pipe_double_lock(struct pipe_inode_info *pipe1,
pipe_lock(pipe2);
}
+static struct page *anon_pipe_get_page(struct pipe_inode_info *pipe)
+{
+ for (int i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) {
+ if (pipe->tmp_page[i]) {
+ struct page *page = pipe->tmp_page[i];
+ pipe->tmp_page[i] = NULL;
+ return page;
+ }
+ }
+
+ return alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
+}
+
+static void anon_pipe_put_page(struct pipe_inode_info *pipe,
+ struct page *page)
+{
+ if (page_count(page) == 1) {
+ for (int i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) {
+ if (!pipe->tmp_page[i]) {
+ pipe->tmp_page[i] = page;
+ return;
+ }
+ }
+ }
+
+ put_page(page);
+}
+
static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
struct page *page = buf->page;
- /*
- * If nobody else uses this page, and we don't already have a
- * temporary page, let's keep track of it as a one-deep
- * allocation cache. (Otherwise just release our reference to it)
- */
- if (page_count(page) == 1 && !pipe->tmp_page)
- pipe->tmp_page = page;
- else
- put_page(page);
+ anon_pipe_put_page(pipe, page);
}
static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe,
@@ -247,7 +267,7 @@ static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe,
}
static ssize_t
-pipe_read(struct kiocb *iocb, struct iov_iter *to)
+anon_pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
size_t total_len = iov_iter_count(to);
struct file *filp = iocb->ki_filp;
@@ -274,7 +294,6 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
/* Read ->head with a barrier vs post_one_notification() */
unsigned int head = smp_load_acquire(&pipe->head);
unsigned int tail = pipe->tail;
- unsigned int mask = pipe->ring_size - 1;
#ifdef CONFIG_WATCH_QUEUE
if (pipe->note_loss) {
@@ -301,7 +320,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
#endif
if (!pipe_empty(head, tail)) {
- struct pipe_buffer *buf = &pipe->bufs[tail & mask];
+ struct pipe_buffer *buf = pipe_buf(pipe, tail);
size_t chars = buf->len;
size_t written;
int error;
@@ -359,29 +378,9 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
break;
}
mutex_unlock(&pipe->mutex);
-
/*
* We only get here if we didn't actually read anything.
*
- * However, we could have seen (and removed) a zero-sized
- * pipe buffer, and might have made space in the buffers
- * that way.
- *
- * You can't make zero-sized pipe buffers by doing an empty
- * write (not even in packet mode), but they can happen if
- * the writer gets an EFAULT when trying to fill a buffer
- * that already got allocated and inserted in the buffer
- * array.
- *
- * So we still need to wake up any pending writers in the
- * _very_ unlikely case that the pipe was full, but we got
- * no data.
- */
- if (unlikely(wake_writer))
- wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
- kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
-
- /*
* But because we didn't read anything, at this point we can
* just return directly with -ERESTARTSYS if we're interrupted,
* since we've done any required wakeups and there's no need
@@ -390,7 +389,6 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
return -ERESTARTSYS;
- wake_writer = false;
wake_next_reader = true;
mutex_lock(&pipe->mutex);
}
@@ -403,8 +401,15 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
if (wake_next_reader)
wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
+ return ret;
+}
+
+static ssize_t
+fifo_pipe_read(struct kiocb *iocb, struct iov_iter *to)
+{
+ int ret = anon_pipe_read(iocb, to);
if (ret > 0)
- file_accessed(filp);
+ file_accessed(iocb->ki_filp);
return ret;
}
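
One observable side effect of splitting the fifo and anonymous-pipe
file_operations, assuming nothing else restores it: read(2) on a
pipe(2) descriptor no longer updates atime, while FIFOs keep doing so
via fifo_pipe_read(). A small check:

	#include <stdio.h>
	#include <sys/stat.h>
	#include <unistd.h>

	void pipe_atime_check(void)
	{
		int fds[2];
		struct stat st;
		char c;

		if (pipe(fds) < 0)
			return;
		write(fds[1], "x", 1);
		read(fds[0], &c, 1);
		fstat(fds[0], &st);
		/* st_atim stays at creation time for anonymous pipes. */
		printf("atime: %lld\n", (long long)st.st_atim.tv_sec);
	}
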
@@ -424,7 +429,7 @@ static inline bool pipe_writable(const struct pipe_inode_info *pipe)
}
static ssize_t
-pipe_write(struct kiocb *iocb, struct iov_iter *from)
+anon_pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *filp = iocb->ki_filp;
struct pipe_inode_info *pipe = filp->private_data;
@@ -471,8 +476,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
was_empty = pipe_empty(head, pipe->tail);
chars = total_len & (PAGE_SIZE-1);
if (chars && !was_empty) {
- unsigned int mask = pipe->ring_size - 1;
- struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
+ struct pipe_buffer *buf = pipe_buf(pipe, head - 1);
int offset = buf->offset + buf->len;
if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
@@ -503,54 +507,44 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
head = pipe->head;
if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
- unsigned int mask = pipe->ring_size - 1;
struct pipe_buffer *buf;
- struct page *page = pipe->tmp_page;
+ struct page *page;
int copied;
- if (!page) {
- page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
- if (unlikely(!page)) {
- ret = ret ? : -ENOMEM;
- break;
- }
- pipe->tmp_page = page;
+ page = anon_pipe_get_page(pipe);
+ if (unlikely(!page)) {
+ if (!ret)
+ ret = -ENOMEM;
+ break;
}
- /* Allocate a slot in the ring in advance and attach an
- * empty buffer. If we fault or otherwise fail to use
- * it, either the reader will consume it or it'll still
- * be there for the next write.
- */
- pipe->head = head + 1;
+ copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
+ if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
+ anon_pipe_put_page(pipe, page);
+ if (!ret)
+ ret = -EFAULT;
+ break;
+ }
+ pipe->head = head + 1;
/* Insert it into the buffer array */
- buf = &pipe->bufs[head & mask];
+ buf = pipe_buf(pipe, head);
buf->page = page;
buf->ops = &anon_pipe_buf_ops;
buf->offset = 0;
- buf->len = 0;
if (is_packetized(filp))
buf->flags = PIPE_BUF_FLAG_PACKET;
else
buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
- pipe->tmp_page = NULL;
- copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
- if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
- if (!ret)
- ret = -EFAULT;
- break;
- }
- ret += copied;
buf->len = copied;
+ ret += copied;
if (!iov_iter_count(from))
break;
- }
- if (!pipe_full(head, pipe->tail, pipe->max_usage))
continue;
+ }
/* Wait for buffer space to become available. */
if ((filp->f_flags & O_NONBLOCK) ||
@@ -602,11 +596,21 @@ out:
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
if (wake_next_writer)
wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
- if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
- int err = file_update_time(filp);
- if (err)
- ret = err;
- sb_end_write(file_inode(filp)->i_sb);
+ return ret;
+}
+
+static ssize_t
+fifo_pipe_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ int ret = anon_pipe_write(iocb, from);
+ if (ret > 0) {
+ struct file *filp = iocb->ki_filp;
+ if (sb_start_write_trylock(file_inode(filp)->i_sb)) {
+ int err = file_update_time(filp);
+ if (err)
+ ret = err;
+ sb_end_write(file_inode(filp)->i_sb);
+ }
}
return ret;
}
@@ -853,8 +857,10 @@ void free_pipe_info(struct pipe_inode_info *pipe)
if (pipe->watch_queue)
put_watch_queue(pipe->watch_queue);
#endif
- if (pipe->tmp_page)
- __free_page(pipe->tmp_page);
+ for (i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) {
+ if (pipe->tmp_page[i])
+ __free_page(pipe->tmp_page[i]);
+ }
kfree(pipe->bufs);
kfree(pipe);
}
@@ -874,6 +880,8 @@ static const struct dentry_operations pipefs_dentry_operations = {
.d_dname = pipefs_dname,
};
+static const struct file_operations pipeanon_fops;
+
static struct inode * get_pipe_inode(void)
{
struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
@@ -891,7 +899,7 @@ static struct inode * get_pipe_inode(void)
inode->i_pipe = pipe;
pipe->files = 2;
pipe->readers = pipe->writers = 1;
- inode->i_fop = &pipefifo_fops;
+ inode->i_fop = &pipeanon_fops;
/*
* Mark the inode dirty from the very beginning,
@@ -934,7 +942,7 @@ int create_pipe_files(struct file **res, int flags)
f = alloc_file_pseudo(inode, pipe_mnt, "",
O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
- &pipefifo_fops);
+ &pipeanon_fops);
if (IS_ERR(f)) {
free_pipe_info(inode->i_pipe);
iput(inode);
@@ -945,7 +953,7 @@ int create_pipe_files(struct file **res, int flags)
f->f_pipe = 0;
res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
- &pipefifo_fops);
+ &pipeanon_fops);
if (IS_ERR(res[0])) {
put_pipe_info(inode, inode->i_pipe);
fput(f);
@@ -1109,8 +1117,8 @@ static void wake_up_partner(struct pipe_inode_info *pipe)
static int fifo_open(struct inode *inode, struct file *filp)
{
+ bool is_pipe = inode->i_fop == &pipeanon_fops;
struct pipe_inode_info *pipe;
- bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
int ret;
filp->f_pipe = 0;
@@ -1234,8 +1242,19 @@ err:
const struct file_operations pipefifo_fops = {
.open = fifo_open,
- .read_iter = pipe_read,
- .write_iter = pipe_write,
+ .read_iter = fifo_pipe_read,
+ .write_iter = fifo_pipe_write,
+ .poll = pipe_poll,
+ .unlocked_ioctl = pipe_ioctl,
+ .release = pipe_release,
+ .fasync = pipe_fasync,
+ .splice_write = iter_file_splice_write,
+};
+
+static const struct file_operations pipeanon_fops = {
+ .open = fifo_open,
+ .read_iter = anon_pipe_read,
+ .write_iter = anon_pipe_write,
.poll = pipe_poll,
.unlocked_ioctl = pipe_ioctl,
.release = pipe_release,
@@ -1271,6 +1290,10 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
struct pipe_buffer *bufs;
unsigned int head, tail, mask, n;
+ /* nr_slots larger than limits of pipe->{head,tail} */
+ if (unlikely(nr_slots > (pipe_index_t)-1u))
+ return -EINVAL;
+
bufs = kcalloc(nr_slots, sizeof(*bufs),
GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (unlikely(!bufs))
@@ -1390,7 +1413,9 @@ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
{
struct pipe_inode_info *pipe = file->private_data;
- if (file->f_op != &pipefifo_fops || !pipe)
+ if (!pipe)
+ return NULL;
+ if (file->f_op != &pipefifo_fops && file->f_op != &pipeanon_fops)
return NULL;
if (for_splice && pipe_has_watch_queue(pipe))
return NULL;
diff --git a/fs/pnode.c b/fs/pnode.c
index ef048f008bdd..7a062a5de10e 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -150,7 +150,7 @@ static struct mount *propagation_next(struct mount *m,
struct mount *origin)
{
/* are there any slaves of this mount? */
- if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
+ if (!IS_MNT_PROPAGATED(m) && !list_empty(&m->mnt_slave_list))
return first_slave(m);
while (1) {
@@ -174,7 +174,7 @@ static struct mount *skip_propagation_subtree(struct mount *m,
* Advance m such that propagation_next will not return
* the slaves of m.
*/
- if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
+ if (!IS_MNT_PROPAGATED(m) && !list_empty(&m->mnt_slave_list))
m = last_slave(m);
return m;
@@ -185,7 +185,7 @@ static struct mount *next_group(struct mount *m, struct mount *origin)
while (1) {
while (1) {
struct mount *next;
- if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
+ if (!IS_MNT_PROPAGATED(m) && !list_empty(&m->mnt_slave_list))
return first_slave(m);
next = next_peer(m);
if (m->mnt_group_id == origin->mnt_group_id) {
@@ -226,7 +226,7 @@ static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
struct mount *child;
int type;
/* skip ones added by this propagate_mnt() */
- if (IS_MNT_NEW(m))
+ if (IS_MNT_PROPAGATED(m))
return 0;
/* skip if mountpoint isn't covered by it */
if (!is_subdir(dest_mp->m_dentry, m->mnt.mnt_root))
@@ -380,7 +380,7 @@ bool propagation_would_overmount(const struct mount *from,
if (!IS_MNT_SHARED(from))
return false;
- if (IS_MNT_NEW(to))
+ if (IS_MNT_PROPAGATED(to))
return false;
if (to->mnt.mnt_root != mp->m_dentry)
@@ -549,8 +549,10 @@ static void restore_mounts(struct list_head *to_restore)
mp = parent->mnt_mp;
parent = parent->mnt_parent;
}
- if (parent != mnt->mnt_parent)
+ if (parent != mnt->mnt_parent) {
mnt_change_mountpoint(parent, mp, mnt);
+ mnt_notify_add(mnt);
+ }
}
}
diff --git a/fs/pnode.h b/fs/pnode.h
index 0b02a6393891..ddafe0d087ca 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -12,7 +12,7 @@
#define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED)
#define IS_MNT_SLAVE(m) ((m)->mnt_master)
-#define IS_MNT_NEW(m) (!(m)->mnt_ns || is_anon_ns((m)->mnt_ns))
+#define IS_MNT_PROPAGATED(m) (!(m)->mnt_ns || ((m)->mnt_ns->mntns_flags & MNTNS_PROPAGATING))
#define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED)
#define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE)
#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index cd89e956c322..5538c4aee8fa 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1489,7 +1489,6 @@ static const struct file_operations proc_fail_nth_operations = {
#endif
-#ifdef CONFIG_SCHED_DEBUG
/*
* Print out various scheduling related per-task fields:
*/
@@ -1539,8 +1538,6 @@ static const struct file_operations proc_pid_sched_operations = {
.release = single_release,
};
-#endif
-
#ifdef CONFIG_SCHED_AUTOGROUP
/*
* Print out autogroup related information:
@@ -2497,11 +2494,9 @@ static const struct file_operations proc_map_files_operations = {
#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
struct timers_private {
- struct pid *pid;
- struct task_struct *task;
- struct sighand_struct *sighand;
- struct pid_namespace *ns;
- unsigned long flags;
+ struct pid *pid;
+ struct task_struct *task;
+ struct pid_namespace *ns;
};
static void *timers_start(struct seq_file *m, loff_t *pos)
@@ -2512,54 +2507,48 @@ static void *timers_start(struct seq_file *m, loff_t *pos)
if (!tp->task)
return ERR_PTR(-ESRCH);
- tp->sighand = lock_task_sighand(tp->task, &tp->flags);
- if (!tp->sighand)
- return ERR_PTR(-ESRCH);
-
- return seq_hlist_start(&tp->task->signal->posix_timers, *pos);
+ rcu_read_lock();
+ return seq_hlist_start_rcu(&tp->task->signal->posix_timers, *pos);
}
static void *timers_next(struct seq_file *m, void *v, loff_t *pos)
{
struct timers_private *tp = m->private;
- return seq_hlist_next(v, &tp->task->signal->posix_timers, pos);
+
+ return seq_hlist_next_rcu(v, &tp->task->signal->posix_timers, pos);
}
static void timers_stop(struct seq_file *m, void *v)
{
struct timers_private *tp = m->private;
- if (tp->sighand) {
- unlock_task_sighand(tp->task, &tp->flags);
- tp->sighand = NULL;
- }
-
if (tp->task) {
put_task_struct(tp->task);
tp->task = NULL;
+ rcu_read_unlock();
}
}
static int show_timer(struct seq_file *m, void *v)
{
- struct k_itimer *timer;
- struct timers_private *tp = m->private;
- int notify;
static const char * const nstr[] = {
- [SIGEV_SIGNAL] = "signal",
- [SIGEV_NONE] = "none",
- [SIGEV_THREAD] = "thread",
+ [SIGEV_SIGNAL] = "signal",
+ [SIGEV_NONE] = "none",
+ [SIGEV_THREAD] = "thread",
};
- timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list);
- notify = timer->it_sigev_notify;
+ struct k_itimer *timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list);
+ struct timers_private *tp = m->private;
+ int notify = timer->it_sigev_notify;
+
+ guard(spinlock_irq)(&timer->it_lock);
+ if (!posixtimer_valid(timer))
+ return 0;
seq_printf(m, "ID: %d\n", timer->it_id);
- seq_printf(m, "signal: %d/%px\n",
- timer->sigq.info.si_signo,
+ seq_printf(m, "signal: %d/%px\n", timer->sigq.info.si_signo,
timer->sigq.info.si_value.sival_ptr);
- seq_printf(m, "notify: %s/%s.%d\n",
- nstr[notify & ~SIGEV_THREAD_ID],
+ seq_printf(m, "notify: %s/%s.%d\n", nstr[notify & ~SIGEV_THREAD_ID],
(notify & SIGEV_THREAD_ID) ? "tid" : "pid",
pid_nr_ns(timer->it_pid, tp->ns));
seq_printf(m, "ClockID: %d\n", timer->it_clock);
@@ -3331,9 +3320,7 @@ static const struct pid_entry tgid_base_stuff[] = {
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUSR, proc_pid_personality),
ONE("limits", S_IRUGO, proc_pid_limits),
-#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
-#endif
#ifdef CONFIG_SCHED_AUTOGROUP
REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
#endif
@@ -3682,9 +3669,7 @@ static const struct pid_entry tid_base_stuff[] = {
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUSR, proc_pid_personality),
ONE("limits", S_IRUGO, proc_pid_limits),
-#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
-#endif
NOD("comm", S_IFREG|S_IRUGO|S_IWUSR,
&proc_tid_comm_inode_operations,
&proc_pid_set_comm_operations, {}),
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 8ec90826a49e..a3e22803cddf 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -559,10 +559,16 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
return p;
}
-static inline void pde_set_flags(struct proc_dir_entry *pde)
+static void pde_set_flags(struct proc_dir_entry *pde)
{
if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT)
pde->flags |= PROC_ENTRY_PERMANENT;
+ if (pde->proc_ops->proc_read_iter)
+ pde->flags |= PROC_ENTRY_proc_read_iter;
+#ifdef CONFIG_COMPAT
+ if (pde->proc_ops->proc_compat_ioctl)
+ pde->flags |= PROC_ENTRY_proc_compat_ioctl;
+#endif
}
struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
@@ -626,6 +632,7 @@ struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode,
p->proc_ops = &proc_seq_ops;
p->seq_ops = ops;
p->state_size = state_size;
+ pde_set_flags(p);
return proc_register(parent, p);
}
EXPORT_SYMBOL(proc_create_seq_private);
@@ -656,6 +663,7 @@ struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode,
return NULL;
p->proc_ops = &proc_single_ops;
p->single_show = show;
+ pde_set_flags(p);
return proc_register(parent, p);
}
EXPORT_SYMBOL(proc_create_single_data);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 626ad7bd94f2..a3eb3b740f76 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -656,13 +656,13 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
if (S_ISREG(inode->i_mode)) {
inode->i_op = de->proc_iops;
- if (de->proc_ops->proc_read_iter)
+ if (pde_has_proc_read_iter(de))
inode->i_fop = &proc_iter_file_ops;
else
inode->i_fop = &proc_reg_file_ops;
#ifdef CONFIG_COMPAT
- if (de->proc_ops->proc_compat_ioctl) {
- if (de->proc_ops->proc_read_iter)
+ if (pde_has_proc_compat_ioctl(de)) {
+ if (pde_has_proc_read_iter(de))
inode->i_fop = &proc_iter_file_ops_compat;
else
inode->i_fop = &proc_reg_file_ops_compat;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 1695509370b8..77a517f91821 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -85,6 +85,20 @@ static inline void pde_make_permanent(struct proc_dir_entry *pde)
pde->flags |= PROC_ENTRY_PERMANENT;
}
+static inline bool pde_has_proc_read_iter(const struct proc_dir_entry *pde)
+{
+ return pde->flags & PROC_ENTRY_proc_read_iter;
+}
+
+static inline bool pde_has_proc_compat_ioctl(const struct proc_dir_entry *pde)
+{
+#ifdef CONFIG_COMPAT
+ return pde->flags & PROC_ENTRY_proc_compat_ioctl;
+#else
+ return false;
+#endif
+}
+
extern struct kmem_cache *proc_dir_entry_cache;
void pde_free(struct proc_dir_entry *pde);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 1cb33771bf9f..728630b10fdf 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -34,8 +34,6 @@
#include <asm/sections.h>
#include "internal.h"
-#define CORE_STR "CORE"
-
#ifndef ELF_CORE_EFLAGS
#define ELF_CORE_EFLAGS 0
#endif
@@ -122,7 +120,9 @@ static void update_kcore_size(void)
kcore_phdrs_len = kcore_nphdr * sizeof(struct elf_phdr);
kcore_notes_len = (4 * sizeof(struct elf_note) +
- 3 * ALIGN(sizeof(CORE_STR), 4) +
+ ALIGN(sizeof(NN_PRSTATUS), 4) +
+ ALIGN(sizeof(NN_PRPSINFO), 4) +
+ ALIGN(sizeof(NN_TASKSTRUCT), 4) +
VMCOREINFO_NOTE_NAME_BYTES +
ALIGN(sizeof(struct elf_prstatus), 4) +
ALIGN(sizeof(struct elf_prpsinfo), 4) +
@@ -443,11 +443,11 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
goto out;
}
- append_kcore_note(notes, &i, CORE_STR, NT_PRSTATUS, &prstatus,
+ append_kcore_note(notes, &i, NN_PRSTATUS, NT_PRSTATUS, &prstatus,
sizeof(prstatus));
- append_kcore_note(notes, &i, CORE_STR, NT_PRPSINFO, &prpsinfo,
+ append_kcore_note(notes, &i, NN_PRPSINFO, NT_PRPSINFO, &prpsinfo,
sizeof(prpsinfo));
- append_kcore_note(notes, &i, CORE_STR, NT_TASKSTRUCT, current,
+ append_kcore_note(notes, &i, NN_TASKSTRUCT, NT_TASKSTRUCT, current,
arch_task_struct_size);
/*
* vmcoreinfo_size is mostly constant after init time, but it
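
The kcore hunks replace the single shared "CORE" name string with per-note name macros (NN_PRSTATUS and friends), so the size computation has to account for each note's name length individually. For reference, an ELF note record is a fixed header followed by the 4-byte-padded name and 4-byte-padded descriptor; a sketch of the arithmetic using the userspace Elf64_Nhdr type rather than the kernel's struct elf_note:

	#include <elf.h>
	#include <stddef.h>

	/* Size of one ELF note record: header + padded name + padded desc. */
	static size_t elf_note_size(size_t namesz, size_t descsz)
	{
		size_t align4 = 3;	/* both fields are padded to 4 bytes */

		return sizeof(Elf64_Nhdr) + ((namesz + align4) & ~align4)
					  + ((descsz + align4) & ~align4);
	}
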
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 56815799ce79..bb3b769edc71 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -14,10 +14,10 @@
#include <linux/init.h>
#include <linux/list.h>
#include <linux/string.h>
-#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/ramfs.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
+#include <linux/fs_context.h>
#include <linux/sched.h>
#include <linux/magic.h>
#include <linux/pstore.h>
@@ -226,37 +226,38 @@ static struct inode *pstore_get_inode(struct super_block *sb)
}
enum {
- Opt_kmsg_bytes, Opt_err
+ Opt_kmsg_bytes
};
-static const match_table_t tokens = {
- {Opt_kmsg_bytes, "kmsg_bytes=%u"},
- {Opt_err, NULL}
+static const struct fs_parameter_spec pstore_param_spec[] = {
+ fsparam_u32 ("kmsg_bytes", Opt_kmsg_bytes),
+ {}
};
-static void parse_options(char *options)
-{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int option;
-
- if (!options)
- return;
+struct pstore_context {
+ unsigned int kmsg_bytes;
+};
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
+static int pstore_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct pstore_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ int opt;
- if (!*p)
- continue;
+ opt = fs_parse(fc, pstore_param_spec, param, &result);
+ /* pstore has historically ignored invalid kmsg_bytes param */
+ if (opt < 0)
+ return 0;
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_kmsg_bytes:
- if (!match_int(&args[0], &option))
- pstore_set_kmsg_bytes(option);
- break;
- }
+ switch (opt) {
+ case Opt_kmsg_bytes:
+ ctx->kmsg_bytes = result.uint_32;
+ break;
+ default:
+ return -EINVAL;
}
+
+ return 0;
}
/*
@@ -265,14 +266,16 @@ static void parse_options(char *options)
static int pstore_show_options(struct seq_file *m, struct dentry *root)
{
if (kmsg_bytes != CONFIG_PSTORE_DEFAULT_KMSG_BYTES)
- seq_printf(m, ",kmsg_bytes=%lu", kmsg_bytes);
+ seq_printf(m, ",kmsg_bytes=%u", kmsg_bytes);
return 0;
}
-static int pstore_remount(struct super_block *sb, int *flags, char *data)
+static int pstore_reconfigure(struct fs_context *fc)
{
- sync_filesystem(sb);
- parse_options(data);
+ struct pstore_context *ctx = fc->fs_private;
+
+ sync_filesystem(fc->root->d_sb);
+ pstore_set_kmsg_bytes(ctx->kmsg_bytes);
return 0;
}
@@ -281,7 +284,6 @@ static const struct super_operations pstore_ops = {
.statfs = simple_statfs,
.drop_inode = generic_delete_inode,
.evict_inode = pstore_evict_inode,
- .remount_fs = pstore_remount,
.show_options = pstore_show_options,
};
@@ -406,8 +408,9 @@ void pstore_get_records(int quiet)
inode_unlock(d_inode(root));
}
-static int pstore_fill_super(struct super_block *sb, void *data, int silent)
+static int pstore_fill_super(struct super_block *sb, struct fs_context *fc)
{
+ struct pstore_context *ctx = fc->fs_private;
struct inode *inode;
sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -417,7 +420,7 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent)
sb->s_op = &pstore_ops;
sb->s_time_gran = 1;
- parse_options(data);
+ pstore_set_kmsg_bytes(ctx->kmsg_bytes);
inode = pstore_get_inode(sb);
if (inode) {
@@ -438,12 +441,26 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent)
return 0;
}
-static struct dentry *pstore_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int pstore_get_tree(struct fs_context *fc)
+{
+ if (fc->root)
+ return pstore_reconfigure(fc);
+
+ return get_tree_single(fc, pstore_fill_super);
+}
+
+static void pstore_free_fc(struct fs_context *fc)
{
- return mount_single(fs_type, flags, data, pstore_fill_super);
+ kfree(fc->fs_private);
}
+static const struct fs_context_operations pstore_context_ops = {
+ .parse_param = pstore_parse_param,
+ .get_tree = pstore_get_tree,
+ .reconfigure = pstore_reconfigure,
+ .free = pstore_free_fc,
+};
+
static void pstore_kill_sb(struct super_block *sb)
{
guard(mutex)(&pstore_sb_lock);
@@ -456,11 +473,33 @@ static void pstore_kill_sb(struct super_block *sb)
INIT_LIST_HEAD(&records_list);
}
+static int pstore_init_fs_context(struct fs_context *fc)
+{
+ struct pstore_context *ctx;
+
+ ctx = kzalloc(sizeof(struct pstore_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ /*
+ * The global kmsg_bytes is initialized to its default and updated
+ * every time we (re)mount the single-sb filesystem with the
+ * option specified.
+ */
+ ctx->kmsg_bytes = kmsg_bytes;
+
+ fc->fs_private = ctx;
+ fc->ops = &pstore_context_ops;
+
+ return 0;
+}
+
static struct file_system_type pstore_fs_type = {
.owner = THIS_MODULE,
.name = "pstore",
- .mount = pstore_mount,
.kill_sb = pstore_kill_sb,
+ .init_fs_context = pstore_init_fs_context,
+ .parameters = pstore_param_spec,
};
int __init pstore_init_fs(void)
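
The pstore conversion above is the standard new-mount-API recipe: declare an fs_parameter_spec table, parse each parameter into per-context state via fs_parse(), and apply that state from fill_super/reconfigure. A stripped-down sketch of the parsing half, using the real fs_parse() interface but hypothetical "myfs" names and a hypothetical "blobsize" option:

	#include <linux/fs_context.h>
	#include <linux/fs_parser.h>

	enum { Opt_blobsize };		/* hypothetical option */

	static const struct fs_parameter_spec myfs_param_spec[] = {
		fsparam_u32("blobsize", Opt_blobsize),
		{}
	};

	struct myfs_context {
		unsigned int blobsize;
	};

	static int myfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
	{
		struct myfs_context *ctx = fc->fs_private;
		struct fs_parse_result result;
		int opt;

		opt = fs_parse(fc, myfs_param_spec, param, &result);
		if (opt < 0)
			return opt;	/* unlike pstore, reject unknown params */

		switch (opt) {
		case Opt_blobsize:
			ctx->blobsize = result.uint_32;
			break;
		}
		return 0;
	}
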
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 801d6c0b170c..a0fc51196910 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -6,7 +6,7 @@
#include <linux/time.h>
#include <linux/pstore.h>
-extern unsigned long kmsg_bytes;
+extern unsigned int kmsg_bytes;
#ifdef CONFIG_PSTORE_FTRACE
extern void pstore_register_ftrace(void);
@@ -35,7 +35,7 @@ static inline void pstore_unregister_pmsg(void) {}
extern struct pstore_info *psinfo;
-extern void pstore_set_kmsg_bytes(int);
+extern void pstore_set_kmsg_bytes(unsigned int bytes);
extern void pstore_get_records(int);
extern void pstore_get_backend_records(struct pstore_info *psi,
struct dentry *root, int quiet);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index f56b066ab80c..557cf9d40177 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -92,8 +92,8 @@ module_param(compress, charp, 0444);
MODULE_PARM_DESC(compress, "compression to use");
/* How much of the kernel log to snapshot */
-unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES;
-module_param(kmsg_bytes, ulong, 0444);
+unsigned int kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES;
+module_param(kmsg_bytes, uint, 0444);
MODULE_PARM_DESC(kmsg_bytes, "amount of kernel log to snapshot (in bytes)");
static void *compress_workspace;
@@ -107,9 +107,9 @@ static void *compress_workspace;
static char *big_oops_buf;
static size_t max_compressed_size;
-void pstore_set_kmsg_bytes(int bytes)
+void pstore_set_kmsg_bytes(unsigned int bytes)
{
- kmsg_bytes = bytes;
+ WRITE_ONCE(kmsg_bytes, bytes);
}
/* Tag each group of saved records with a sequence number */
@@ -278,6 +278,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
struct kmsg_dump_detail *detail)
{
struct kmsg_dump_iter iter;
+ unsigned int remaining = READ_ONCE(kmsg_bytes);
unsigned long total = 0;
const char *why;
unsigned int part = 1;
@@ -300,7 +301,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
kmsg_dump_rewind(&iter);
oopscount++;
- while (total < kmsg_bytes) {
+ while (total < remaining) {
char *dst;
size_t dst_size;
int header_size;
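
With kmsg_bytes now written via WRITE_ONCE() and snapshotted once with READ_ONCE() at the top of pstore_dump(), a concurrent remount that changes the option cannot move the loop bound underneath a dump in progress. The general snapshot pattern, sketched with a hypothetical tunable:

	#include <linux/compiler.h>	/* READ_ONCE / WRITE_ONCE */

	static unsigned int budget;	/* hypothetical tunable, updated at remount */

	static void consume(void)
	{
		/* Snapshot once; later updates can't tear or move the bound. */
		unsigned int remaining = READ_ONCE(budget);
		unsigned int total = 0;

		while (total < remaining)
			total += 16;	/* stand-in for one dump chunk */
	}

	static void set_budget(unsigned int bytes)
	{
		WRITE_ONCE(budget, bytes);
	}
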
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 8006faaaf0ec..775fa905fda0 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -119,13 +119,13 @@ out:
return error;
}
-static int ramfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *ramfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int retval = ramfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0);
if (!retval)
inc_nlink(dir);
- return retval;
+ return ERR_PTR(retval);
}
static int ramfs_create(struct mnt_idmap *idmap, struct inode *dir,
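
ramfs is one of many filesystems converted here to the new ->mkdir signature, which returns a struct dentry * instead of an int. As the hunk shows, an integer result is simply wrapped with ERR_PTR(); since ERR_PTR(0) is NULL, a successful creation that kept the caller's dentry returns NULL. A tiny sketch of that round trip:

	#include <linux/err.h>

	/* An integer error becomes a pointer; ERR_PTR(0) is simply NULL. */
	static struct dentry *wrap(int retval)
	{
		return ERR_PTR(retval);	/* 0 -> NULL, -ENOSPC -> error pointer */
	}

	static int unwrap(struct dentry *d)
	{
		if (IS_ERR(d))
			return PTR_ERR(d);
		return 0;
	}
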
diff --git a/fs/read_write.c b/fs/read_write.c
index a6133241dfb8..bb0ed26a0b3a 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -169,11 +169,16 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence,
if (whence == SEEK_CUR) {
/*
- * f_lock protects against read/modify/write race with
- * other SEEK_CURs. Note that parallel writes and reads
- * behave like SEEK_SET.
+ * If the file requires locking via f_pos_lock we know
+ * that mutual exclusion for SEEK_CUR on the same file
+ * is guaranteed. If the file isn't locked, we take
+ * f_lock to protect against f_pos races with other
+ * SEEK_CURs.
*/
- guard(spinlock)(&file->f_lock);
+ if (file_seek_cur_needs_f_lock(file)) {
+ guard(spinlock)(&file->f_lock);
+ return vfs_setpos(file, file->f_pos + offset, maxsize);
+ }
return vfs_setpos(file, file->f_pos + offset, maxsize);
}
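
guard(spinlock)() is the scope-based lock guard from <linux/cleanup.h>: the lock is dropped automatically when the enclosing scope ends, which is what makes the early return while holding the lock safe in the hunk above. A minimal self-contained sketch:

	#include <linux/cleanup.h>
	#include <linux/spinlock.h>

	static DEFINE_SPINLOCK(counter_lock);
	static unsigned long counter;

	static unsigned long counter_inc_return(void)
	{
		guard(spinlock)(&counter_lock);
		/* counter_lock is released automatically on every exit path. */
		return ++counter;
	}
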
diff --git a/fs/signalfd.c b/fs/signalfd.c
index d1a5f43ce466..d469782f97f4 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -277,15 +277,14 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags)
return ufd;
}
- file = anon_inode_getfile("[signalfd]", &signalfd_fops, ctx,
- O_RDWR | (flags & O_NONBLOCK));
+ file = anon_inode_getfile_fmode("[signalfd]", &signalfd_fops,
+ ctx, O_RDWR | (flags & O_NONBLOCK),
+ FMODE_NOWAIT);
if (IS_ERR(file)) {
put_unused_fd(ufd);
kfree(ctx);
return PTR_ERR(file);
}
- file->f_mode |= FMODE_NOWAIT;
-
fd_install(ufd, file);
} else {
CLASS(fd, f)(ufd);
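
anon_inode_getfile_fmode() takes the extra f_mode bits as an argument, so the file is created with FMODE_NOWAIT already set instead of being patched between creation and fd_install(). A hedged sketch of the call shape with a hypothetical driver context and fops:

	#include <linux/anon_inodes.h>
	#include <linux/err.h>
	#include <linux/file.h>
	#include <linux/fs.h>

	static const struct file_operations myctx_fops;	/* hypothetical */

	static int myctx_install(void *ctx, int ufd, int flags)
	{
		struct file *file;

		file = anon_inode_getfile_fmode("[myctx]", &myctx_fops, ctx,
						O_RDWR | (flags & O_NONBLOCK),
						FMODE_NOWAIT);
		if (IS_ERR(file))
			return PTR_ERR(file);

		fd_install(ufd, file);
		return ufd;
	}
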
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index 831fee962c4d..8dea0cf3a8de 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -59,8 +59,8 @@ extern int cifs_unlink(struct inode *dir, struct dentry *dentry);
extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *);
extern int cifs_mknod(struct mnt_idmap *, struct inode *, struct dentry *,
umode_t, dev_t);
-extern int cifs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *,
- umode_t);
+extern struct dentry *cifs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *,
+ umode_t);
extern int cifs_rmdir(struct inode *, struct dentry *);
extern int cifs_rename2(struct mnt_idmap *, struct inode *,
struct dentry *, struct inode *, struct dentry *,
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index d07682020c64..4fc9485c5d91 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -114,19 +114,23 @@ again:
mutex_lock(&ses->session_mutex);
/*
- * Recheck after acquire mutex. If another thread is negotiating
- * and the server never sends an answer the socket will be closed
- * and tcpStatus set to reconnect.
+ * Handle the case where a concurrent thread failed to negotiate or
+ * killed a channel.
*/
spin_lock(&server->srv_lock);
- if (server->tcpStatus == CifsNeedReconnect) {
+ switch (server->tcpStatus) {
+ case CifsExiting:
spin_unlock(&server->srv_lock);
mutex_unlock(&ses->session_mutex);
-
- if (tcon->retry)
- goto again;
- rc = -EHOSTDOWN;
- goto out;
+ return -EHOSTDOWN;
+ case CifsNeedReconnect:
+ spin_unlock(&server->srv_lock);
+ mutex_unlock(&ses->session_mutex);
+ if (!tcon->retry)
+ return -EHOSTDOWN;
+ goto again;
+ default:
+ break;
}
spin_unlock(&server->srv_lock);
@@ -152,16 +156,20 @@ again:
spin_unlock(&ses->ses_lock);
rc = cifs_negotiate_protocol(0, ses, server);
- if (!rc) {
- rc = cifs_setup_session(0, ses, server, ses->local_nls);
- if ((rc == -EACCES) || (rc == -EHOSTDOWN) || (rc == -EKEYREVOKED)) {
- /*
- * Try alternate password for next reconnect if an alternate
- * password is available.
- */
- if (ses->password2)
- swap(ses->password2, ses->password);
- }
+ if (rc) {
+ mutex_unlock(&ses->session_mutex);
+ if (!tcon->retry)
+ return -EHOSTDOWN;
+ goto again;
+ }
+ rc = cifs_setup_session(0, ses, server, ses->local_nls);
+ if ((rc == -EACCES) || (rc == -EHOSTDOWN) || (rc == -EKEYREVOKED)) {
+ /*
+ * Try an alternate password for the next reconnect if an
+ * alternate password is available.
+ */
+ if (ses->password2)
+ swap(ses->password2, ses->password);
}
/* do we need to reconnect tcon? */
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 8582cf61242c..9e4f7378f30f 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -388,7 +388,7 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon)
spin_unlock(&tcon->tc_lock);
/*
- * BB Add call to invalidate_inodes(sb) for all superblocks mounted
+ * BB Add call to evict_inodes(sb) for all superblocks mounted
* to this tcon.
*/
}
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index 616149c7f0a5..3bb21aa58474 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -2207,8 +2207,8 @@ posix_mkdir_get_info:
}
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
-int cifs_mkdir(struct mnt_idmap *idmap, struct inode *inode,
- struct dentry *direntry, umode_t mode)
+struct dentry *cifs_mkdir(struct mnt_idmap *idmap, struct inode *inode,
+ struct dentry *direntry, umode_t mode)
{
int rc = 0;
unsigned int xid;
@@ -2224,10 +2224,10 @@ int cifs_mkdir(struct mnt_idmap *idmap, struct inode *inode,
cifs_sb = CIFS_SB(inode->i_sb);
if (unlikely(cifs_forced_shutdown(cifs_sb)))
- return -EIO;
+ return ERR_PTR(-EIO);
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
- return PTR_ERR(tlink);
+ return ERR_CAST(tlink);
tcon = tlink_tcon(tlink);
xid = get_xid();
@@ -2283,7 +2283,7 @@ mkdir_out:
free_dentry_path(page);
free_xid(xid);
cifs_put_tlink(tlink);
- return rc;
+ return ERR_PTR(rc);
}
int cifs_rmdir(struct inode *inode, struct dentry *direntry)
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index ed7812247ebc..f9c521b3c65e 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -300,32 +300,23 @@ again:
mutex_lock(&ses->session_mutex);
/*
- * if this is called by delayed work, and the channel has been disabled
- * in parallel, the delayed work can continue to execute in parallel
- * there's a chance that this channel may not exist anymore
+ * Handle the case where a concurrent thread failed to negotiate or
+ * killed a channel.
*/
spin_lock(&server->srv_lock);
- if (server->tcpStatus == CifsExiting) {
+ switch (server->tcpStatus) {
+ case CifsExiting:
spin_unlock(&server->srv_lock);
mutex_unlock(&ses->session_mutex);
- rc = -EHOSTDOWN;
- goto out;
- }
-
- /*
- * Recheck after acquire mutex. If another thread is negotiating
- * and the server never sends an answer the socket will be closed
- * and tcpStatus set to reconnect.
- */
- if (server->tcpStatus == CifsNeedReconnect) {
+ return -EHOSTDOWN;
+ case CifsNeedReconnect:
spin_unlock(&server->srv_lock);
mutex_unlock(&ses->session_mutex);
-
- if (tcon->retry)
- goto again;
-
- rc = -EHOSTDOWN;
- goto out;
+ if (!tcon->retry)
+ return -EHOSTDOWN;
+ goto again;
+ default:
+ break;
}
spin_unlock(&server->srv_lock);
@@ -350,43 +341,41 @@ again:
spin_unlock(&ses->ses_lock);
rc = cifs_negotiate_protocol(0, ses, server);
- if (!rc) {
- /*
- * if server stopped supporting multichannel
- * and the first channel reconnected, disable all the others.
- */
- if (ses->chan_count > 1 &&
- !(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
- rc = cifs_chan_skip_or_disable(ses, server,
- from_reconnect);
- if (rc) {
- mutex_unlock(&ses->session_mutex);
- goto out;
- }
- }
-
- rc = cifs_setup_session(0, ses, server, ses->local_nls);
- if ((rc == -EACCES) || (rc == -EKEYEXPIRED) || (rc == -EKEYREVOKED)) {
- /*
- * Try alternate password for next reconnect (key rotation
- * could be enabled on the server e.g.) if an alternate
- * password is available and the current password is expired,
- * but do not swap on non pwd related errors like host down
- */
- if (ses->password2)
- swap(ses->password2, ses->password);
- }
-
- if ((rc == -EACCES) && !tcon->retry) {
- mutex_unlock(&ses->session_mutex);
- rc = -EHOSTDOWN;
- goto failed;
- } else if (rc) {
+ if (rc) {
+ mutex_unlock(&ses->session_mutex);
+ if (!tcon->retry)
+ return -EHOSTDOWN;
+ goto again;
+ }
+ /*
+ * If the server stopped supporting multichannel and the first
+ * channel has reconnected, disable all the other channels.
+ */
+ if (ses->chan_count > 1 &&
+ !(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
+ rc = cifs_chan_skip_or_disable(ses, server,
+ from_reconnect);
+ if (rc) {
mutex_unlock(&ses->session_mutex);
goto out;
}
- } else {
+ }
+
+ rc = cifs_setup_session(0, ses, server, ses->local_nls);
+ if ((rc == -EACCES) || (rc == -EKEYEXPIRED) || (rc == -EKEYREVOKED)) {
+ /*
+ * Try an alternate password for the next reconnect (e.g. key
+ * rotation could be enabled on the server) if an alternate
+ * password is available and the current password is expired,
+ * but do not swap on non-password-related errors like host down.
+ */
+ if (ses->password2)
+ swap(ses->password2, ses->password);
+ }
+ if (rc) {
mutex_unlock(&ses->session_mutex);
+ if (rc == -EACCES && !tcon->retry)
+ return -EHOSTDOWN;
goto out;
}
@@ -490,7 +479,6 @@ out:
case SMB2_IOCTL:
rc = -EAGAIN;
}
-failed:
return rc;
}
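
Both SMB reconnect reworks above replace nested if/else-plus-goto error handling with a switch on tcpStatus and early returns that drop the locks at each exit. Reduced to its control flow, with hypothetical names and -1 standing in for -EHOSTDOWN:

	enum status { EXITING, NEED_RECONNECT, HEALTHY };

	/* Sketch of the retry/early-return shape; locking omitted. */
	static int reconnect(enum status st, int can_retry)
	{
	again:
		switch (st) {
		case EXITING:
			return -1;		/* -EHOSTDOWN in the patch */
		case NEED_RECONNECT:
			if (!can_retry)
				return -1;
			st = HEALTHY;		/* assume the retry succeeds */
			goto again;
		default:
			break;
		}
		return 0;			/* proceed with session setup */
	}
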
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 6890016e1923..8554aa5a1059 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -113,11 +113,6 @@ static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf,
if (IS_ERR(d))
goto err_out;
- if (d_is_negative(d)) {
- dput(d);
- goto err_out;
- }
-
path->dentry = d;
path->mnt = mntget(parent_path->mnt);
@@ -211,8 +206,8 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode)
{
struct mnt_idmap *idmap;
struct path path;
- struct dentry *dentry;
- int err;
+ struct dentry *dentry, *d;
+ int err = 0;
dentry = ksmbd_vfs_kern_path_create(work, name,
LOOKUP_NO_SYMLINKS | LOOKUP_DIRECTORY,
@@ -227,27 +222,15 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode)
idmap = mnt_idmap(path.mnt);
mode |= S_IFDIR;
- err = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode);
- if (!err && d_unhashed(dentry)) {
- struct dentry *d;
-
- d = lookup_one(idmap, dentry->d_name.name, dentry->d_parent,
- dentry->d_name.len);
- if (IS_ERR(d)) {
- err = PTR_ERR(d);
- goto out_err;
- }
- if (unlikely(d_is_negative(d))) {
- dput(d);
- err = -ENOENT;
- goto out_err;
- }
-
- ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(d));
- dput(d);
- }
+ d = dentry;
+ dentry = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode);
+ if (IS_ERR(dentry))
+ err = PTR_ERR(dentry);
+ else if (d_is_negative(dentry))
+ err = -ENOENT;
+ if (!err && dentry != d)
+ ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(dentry));
-out_err:
done_path_create(&path, dentry);
if (err)
pr_err("mkdir(%s): creation failed (err:%d)\n", name, err);
@@ -693,6 +676,7 @@ int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path,
struct ksmbd_file *parent_fp;
int new_type;
int err, lookup_flags = LOOKUP_NO_SYMLINKS;
+ int target_lookup_flags = LOOKUP_RENAME_TARGET;
if (ksmbd_override_fsids(work))
return -ENOMEM;
@@ -703,6 +687,14 @@ int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path,
goto revert_fsids;
}
+ /*
+ * Explicitly handle the file overwrite case, for compatibility with
+ * filesystems that may not support rename flags (e.g. fuse).
+ */
+ if (flags & RENAME_NOREPLACE)
+ target_lookup_flags |= LOOKUP_EXCL;
+ flags &= ~(RENAME_NOREPLACE);
+
retry:
err = vfs_path_parent_lookup(to, lookup_flags | LOOKUP_BENEATH,
&new_path, &new_last, &new_type,
@@ -743,7 +735,7 @@ retry:
}
new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry,
- lookup_flags | LOOKUP_RENAME_TARGET);
+ lookup_flags | target_lookup_flags);
if (IS_ERR(new_dentry)) {
err = PTR_ERR(new_dentry);
goto out3;
@@ -754,16 +746,6 @@ retry:
goto out4;
}
- /*
- * explicitly handle file overwrite case, for compatibility with
- * filesystems that may not support rename flags (e.g: fuse)
- */
- if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry)) {
- err = -EEXIST;
- goto out4;
- }
- flags &= ~(RENAME_NOREPLACE);
-
if (old_child == trap) {
err = -EINVAL;
goto out4;
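
As in the ramfs conversion above, vfs_mkdir() now returns the dentry of the created directory, which may differ from the one passed in; ksmbd keeps the original pointer in d purely to detect that case. A hedged sketch of the new calling convention (reference handling around failure follows the updated vfs_mkdir contract and is omitted here):

	#include <linux/err.h>
	#include <linux/fs.h>
	#include <linux/namei.h>

	static int make_subdir(struct mnt_idmap *idmap, struct inode *dir,
			       struct dentry *dentry, umode_t mode)
	{
		struct dentry *d;

		d = vfs_mkdir(idmap, dir, dentry, mode | S_IFDIR);
		if (IS_ERR(d))
			return PTR_ERR(d);
		/*
		 * 'd' is the dentry actually holding the new directory; it
		 * may differ from 'dentry', so use 'd' from here on.
		 */
		return 0;
	}
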
diff --git a/fs/splice.c b/fs/splice.c
index 23fa5561b944..90d464241f15 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -200,7 +200,6 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
unsigned int spd_pages = spd->nr_pages;
unsigned int tail = pipe->tail;
unsigned int head = pipe->head;
- unsigned int mask = pipe->ring_size - 1;
ssize_t ret = 0;
int page_nr = 0;
@@ -214,7 +213,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
}
while (!pipe_full(head, tail, pipe->max_usage)) {
- struct pipe_buffer *buf = &pipe->bufs[head & mask];
+ struct pipe_buffer *buf = pipe_buf(pipe, head);
buf->page = spd->pages[page_nr];
buf->offset = spd->partial[page_nr].offset;
@@ -247,7 +246,6 @@ ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
unsigned int head = pipe->head;
unsigned int tail = pipe->tail;
- unsigned int mask = pipe->ring_size - 1;
int ret;
if (unlikely(!pipe->readers)) {
@@ -256,7 +254,7 @@ ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
} else if (pipe_full(head, tail, pipe->max_usage)) {
ret = -EAGAIN;
} else {
- pipe->bufs[head & mask] = *buf;
+ *pipe_buf(pipe, head) = *buf;
pipe->head = head + 1;
return buf->len;
}
@@ -447,11 +445,10 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des
{
unsigned int head = pipe->head;
unsigned int tail = pipe->tail;
- unsigned int mask = pipe->ring_size - 1;
int ret;
while (!pipe_empty(head, tail)) {
- struct pipe_buffer *buf = &pipe->bufs[tail & mask];
+ struct pipe_buffer *buf = pipe_buf(pipe, tail);
sd->len = buf->len;
if (sd->len > sd->total_len)
@@ -495,8 +492,7 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des
static inline bool eat_empty_buffer(struct pipe_inode_info *pipe)
{
unsigned int tail = pipe->tail;
- unsigned int mask = pipe->ring_size - 1;
- struct pipe_buffer *buf = &pipe->bufs[tail & mask];
+ struct pipe_buffer *buf = pipe_buf(pipe, tail);
if (unlikely(!buf->len)) {
pipe_buf_release(pipe, buf);
@@ -690,7 +686,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
while (sd.total_len) {
struct kiocb kiocb;
struct iov_iter from;
- unsigned int head, tail, mask;
+ unsigned int head, tail;
size_t left;
int n;
@@ -711,12 +707,11 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
head = pipe->head;
tail = pipe->tail;
- mask = pipe->ring_size - 1;
/* build the vector */
left = sd.total_len;
for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) {
- struct pipe_buffer *buf = &pipe->bufs[tail & mask];
+ struct pipe_buffer *buf = pipe_buf(pipe, tail);
size_t this_len = buf->len;
/* zero-length bvecs are not supported, skip them */
@@ -752,7 +747,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
/* dismiss the fully eaten buffers, adjust the partial one */
tail = pipe->tail;
while (ret) {
- struct pipe_buffer *buf = &pipe->bufs[tail & mask];
+ struct pipe_buffer *buf = pipe_buf(pipe, tail);
if (ret >= buf->len) {
ret -= buf->len;
buf->len = 0;
@@ -809,7 +804,7 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
pipe_lock(pipe);
while (len > 0) {
- unsigned int head, tail, mask, bc = 0;
+ unsigned int head, tail, bc = 0;
size_t remain = len;
/*
@@ -846,10 +841,9 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
head = pipe->head;
tail = pipe->tail;
- mask = pipe->ring_size - 1;
while (!pipe_empty(head, tail)) {
- struct pipe_buffer *buf = &pipe->bufs[tail & mask];
+ struct pipe_buffer *buf = pipe_buf(pipe, tail);
size_t seg;
if (!buf->len) {
@@ -894,7 +888,7 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
len -= ret;
tail = pipe->tail;
while (ret > 0) {
- struct pipe_buffer *buf = &pipe->bufs[tail & mask];
+ struct pipe_buffer *buf = pipe_buf(pipe, tail);
size_t seg = min_t(size_t, ret, buf->len);
buf->offset += seg;
@@ -1725,7 +1719,6 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
struct pipe_buffer *ibuf, *obuf;
unsigned int i_head, o_head;
unsigned int i_tail, o_tail;
- unsigned int i_mask, o_mask;
int ret = 0;
bool input_wakeup = false;
@@ -1747,9 +1740,7 @@ retry:
pipe_double_lock(ipipe, opipe);
i_tail = ipipe->tail;
- i_mask = ipipe->ring_size - 1;
o_head = opipe->head;
- o_mask = opipe->ring_size - 1;
do {
size_t o_len;
@@ -1792,8 +1783,8 @@ retry:
goto retry;
}
- ibuf = &ipipe->bufs[i_tail & i_mask];
- obuf = &opipe->bufs[o_head & o_mask];
+ ibuf = pipe_buf(ipipe, i_tail);
+ obuf = pipe_buf(opipe, o_head);
if (len >= ibuf->len) {
/*
@@ -1862,7 +1853,6 @@ static ssize_t link_pipe(struct pipe_inode_info *ipipe,
struct pipe_buffer *ibuf, *obuf;
unsigned int i_head, o_head;
unsigned int i_tail, o_tail;
- unsigned int i_mask, o_mask;
ssize_t ret = 0;
/*
@@ -1873,9 +1863,7 @@ static ssize_t link_pipe(struct pipe_inode_info *ipipe,
pipe_double_lock(ipipe, opipe);
i_tail = ipipe->tail;
- i_mask = ipipe->ring_size - 1;
o_head = opipe->head;
- o_mask = opipe->ring_size - 1;
do {
if (!opipe->readers) {
@@ -1896,8 +1884,8 @@ static ssize_t link_pipe(struct pipe_inode_info *ipipe,
pipe_full(o_head, o_tail, opipe->max_usage))
break;
- ibuf = &ipipe->bufs[i_tail & i_mask];
- obuf = &opipe->bufs[o_head & o_mask];
+ ibuf = pipe_buf(ipipe, i_tail);
+ obuf = pipe_buf(opipe, o_head);
/*
* Get a reference to this pipe buffer,
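
All of the open-coded &pipe->bufs[idx & (ring_size - 1)] sites in splice.c collapse into the pipe_buf() helper, removing the per-function mask variables. Assuming it is implemented the obvious way (the kernel's actual definition lives in include/linux/pipe_fs_i.h), it is the same power-of-two masking hidden behind a name:

	#include <linux/pipe_fs_i.h>

	/* Sketch of a slot-to-buffer helper for a power-of-two ring. */
	static inline struct pipe_buffer *my_pipe_buf(const struct pipe_inode_info *pipe,
						      unsigned int slot)
	{
		return &pipe->bufs[slot & (pipe->ring_size - 1)];
	}
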
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 4db0d2b0aab8..181260e72680 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -198,7 +198,7 @@ void squashfs_cache_delete(struct squashfs_cache *cache)
{
int i, j;
- if (cache == NULL)
+ if (IS_ERR(cache) || cache == NULL)
return;
for (i = 0; i < cache->entries; i++) {
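
The added check makes squashfs_cache_delete() safe to call on an ERR_PTR as well as on NULL, so error paths can hand it whatever the cache-init path returned. <linux/err.h> also provides IS_ERR_OR_NULL() for exactly this combined test; a sketch with a hypothetical cache type:

	#include <linux/err.h>

	struct my_cache { int entries; };	/* hypothetical */

	static void cache_delete(struct my_cache *cache)
	{
		/* Equivalent combined test from <linux/err.h>. */
		if (IS_ERR_OR_NULL(cache))
			return;

		/* ... actual teardown ... */
	}
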
diff --git a/fs/super.c b/fs/super.c
index 5a7db4a556e3..97a17f9d9023 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1417,7 +1417,7 @@ static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
if (!surprise)
sync_filesystem(sb);
shrink_dcache_sb(sb);
- invalidate_inodes(sb);
+ evict_inodes(sb);
if (sb->s_op->shutdown)
sb->s_op->shutdown(sb);
@@ -1737,61 +1737,6 @@ struct dentry *mount_nodev(struct file_system_type *fs_type,
}
EXPORT_SYMBOL(mount_nodev);
-int reconfigure_single(struct super_block *s,
- int flags, void *data)
-{
- struct fs_context *fc;
- int ret;
-
- /* The caller really need to be passing fc down into mount_single(),
- * then a chunk of this can be removed. [Bollocks -- AV]
- * Better yet, reconfiguration shouldn't happen, but rather the second
- * mount should be rejected if the parameters are not compatible.
- */
- fc = fs_context_for_reconfigure(s->s_root, flags, MS_RMT_MASK);
- if (IS_ERR(fc))
- return PTR_ERR(fc);
-
- ret = parse_monolithic_mount_data(fc, data);
- if (ret < 0)
- goto out;
-
- ret = reconfigure_super(fc);
-out:
- put_fs_context(fc);
- return ret;
-}
-
-static int compare_single(struct super_block *s, void *p)
-{
- return 1;
-}
-
-struct dentry *mount_single(struct file_system_type *fs_type,
- int flags, void *data,
- int (*fill_super)(struct super_block *, void *, int))
-{
- struct super_block *s;
- int error;
-
- s = sget(fs_type, compare_single, set_anon_super, flags, NULL);
- if (IS_ERR(s))
- return ERR_CAST(s);
- if (!s->s_root) {
- error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
- if (!error)
- s->s_flags |= SB_ACTIVE;
- } else {
- error = reconfigure_single(s, flags, data);
- }
- if (unlikely(error)) {
- deactivate_locked_super(s);
- return ERR_PTR(error);
- }
- return dget(s->s_root);
-}
-EXPORT_SYMBOL(mount_single);
-
/**
* vfs_get_tree - Get the mountable root
* @fc: The superblock configuration context.
diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig
deleted file mode 100644
index 67b3f90afbfd..000000000000
--- a/fs/sysv/Kconfig
+++ /dev/null
@@ -1,38 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-config SYSV_FS
- tristate "System V/Xenix/V7/Coherent file system support"
- depends on BLOCK
- select BUFFER_HEAD
- help
- SCO, Xenix and Coherent are commercial Unix systems for Intel
- machines, and Version 7 was used on the DEC PDP-11. Saying Y
- here would allow you to read from their floppies and hard disk
- partitions.
-
- If you have floppies or hard disk partitions like that, it is likely
- that they contain binaries from those other Unix systems; in order
- to run these binaries, you will want to install linux-abi which is
- a set of kernel modules that lets you run SCO, Xenix, Wyse,
- UnixWare, Dell Unix and System V programs under Linux. It is
- available via FTP (user: ftp) from
- <ftp://ftp.openlinux.org/pub/people/hch/linux-abi/>).
- NOTE: that will work only for binaries from Intel-based systems;
- PDP ones will have to wait until somebody ports Linux to -11 ;-)
-
- If you only intend to mount files from some other Unix over the
- network using NFS, you don't need the System V file system support
- (but you need NFS file system support obviously).
-
- Note that this option is generally not needed for floppies, since a
- good portable way to transport files and directories between unixes
- (and even other operating systems) is given by the tar program ("man
- tar" or preferably "info tar"). Note also that this option has
- nothing whatsoever to do with the option "System V IPC". Read about
- the System V file system in
- <file:Documentation/filesystems/sysv-fs.rst>.
- Saying Y here will enlarge your kernel by about 27 KB.
-
- To compile this as a module, choose M here: the module will be called
- sysv.
-
- If you haven't heard about all of this before, it's safe to say N.
diff --git a/fs/sysv/Makefile b/fs/sysv/Makefile
deleted file mode 100644
index 17d12ba04b18..000000000000
--- a/fs/sysv/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-#
-# Makefile for the Linux SystemV/Coherent filesystem routines.
-#
-
-obj-$(CONFIG_SYSV_FS) += sysv.o
-
-sysv-objs := ialloc.o balloc.o inode.o itree.o file.o dir.o \
- namei.o super.o
diff --git a/fs/sysv/balloc.c b/fs/sysv/balloc.c
deleted file mode 100644
index 0e69dbdf7277..000000000000
--- a/fs/sysv/balloc.c
+++ /dev/null
@@ -1,240 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/fs/sysv/balloc.c
- *
- * minix/bitmap.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * ext/freelists.c
- * Copyright (C) 1992 Remy Card (card@masi.ibp.fr)
- *
- * xenix/alloc.c
- * Copyright (C) 1992 Doug Evans
- *
- * coh/alloc.c
- * Copyright (C) 1993 Pascal Haible, Bruno Haible
- *
- * sysv/balloc.c
- * Copyright (C) 1993 Bruno Haible
- *
- * This file contains code for allocating/freeing blocks.
- */
-
-#include <linux/buffer_head.h>
-#include <linux/string.h>
-#include "sysv.h"
-
-/* We don't trust the value of
- sb->sv_sbd2->s_tfree = *sb->sv_free_blocks
- but we nevertheless keep it up to date. */
-
-static inline sysv_zone_t *get_chunk(struct super_block *sb, struct buffer_head *bh)
-{
- char *bh_data = bh->b_data;
-
- if (SYSV_SB(sb)->s_type == FSTYPE_SYSV4)
- return (sysv_zone_t*)(bh_data+4);
- else
- return (sysv_zone_t*)(bh_data+2);
-}
-
-/* NOTE NOTE NOTE: nr is a block number _as_ _stored_ _on_ _disk_ */
-
-void sysv_free_block(struct super_block * sb, sysv_zone_t nr)
-{
- struct sysv_sb_info * sbi = SYSV_SB(sb);
- struct buffer_head * bh;
- sysv_zone_t *blocks = sbi->s_bcache;
- unsigned count;
- unsigned block = fs32_to_cpu(sbi, nr);
-
- /*
- * This code does not work at all for AFS (it has a bitmap
- * free list). As AFS is supposed to be read-only no one
- * should call this for an AFS filesystem anyway...
- */
- if (sbi->s_type == FSTYPE_AFS)
- return;
-
- if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) {
- printk("sysv_free_block: trying to free block not in datazone\n");
- return;
- }
-
- mutex_lock(&sbi->s_lock);
- count = fs16_to_cpu(sbi, *sbi->s_bcache_count);
-
- if (count > sbi->s_flc_size) {
- printk("sysv_free_block: flc_count > flc_size\n");
- mutex_unlock(&sbi->s_lock);
- return;
- }
- /* If the free list head in super-block is full, it is copied
- * into this block being freed, ditto if it's completely empty
- * (applies only on Coherent).
- */
- if (count == sbi->s_flc_size || count == 0) {
- block += sbi->s_block_base;
- bh = sb_getblk(sb, block);
- if (!bh) {
- printk("sysv_free_block: getblk() failed\n");
- mutex_unlock(&sbi->s_lock);
- return;
- }
- memset(bh->b_data, 0, sb->s_blocksize);
- *(__fs16*)bh->b_data = cpu_to_fs16(sbi, count);
- memcpy(get_chunk(sb,bh), blocks, count * sizeof(sysv_zone_t));
- mark_buffer_dirty(bh);
- set_buffer_uptodate(bh);
- brelse(bh);
- count = 0;
- }
- sbi->s_bcache[count++] = nr;
-
- *sbi->s_bcache_count = cpu_to_fs16(sbi, count);
- fs32_add(sbi, sbi->s_free_blocks, 1);
- dirty_sb(sb);
- mutex_unlock(&sbi->s_lock);
-}
-
-sysv_zone_t sysv_new_block(struct super_block * sb)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- unsigned int block;
- sysv_zone_t nr;
- struct buffer_head * bh;
- unsigned count;
-
- mutex_lock(&sbi->s_lock);
- count = fs16_to_cpu(sbi, *sbi->s_bcache_count);
-
- if (count == 0) /* Applies only to Coherent FS */
- goto Enospc;
- nr = sbi->s_bcache[--count];
- if (nr == 0) /* Applies only to Xenix FS, SystemV FS */
- goto Enospc;
-
- block = fs32_to_cpu(sbi, nr);
-
- *sbi->s_bcache_count = cpu_to_fs16(sbi, count);
-
- if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) {
- printk("sysv_new_block: new block %d is not in data zone\n",
- block);
- goto Enospc;
- }
-
- if (count == 0) { /* the last block continues the free list */
- unsigned count;
-
- block += sbi->s_block_base;
- if (!(bh = sb_bread(sb, block))) {
- printk("sysv_new_block: cannot read free-list block\n");
- /* retry this same block next time */
- *sbi->s_bcache_count = cpu_to_fs16(sbi, 1);
- goto Enospc;
- }
- count = fs16_to_cpu(sbi, *(__fs16*)bh->b_data);
- if (count > sbi->s_flc_size) {
- printk("sysv_new_block: free-list block with >flc_size entries\n");
- brelse(bh);
- goto Enospc;
- }
- *sbi->s_bcache_count = cpu_to_fs16(sbi, count);
- memcpy(sbi->s_bcache, get_chunk(sb, bh),
- count * sizeof(sysv_zone_t));
- brelse(bh);
- }
- /* Now the free list head in the superblock is valid again. */
- fs32_add(sbi, sbi->s_free_blocks, -1);
- dirty_sb(sb);
- mutex_unlock(&sbi->s_lock);
- return nr;
-
-Enospc:
- mutex_unlock(&sbi->s_lock);
- return 0;
-}
-
-unsigned long sysv_count_free_blocks(struct super_block * sb)
-{
- struct sysv_sb_info * sbi = SYSV_SB(sb);
- int sb_count;
- int count;
- struct buffer_head * bh = NULL;
- sysv_zone_t *blocks;
- unsigned block;
- int n;
-
- /*
- * This code does not work at all for AFS (it has a bitmap
- * free list). As AFS is supposed to be read-only we just
- * lie and say it has no free block at all.
- */
- if (sbi->s_type == FSTYPE_AFS)
- return 0;
-
- mutex_lock(&sbi->s_lock);
- sb_count = fs32_to_cpu(sbi, *sbi->s_free_blocks);
-
- if (0)
- goto trust_sb;
-
- /* this causes a lot of disk traffic ... */
- count = 0;
- n = fs16_to_cpu(sbi, *sbi->s_bcache_count);
- blocks = sbi->s_bcache;
- while (1) {
- sysv_zone_t zone;
- if (n > sbi->s_flc_size)
- goto E2big;
- zone = 0;
- while (n && (zone = blocks[--n]) != 0)
- count++;
- if (zone == 0)
- break;
-
- block = fs32_to_cpu(sbi, zone);
- if (bh)
- brelse(bh);
-
- if (block < sbi->s_firstdatazone || block >= sbi->s_nzones)
- goto Einval;
- block += sbi->s_block_base;
- bh = sb_bread(sb, block);
- if (!bh)
- goto Eio;
- n = fs16_to_cpu(sbi, *(__fs16*)bh->b_data);
- blocks = get_chunk(sb, bh);
- }
- if (bh)
- brelse(bh);
- if (count != sb_count)
- goto Ecount;
-done:
- mutex_unlock(&sbi->s_lock);
- return count;
-
-Einval:
- printk("sysv_count_free_blocks: new block %d is not in data zone\n",
- block);
- goto trust_sb;
-Eio:
- printk("sysv_count_free_blocks: cannot read free-list block\n");
- goto trust_sb;
-E2big:
- printk("sysv_count_free_blocks: >flc_size entries in free-list block\n");
- if (bh)
- brelse(bh);
-trust_sb:
- count = sb_count;
- goto done;
-Ecount:
- printk("sysv_count_free_blocks: free block count was %d, "
- "correcting to %d\n", sb_count, count);
- if (!sb_rdonly(sb)) {
- *sbi->s_free_blocks = cpu_to_fs32(sbi, count);
- dirty_sb(sb);
- }
- goto done;
-}
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
deleted file mode 100644
index 639307e2ff8c..000000000000
--- a/fs/sysv/dir.c
+++ /dev/null
@@ -1,378 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/fs/sysv/dir.c
- *
- * minix/dir.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * coh/dir.c
- * Copyright (C) 1993 Pascal Haible, Bruno Haible
- *
- * sysv/dir.c
- * Copyright (C) 1993 Bruno Haible
- *
- * SystemV/Coherent directory handling functions
- */
-
-#include <linux/pagemap.h>
-#include <linux/highmem.h>
-#include <linux/swap.h>
-#include "sysv.h"
-
-static int sysv_readdir(struct file *, struct dir_context *);
-
-const struct file_operations sysv_dir_operations = {
- .llseek = generic_file_llseek,
- .read = generic_read_dir,
- .iterate_shared = sysv_readdir,
- .fsync = generic_file_fsync,
-};
-
-static void dir_commit_chunk(struct folio *folio, loff_t pos, unsigned len)
-{
- struct address_space *mapping = folio->mapping;
- struct inode *dir = mapping->host;
-
- block_write_end(NULL, mapping, pos, len, len, folio, NULL);
- if (pos+len > dir->i_size) {
- i_size_write(dir, pos+len);
- mark_inode_dirty(dir);
- }
- folio_unlock(folio);
-}
-
-static int sysv_handle_dirsync(struct inode *dir)
-{
- int err;
-
- err = filemap_write_and_wait(dir->i_mapping);
- if (!err)
- err = sync_inode_metadata(dir, 1);
- return err;
-}
-
-/*
- * Calls to dir_get_folio()/folio_release_kmap() must be nested according to the
- * rules documented in mm/highmem.rst.
- *
- * NOTE: sysv_find_entry() and sysv_dotdot() act as calls to dir_get_folio()
- * and must be treated accordingly for nesting purposes.
- */
-static void *dir_get_folio(struct inode *dir, unsigned long n,
- struct folio **foliop)
-{
- struct folio *folio = read_mapping_folio(dir->i_mapping, n, NULL);
-
- if (IS_ERR(folio))
- return ERR_CAST(folio);
- *foliop = folio;
- return kmap_local_folio(folio, 0);
-}
-
-static int sysv_readdir(struct file *file, struct dir_context *ctx)
-{
- unsigned long pos = ctx->pos;
- struct inode *inode = file_inode(file);
- struct super_block *sb = inode->i_sb;
- unsigned long npages = dir_pages(inode);
- unsigned offset;
- unsigned long n;
-
- ctx->pos = pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1);
- if (pos >= inode->i_size)
- return 0;
-
- offset = pos & ~PAGE_MASK;
- n = pos >> PAGE_SHIFT;
-
- for ( ; n < npages; n++, offset = 0) {
- char *kaddr, *limit;
- struct sysv_dir_entry *de;
- struct folio *folio;
-
- kaddr = dir_get_folio(inode, n, &folio);
- if (IS_ERR(kaddr))
- continue;
- de = (struct sysv_dir_entry *)(kaddr+offset);
- limit = kaddr + PAGE_SIZE - SYSV_DIRSIZE;
- for ( ;(char*)de <= limit; de++, ctx->pos += sizeof(*de)) {
- char *name = de->name;
-
- if (!de->inode)
- continue;
-
- if (!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN),
- fs16_to_cpu(SYSV_SB(sb), de->inode),
- DT_UNKNOWN)) {
- folio_release_kmap(folio, kaddr);
- return 0;
- }
- }
- folio_release_kmap(folio, kaddr);
- }
- return 0;
-}
-
-/* compare strings: name[0..len-1] (not zero-terminated) and
- * buffer[0..] (filled with zeroes up to buffer[0..maxlen-1])
- */
-static inline int namecompare(int len, int maxlen,
- const char * name, const char * buffer)
-{
- if (len < maxlen && buffer[len])
- return 0;
- return !memcmp(name, buffer, len);
-}
-
-/*
- * sysv_find_entry()
- *
- * finds an entry in the specified directory with the wanted name.
- * It does NOT read the inode of the
- * entry - you'll have to do that yourself if you want to.
- *
- * On Success folio_release_kmap() should be called on *foliop.
- *
- * sysv_find_entry() acts as a call to dir_get_folio() and must be treated
- * accordingly for nesting purposes.
- */
-struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct folio **foliop)
-{
- const char * name = dentry->d_name.name;
- int namelen = dentry->d_name.len;
- struct inode * dir = d_inode(dentry->d_parent);
- unsigned long start, n;
- unsigned long npages = dir_pages(dir);
- struct sysv_dir_entry *de;
-
- start = SYSV_I(dir)->i_dir_start_lookup;
- if (start >= npages)
- start = 0;
- n = start;
-
- do {
- char *kaddr = dir_get_folio(dir, n, foliop);
-
- if (!IS_ERR(kaddr)) {
- de = (struct sysv_dir_entry *)kaddr;
- kaddr += folio_size(*foliop) - SYSV_DIRSIZE;
- for ( ; (char *) de <= kaddr ; de++) {
- if (!de->inode)
- continue;
- if (namecompare(namelen, SYSV_NAMELEN,
- name, de->name))
- goto found;
- }
- folio_release_kmap(*foliop, kaddr);
- }
-
- if (++n >= npages)
- n = 0;
- } while (n != start);
-
- return NULL;
-
-found:
- SYSV_I(dir)->i_dir_start_lookup = n;
- return de;
-}
-
-int sysv_add_link(struct dentry *dentry, struct inode *inode)
-{
- struct inode *dir = d_inode(dentry->d_parent);
- const char * name = dentry->d_name.name;
- int namelen = dentry->d_name.len;
- struct folio *folio = NULL;
- struct sysv_dir_entry * de;
- unsigned long npages = dir_pages(dir);
- unsigned long n;
- char *kaddr;
- loff_t pos;
- int err;
-
- /* We take care of directory expansion in the same loop */
- for (n = 0; n <= npages; n++) {
- kaddr = dir_get_folio(dir, n, &folio);
- if (IS_ERR(kaddr))
- return PTR_ERR(kaddr);
- de = (struct sysv_dir_entry *)kaddr;
- kaddr += PAGE_SIZE - SYSV_DIRSIZE;
- while ((char *)de <= kaddr) {
- if (!de->inode)
- goto got_it;
- err = -EEXIST;
- if (namecompare(namelen, SYSV_NAMELEN, name, de->name))
- goto out_folio;
- de++;
- }
- folio_release_kmap(folio, kaddr);
- }
- BUG();
- return -EINVAL;
-
-got_it:
- pos = folio_pos(folio) + offset_in_folio(folio, de);
- folio_lock(folio);
- err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE);
- if (err)
- goto out_unlock;
- memcpy (de->name, name, namelen);
- memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2);
- de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
- dir_commit_chunk(folio, pos, SYSV_DIRSIZE);
- inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
- mark_inode_dirty(dir);
- err = sysv_handle_dirsync(dir);
-out_folio:
- folio_release_kmap(folio, kaddr);
- return err;
-out_unlock:
- folio_unlock(folio);
- goto out_folio;
-}
-
-int sysv_delete_entry(struct sysv_dir_entry *de, struct folio *folio)
-{
- struct inode *inode = folio->mapping->host;
- loff_t pos = folio_pos(folio) + offset_in_folio(folio, de);
- int err;
-
- folio_lock(folio);
- err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE);
- if (err) {
- folio_unlock(folio);
- return err;
- }
- de->inode = 0;
- dir_commit_chunk(folio, pos, SYSV_DIRSIZE);
- inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
- mark_inode_dirty(inode);
- return sysv_handle_dirsync(inode);
-}
-
-int sysv_make_empty(struct inode *inode, struct inode *dir)
-{
- struct folio *folio = filemap_grab_folio(inode->i_mapping, 0);
- struct sysv_dir_entry * de;
- char *kaddr;
- int err;
-
- if (IS_ERR(folio))
- return PTR_ERR(folio);
- err = sysv_prepare_chunk(folio, 0, 2 * SYSV_DIRSIZE);
- if (err) {
- folio_unlock(folio);
- goto fail;
- }
- kaddr = kmap_local_folio(folio, 0);
- memset(kaddr, 0, folio_size(folio));
-
- de = (struct sysv_dir_entry *)kaddr;
- de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
- strcpy(de->name,".");
- de++;
- de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), dir->i_ino);
- strcpy(de->name,"..");
-
- kunmap_local(kaddr);
- dir_commit_chunk(folio, 0, 2 * SYSV_DIRSIZE);
- err = sysv_handle_dirsync(inode);
-fail:
- folio_put(folio);
- return err;
-}
-
-/*
- * routine to check that the specified directory is empty (for rmdir)
- */
-int sysv_empty_dir(struct inode * inode)
-{
- struct super_block *sb = inode->i_sb;
- struct folio *folio = NULL;
- unsigned long i, npages = dir_pages(inode);
- char *kaddr;
-
- for (i = 0; i < npages; i++) {
- struct sysv_dir_entry *de;
-
- kaddr = dir_get_folio(inode, i, &folio);
- if (IS_ERR(kaddr))
- continue;
-
- de = (struct sysv_dir_entry *)kaddr;
- kaddr += folio_size(folio) - SYSV_DIRSIZE;
-
- for ( ;(char *)de <= kaddr; de++) {
- if (!de->inode)
- continue;
- /* check for . and .. */
- if (de->name[0] != '.')
- goto not_empty;
- if (!de->name[1]) {
- if (de->inode == cpu_to_fs16(SYSV_SB(sb),
- inode->i_ino))
- continue;
- goto not_empty;
- }
- if (de->name[1] != '.' || de->name[2])
- goto not_empty;
- }
- folio_release_kmap(folio, kaddr);
- }
- return 1;
-
-not_empty:
- folio_release_kmap(folio, kaddr);
- return 0;
-}
-
-/* Releases the page */
-int sysv_set_link(struct sysv_dir_entry *de, struct folio *folio,
- struct inode *inode)
-{
- struct inode *dir = folio->mapping->host;
- loff_t pos = folio_pos(folio) + offset_in_folio(folio, de);
- int err;
-
- folio_lock(folio);
- err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE);
- if (err) {
- folio_unlock(folio);
- return err;
- }
- de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
- dir_commit_chunk(folio, pos, SYSV_DIRSIZE);
- inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
- mark_inode_dirty(dir);
- return sysv_handle_dirsync(inode);
-}
-
-/*
- * Calls to dir_get_folio()/folio_release_kmap() must be nested according to the
- * rules documented in mm/highmem.rst.
- *
- * sysv_dotdot() acts as a call to dir_get_folio() and must be treated
- * accordingly for nesting purposes.
- */
-struct sysv_dir_entry *sysv_dotdot(struct inode *dir, struct folio **foliop)
-{
- struct sysv_dir_entry *de = dir_get_folio(dir, 0, foliop);
-
- if (IS_ERR(de))
- return NULL;
- /* ".." is the second directory entry */
- return de + 1;
-}
-
-ino_t sysv_inode_by_name(struct dentry *dentry)
-{
- struct folio *folio;
- struct sysv_dir_entry *de = sysv_find_entry (dentry, &folio);
- ino_t res = 0;
-
- if (de) {
- res = fs16_to_cpu(SYSV_SB(dentry->d_sb), de->inode);
- folio_release_kmap(folio, de);
- }
- return res;
-}
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
deleted file mode 100644
index c645f60bdb7f..000000000000
--- a/fs/sysv/file.c
+++ /dev/null
@@ -1,59 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/fs/sysv/file.c
- *
- * minix/file.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * coh/file.c
- * Copyright (C) 1993 Pascal Haible, Bruno Haible
- *
- * sysv/file.c
- * Copyright (C) 1993 Bruno Haible
- *
- * SystemV/Coherent regular file handling primitives
- */
-
-#include "sysv.h"
-
-/*
- * We have mostly NULLs here: the current defaults are OK for
- * the coh filesystem.
- */
-const struct file_operations sysv_file_operations = {
- .llseek = generic_file_llseek,
- .read_iter = generic_file_read_iter,
- .write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
- .fsync = generic_file_fsync,
- .splice_read = filemap_splice_read,
-};
-
-static int sysv_setattr(struct mnt_idmap *idmap,
- struct dentry *dentry, struct iattr *attr)
-{
- struct inode *inode = d_inode(dentry);
- int error;
-
- error = setattr_prepare(&nop_mnt_idmap, dentry, attr);
- if (error)
- return error;
-
- if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode)) {
- error = inode_newsize_ok(inode, attr->ia_size);
- if (error)
- return error;
- truncate_setsize(inode, attr->ia_size);
- sysv_truncate(inode);
- }
-
- setattr_copy(&nop_mnt_idmap, inode, attr);
- mark_inode_dirty(inode);
- return 0;
-}
-
-const struct inode_operations sysv_file_inode_operations = {
- .setattr = sysv_setattr,
- .getattr = sysv_getattr,
-};
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
deleted file mode 100644
index 269df6d49815..000000000000
--- a/fs/sysv/ialloc.c
+++ /dev/null
@@ -1,235 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/fs/sysv/ialloc.c
- *
- * minix/bitmap.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * ext/freelists.c
- * Copyright (C) 1992 Remy Card (card@masi.ibp.fr)
- *
- * xenix/alloc.c
- * Copyright (C) 1992 Doug Evans
- *
- * coh/alloc.c
- * Copyright (C) 1993 Pascal Haible, Bruno Haible
- *
- * sysv/ialloc.c
- * Copyright (C) 1993 Bruno Haible
- *
- * This file contains code for allocating/freeing inodes.
- */
-
-#include <linux/kernel.h>
-#include <linux/stddef.h>
-#include <linux/sched.h>
-#include <linux/stat.h>
-#include <linux/string.h>
-#include <linux/buffer_head.h>
-#include <linux/writeback.h>
-#include "sysv.h"
-
-/* We don't trust the value of
- sb->sv_sbd2->s_tinode = *sb->sv_sb_total_free_inodes
- but we nevertheless keep it up to date. */
-
-/* An inode on disk is considered free if both i_mode == 0 and i_nlink == 0. */
-
-/* return &sb->sv_sb_fic_inodes[i] = &sbd->s_inode[i]; */
-static inline sysv_ino_t *
-sv_sb_fic_inode(struct super_block * sb, unsigned int i)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
-
- if (sbi->s_bh1 == sbi->s_bh2)
- return &sbi->s_sb_fic_inodes[i];
- else {
- /* 512 byte Xenix FS */
- unsigned int offset = offsetof(struct xenix_super_block, s_inode[i]);
- if (offset < 512)
- return (sysv_ino_t*)(sbi->s_sbd1 + offset);
- else
- return (sysv_ino_t*)(sbi->s_sbd2 + offset);
- }
-}
-
-struct sysv_inode *
-sysv_raw_inode(struct super_block *sb, unsigned ino, struct buffer_head **bh)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- struct sysv_inode *res;
- int block = sbi->s_firstinodezone + sbi->s_block_base;
-
- block += (ino-1) >> sbi->s_inodes_per_block_bits;
- *bh = sb_bread(sb, block);
- if (!*bh)
- return NULL;
- res = (struct sysv_inode *)(*bh)->b_data;
- return res + ((ino-1) & sbi->s_inodes_per_block_1);
-}
-
-static int refill_free_cache(struct super_block *sb)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- struct buffer_head * bh;
- struct sysv_inode * raw_inode;
- int i = 0, ino;
-
- ino = SYSV_ROOT_INO+1;
- raw_inode = sysv_raw_inode(sb, ino, &bh);
- if (!raw_inode)
- goto out;
- while (ino <= sbi->s_ninodes) {
- if (raw_inode->i_mode == 0 && raw_inode->i_nlink == 0) {
- *sv_sb_fic_inode(sb,i++) = cpu_to_fs16(SYSV_SB(sb), ino);
- if (i == sbi->s_fic_size)
- break;
- }
- if ((ino++ & sbi->s_inodes_per_block_1) == 0) {
- brelse(bh);
- raw_inode = sysv_raw_inode(sb, ino, &bh);
- if (!raw_inode)
- goto out;
- } else
- raw_inode++;
- }
- brelse(bh);
-out:
- return i;
-}
-
-void sysv_free_inode(struct inode * inode)
-{
- struct super_block *sb = inode->i_sb;
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- unsigned int ino;
- struct buffer_head * bh;
- struct sysv_inode * raw_inode;
- unsigned count;
-
- sb = inode->i_sb;
- ino = inode->i_ino;
- if (ino <= SYSV_ROOT_INO || ino > sbi->s_ninodes) {
- printk("sysv_free_inode: inode 0,1,2 or nonexistent inode\n");
- return;
- }
- raw_inode = sysv_raw_inode(sb, ino, &bh);
- if (!raw_inode) {
- printk("sysv_free_inode: unable to read inode block on device "
- "%s\n", inode->i_sb->s_id);
- return;
- }
- mutex_lock(&sbi->s_lock);
- count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count);
- if (count < sbi->s_fic_size) {
- *sv_sb_fic_inode(sb,count++) = cpu_to_fs16(sbi, ino);
- *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count);
- }
- fs16_add(sbi, sbi->s_sb_total_free_inodes, 1);
- dirty_sb(sb);
- memset(raw_inode, 0, sizeof(struct sysv_inode));
- mark_buffer_dirty(bh);
- mutex_unlock(&sbi->s_lock);
- brelse(bh);
-}
-
-struct inode * sysv_new_inode(const struct inode * dir, umode_t mode)
-{
- struct super_block *sb = dir->i_sb;
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- struct inode *inode;
- sysv_ino_t ino;
- unsigned count;
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE
- };
-
- inode = new_inode(sb);
- if (!inode)
- return ERR_PTR(-ENOMEM);
-
- mutex_lock(&sbi->s_lock);
- count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count);
- if (count == 0 || (*sv_sb_fic_inode(sb,count-1) == 0)) {
- count = refill_free_cache(sb);
- if (count == 0) {
- iput(inode);
- mutex_unlock(&sbi->s_lock);
- return ERR_PTR(-ENOSPC);
- }
- }
- /* Now count > 0. */
- ino = *sv_sb_fic_inode(sb,--count);
- *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count);
- fs16_add(sbi, sbi->s_sb_total_free_inodes, -1);
- dirty_sb(sb);
- inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
- inode->i_ino = fs16_to_cpu(sbi, ino);
- simple_inode_init_ts(inode);
- inode->i_blocks = 0;
- memset(SYSV_I(inode)->i_data, 0, sizeof(SYSV_I(inode)->i_data));
- SYSV_I(inode)->i_dir_start_lookup = 0;
- insert_inode_hash(inode);
- mark_inode_dirty(inode);
-
- sysv_write_inode(inode, &wbc); /* ensure inode not allocated again */
- mark_inode_dirty(inode); /* cleared by sysv_write_inode() */
- /* That's it. */
- mutex_unlock(&sbi->s_lock);
- return inode;
-}
-
-unsigned long sysv_count_free_inodes(struct super_block * sb)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- struct buffer_head * bh;
- struct sysv_inode * raw_inode;
- int ino, count, sb_count;
-
- mutex_lock(&sbi->s_lock);
-
- sb_count = fs16_to_cpu(sbi, *sbi->s_sb_total_free_inodes);
-
- if (0)
- goto trust_sb;
-
- /* this causes a lot of disk traffic ... */
- count = 0;
- ino = SYSV_ROOT_INO+1;
- raw_inode = sysv_raw_inode(sb, ino, &bh);
- if (!raw_inode)
- goto Eio;
- while (ino <= sbi->s_ninodes) {
- if (raw_inode->i_mode == 0 && raw_inode->i_nlink == 0)
- count++;
- if ((ino++ & sbi->s_inodes_per_block_1) == 0) {
- brelse(bh);
- raw_inode = sysv_raw_inode(sb, ino, &bh);
- if (!raw_inode)
- goto Eio;
- } else
- raw_inode++;
- }
- brelse(bh);
- if (count != sb_count)
- goto Einval;
-out:
- mutex_unlock(&sbi->s_lock);
- return count;
-
-Einval:
- printk("sysv_count_free_inodes: "
- "free inode count was %d, correcting to %d\n",
- sb_count, count);
- if (!sb_rdonly(sb)) {
- *sbi->s_sb_total_free_inodes = cpu_to_fs16(SYSV_SB(sb), count);
- dirty_sb(sb);
- }
- goto out;
-
-Eio:
- printk("sysv_count_free_inodes: unable to read inode table\n");
-trust_sb:
- count = sb_count;
- goto out;
-}
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
deleted file mode 100644
index 76bc2d5e75a9..000000000000
--- a/fs/sysv/inode.c
+++ /dev/null
@@ -1,354 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/fs/sysv/inode.c
- *
- * minix/inode.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * xenix/inode.c
- * Copyright (C) 1992 Doug Evans
- *
- * coh/inode.c
- * Copyright (C) 1993 Pascal Haible, Bruno Haible
- *
- * sysv/inode.c
- * Copyright (C) 1993 Paul B. Monday
- *
- * sysv/inode.c
- * Copyright (C) 1993 Bruno Haible
- * Copyright (C) 1997, 1998 Krzysztof G. Baranowski
- *
- * This file contains code for allocating/freeing inodes and for read/writing
- * the superblock.
- */
-
-#include <linux/highuid.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/buffer_head.h>
-#include <linux/vfs.h>
-#include <linux/writeback.h>
-#include <linux/namei.h>
-#include <asm/byteorder.h>
-#include "sysv.h"
-
-static int sysv_sync_fs(struct super_block *sb, int wait)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- u32 time = (u32)ktime_get_real_seconds(), old_time;
-
- mutex_lock(&sbi->s_lock);
-
- /*
- * If we are going to write out the super block,
- * then attach current time stamp.
- * But if the filesystem was marked clean, keep it clean.
- */
- old_time = fs32_to_cpu(sbi, *sbi->s_sb_time);
- if (sbi->s_type == FSTYPE_SYSV4) {
- if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38u - old_time))
- *sbi->s_sb_state = cpu_to_fs32(sbi, 0x7c269d38u - time);
- *sbi->s_sb_time = cpu_to_fs32(sbi, time);
- mark_buffer_dirty(sbi->s_bh2);
- }
-
- mutex_unlock(&sbi->s_lock);
-
- return 0;
-}
-
-static int sysv_remount(struct super_block *sb, int *flags, char *data)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
-
- sync_filesystem(sb);
- if (sbi->s_forced_ro)
- *flags |= SB_RDONLY;
- return 0;
-}
-
-static void sysv_put_super(struct super_block *sb)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
-
- if (!sb_rdonly(sb)) {
- /* XXX ext2 also updates the state here */
- mark_buffer_dirty(sbi->s_bh1);
- if (sbi->s_bh1 != sbi->s_bh2)
- mark_buffer_dirty(sbi->s_bh2);
- }
-
- brelse(sbi->s_bh1);
- if (sbi->s_bh1 != sbi->s_bh2)
- brelse(sbi->s_bh2);
-
- kfree(sbi);
-}
-
-static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
- struct super_block *sb = dentry->d_sb;
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
-
- buf->f_type = sb->s_magic;
- buf->f_bsize = sb->s_blocksize;
- buf->f_blocks = sbi->s_ndatazones;
- buf->f_bavail = buf->f_bfree = sysv_count_free_blocks(sb);
- buf->f_files = sbi->s_ninodes;
- buf->f_ffree = sysv_count_free_inodes(sb);
- buf->f_namelen = SYSV_NAMELEN;
- buf->f_fsid = u64_to_fsid(id);
- return 0;
-}
-
-/*
- * NXI <-> N0XI for PDP, XIN <-> XIN0 for le32, NIX <-> 0NIX for be32
- */
-static inline void read3byte(struct sysv_sb_info *sbi,
- unsigned char * from, unsigned char * to)
-{
- if (sbi->s_bytesex == BYTESEX_PDP) {
- to[0] = from[0];
- to[1] = 0;
- to[2] = from[1];
- to[3] = from[2];
- } else if (sbi->s_bytesex == BYTESEX_LE) {
- to[0] = from[0];
- to[1] = from[1];
- to[2] = from[2];
- to[3] = 0;
- } else {
- to[0] = 0;
- to[1] = from[0];
- to[2] = from[1];
- to[3] = from[2];
- }
-}
-
-static inline void write3byte(struct sysv_sb_info *sbi,
- unsigned char * from, unsigned char * to)
-{
- if (sbi->s_bytesex == BYTESEX_PDP) {
- to[0] = from[0];
- to[1] = from[2];
- to[2] = from[3];
- } else if (sbi->s_bytesex == BYTESEX_LE) {
- to[0] = from[0];
- to[1] = from[1];
- to[2] = from[2];
- } else {
- to[0] = from[1];
- to[1] = from[2];
- to[2] = from[3];
- }
-}
-
-static const struct inode_operations sysv_symlink_inode_operations = {
- .get_link = page_get_link,
- .getattr = sysv_getattr,
-};
-
-void sysv_set_inode(struct inode *inode, dev_t rdev)
-{
- if (S_ISREG(inode->i_mode)) {
- inode->i_op = &sysv_file_inode_operations;
- inode->i_fop = &sysv_file_operations;
- inode->i_mapping->a_ops = &sysv_aops;
- } else if (S_ISDIR(inode->i_mode)) {
- inode->i_op = &sysv_dir_inode_operations;
- inode->i_fop = &sysv_dir_operations;
- inode->i_mapping->a_ops = &sysv_aops;
- } else if (S_ISLNK(inode->i_mode)) {
- inode->i_op = &sysv_symlink_inode_operations;
- inode_nohighmem(inode);
- inode->i_mapping->a_ops = &sysv_aops;
- } else
- init_special_inode(inode, inode->i_mode, rdev);
-}
-
-struct inode *sysv_iget(struct super_block *sb, unsigned int ino)
-{
- struct sysv_sb_info * sbi = SYSV_SB(sb);
- struct buffer_head * bh;
- struct sysv_inode * raw_inode;
- struct sysv_inode_info * si;
- struct inode *inode;
- unsigned int block;
-
- if (!ino || ino > sbi->s_ninodes) {
- printk("Bad inode number on dev %s: %d is out of range\n",
- sb->s_id, ino);
- return ERR_PTR(-EIO);
- }
-
- inode = iget_locked(sb, ino);
- if (!inode)
- return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
- return inode;
-
- raw_inode = sysv_raw_inode(sb, ino, &bh);
- if (!raw_inode) {
- printk("Major problem: unable to read inode from dev %s\n",
- inode->i_sb->s_id);
- goto bad_inode;
- }
- /* SystemV FS: kludge permissions if ino==SYSV_ROOT_INO ?? */
- inode->i_mode = fs16_to_cpu(sbi, raw_inode->i_mode);
- i_uid_write(inode, (uid_t)fs16_to_cpu(sbi, raw_inode->i_uid));
- i_gid_write(inode, (gid_t)fs16_to_cpu(sbi, raw_inode->i_gid));
- set_nlink(inode, fs16_to_cpu(sbi, raw_inode->i_nlink));
- inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size);
- inode_set_atime(inode, fs32_to_cpu(sbi, raw_inode->i_atime), 0);
- inode_set_mtime(inode, fs32_to_cpu(sbi, raw_inode->i_mtime), 0);
- inode_set_ctime(inode, fs32_to_cpu(sbi, raw_inode->i_ctime), 0);
- inode->i_blocks = 0;
-
- si = SYSV_I(inode);
- for (block = 0; block < 10+1+1+1; block++)
- read3byte(sbi, &raw_inode->i_data[3*block],
- (u8 *)&si->i_data[block]);
- brelse(bh);
- si->i_dir_start_lookup = 0;
- if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
- sysv_set_inode(inode,
- old_decode_dev(fs32_to_cpu(sbi, si->i_data[0])));
- else
- sysv_set_inode(inode, 0);
- unlock_new_inode(inode);
- return inode;
-
-bad_inode:
- iget_failed(inode);
- return ERR_PTR(-EIO);
-}
-
-static int __sysv_write_inode(struct inode *inode, int wait)
-{
- struct super_block * sb = inode->i_sb;
- struct sysv_sb_info * sbi = SYSV_SB(sb);
- struct buffer_head * bh;
- struct sysv_inode * raw_inode;
- struct sysv_inode_info * si;
- unsigned int ino, block;
- int err = 0;
-
- ino = inode->i_ino;
- if (!ino || ino > sbi->s_ninodes) {
- printk("Bad inode number on dev %s: %d is out of range\n",
- inode->i_sb->s_id, ino);
- return -EIO;
- }
- raw_inode = sysv_raw_inode(sb, ino, &bh);
- if (!raw_inode) {
- printk("unable to read i-node block\n");
- return -EIO;
- }
-
- raw_inode->i_mode = cpu_to_fs16(sbi, inode->i_mode);
- raw_inode->i_uid = cpu_to_fs16(sbi, fs_high2lowuid(i_uid_read(inode)));
- raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(i_gid_read(inode)));
- raw_inode->i_nlink = cpu_to_fs16(sbi, inode->i_nlink);
- raw_inode->i_size = cpu_to_fs32(sbi, inode->i_size);
- raw_inode->i_atime = cpu_to_fs32(sbi, inode_get_atime_sec(inode));
- raw_inode->i_mtime = cpu_to_fs32(sbi, inode_get_mtime_sec(inode));
- raw_inode->i_ctime = cpu_to_fs32(sbi, inode_get_ctime_sec(inode));
-
- si = SYSV_I(inode);
- if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
- si->i_data[0] = cpu_to_fs32(sbi, old_encode_dev(inode->i_rdev));
- for (block = 0; block < 10+1+1+1; block++)
- write3byte(sbi, (u8 *)&si->i_data[block],
- &raw_inode->i_data[3*block]);
- mark_buffer_dirty(bh);
- if (wait) {
- sync_dirty_buffer(bh);
- if (buffer_req(bh) && !buffer_uptodate(bh)) {
- printk ("IO error syncing sysv inode [%s:%08x]\n",
- sb->s_id, ino);
- err = -EIO;
- }
- }
- brelse(bh);
- return err;
-}
-
-int sysv_write_inode(struct inode *inode, struct writeback_control *wbc)
-{
- return __sysv_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
-}
-
-int sysv_sync_inode(struct inode *inode)
-{
- return __sysv_write_inode(inode, 1);
-}
-
-static void sysv_evict_inode(struct inode *inode)
-{
- truncate_inode_pages_final(&inode->i_data);
- if (!inode->i_nlink) {
- inode->i_size = 0;
- sysv_truncate(inode);
- }
- invalidate_inode_buffers(inode);
- clear_inode(inode);
- if (!inode->i_nlink)
- sysv_free_inode(inode);
-}
-
-static struct kmem_cache *sysv_inode_cachep;
-
-static struct inode *sysv_alloc_inode(struct super_block *sb)
-{
- struct sysv_inode_info *si;
-
- si = alloc_inode_sb(sb, sysv_inode_cachep, GFP_KERNEL);
- if (!si)
- return NULL;
- return &si->vfs_inode;
-}
-
-static void sysv_free_in_core_inode(struct inode *inode)
-{
- kmem_cache_free(sysv_inode_cachep, SYSV_I(inode));
-}
-
-static void init_once(void *p)
-{
- struct sysv_inode_info *si = (struct sysv_inode_info *)p;
-
- inode_init_once(&si->vfs_inode);
-}
-
-const struct super_operations sysv_sops = {
- .alloc_inode = sysv_alloc_inode,
- .free_inode = sysv_free_in_core_inode,
- .write_inode = sysv_write_inode,
- .evict_inode = sysv_evict_inode,
- .put_super = sysv_put_super,
- .sync_fs = sysv_sync_fs,
- .remount_fs = sysv_remount,
- .statfs = sysv_statfs,
-};
-
-int __init sysv_init_icache(void)
-{
- sysv_inode_cachep = kmem_cache_create("sysv_inode_cache",
- sizeof(struct sysv_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
- init_once);
- if (!sysv_inode_cachep)
- return -ENOMEM;
- return 0;
-}
-
-void sysv_destroy_icache(void)
-{
- /*
- * Make sure all delayed rcu free inodes are flushed before we
- * destroy cache.
- */
- rcu_barrier();
- kmem_cache_destroy(sysv_inode_cachep);
-}
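
Besides the superblock sync logic (note the 0x7c269d38 - time "clean" stamp in sysv_sync_fs()), the interesting part of the deleted inode.c is the 3-byte on-disk block pointer format: read3byte()/write3byte() widen and narrow the 13 pointers according to the filesystem's byte sex. A standalone round-trip check of that packing; the three layouts follow the NXI/XIN/NIX comment above, and the names are mine:

/* Standalone round-trip check of the 3-byte pointer packing above.
 * PDP keeps the high word first, LE appends a zero byte, BE prepends
 * one. Illustrative only; no kernel structures involved. */
#include <assert.h>
#include <string.h>

enum bytesex { BS_LE, BS_PDP, BS_BE };

static void read3byte(enum bytesex bs, const unsigned char *from, unsigned char *to)
{
        if (bs == BS_PDP) {
                to[0] = from[0]; to[1] = 0; to[2] = from[1]; to[3] = from[2];
        } else if (bs == BS_LE) {
                to[0] = from[0]; to[1] = from[1]; to[2] = from[2]; to[3] = 0;
        } else {
                to[0] = 0; to[1] = from[0]; to[2] = from[1]; to[3] = from[2];
        }
}

static void write3byte(enum bytesex bs, const unsigned char *from, unsigned char *to)
{
        if (bs == BS_PDP) {
                to[0] = from[0]; to[1] = from[2]; to[2] = from[3];
        } else if (bs == BS_LE) {
                to[0] = from[0]; to[1] = from[1]; to[2] = from[2];
        } else {
                to[0] = from[1]; to[1] = from[2]; to[2] = from[3];
        }
}

int main(void)
{
        const unsigned char disk[3] = { 0x12, 0x34, 0x56 };

        for (enum bytesex bs = BS_LE; bs <= BS_BE; bs++) {
                unsigned char mem[4], back[3];

                read3byte(bs, disk, mem);
                write3byte(bs, mem, back);
                assert(memcmp(disk, back, 3) == 0);     /* lossless both ways */
        }
        return 0;
}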
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
deleted file mode 100644
index 451e95f474fa..000000000000
--- a/fs/sysv/itree.c
+++ /dev/null
@@ -1,511 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/fs/sysv/itree.c
- *
- * Handling of indirect blocks' trees.
- * AV, Sep--Dec 2000
- */
-
-#include <linux/buffer_head.h>
-#include <linux/mount.h>
-#include <linux/mpage.h>
-#include <linux/string.h>
-#include "sysv.h"
-
-enum {DIRECT = 10, DEPTH = 4}; /* Have triple indirect */
-
-static inline void dirty_indirect(struct buffer_head *bh, struct inode *inode)
-{
- mark_buffer_dirty_inode(bh, inode);
- if (IS_SYNC(inode))
- sync_dirty_buffer(bh);
-}
-
-static int block_to_path(struct inode *inode, long block, int offsets[DEPTH])
-{
- struct super_block *sb = inode->i_sb;
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- int ptrs_bits = sbi->s_ind_per_block_bits;
- unsigned long indirect_blocks = sbi->s_ind_per_block,
- double_blocks = sbi->s_ind_per_block_2;
- int n = 0;
-
- if (block < 0) {
- printk("sysv_block_map: block < 0\n");
- } else if (block < DIRECT) {
- offsets[n++] = block;
- } else if ( (block -= DIRECT) < indirect_blocks) {
- offsets[n++] = DIRECT;
- offsets[n++] = block;
- } else if ((block -= indirect_blocks) < double_blocks) {
- offsets[n++] = DIRECT+1;
- offsets[n++] = block >> ptrs_bits;
- offsets[n++] = block & (indirect_blocks - 1);
- } else if (((block -= double_blocks) >> (ptrs_bits * 2)) < indirect_blocks) {
- offsets[n++] = DIRECT+2;
- offsets[n++] = block >> (ptrs_bits * 2);
- offsets[n++] = (block >> ptrs_bits) & (indirect_blocks - 1);
- offsets[n++] = block & (indirect_blocks - 1);
- } else {
- /* nothing */;
- }
- return n;
-}
-
-static inline int block_to_cpu(struct sysv_sb_info *sbi, sysv_zone_t nr)
-{
- return sbi->s_block_base + fs32_to_cpu(sbi, nr);
-}
-
-typedef struct {
- sysv_zone_t *p;
- sysv_zone_t key;
- struct buffer_head *bh;
-} Indirect;
-
-static DEFINE_RWLOCK(pointers_lock);
-
-static inline void add_chain(Indirect *p, struct buffer_head *bh, sysv_zone_t *v)
-{
- p->key = *(p->p = v);
- p->bh = bh;
-}
-
-static inline int verify_chain(Indirect *from, Indirect *to)
-{
- while (from <= to && from->key == *from->p)
- from++;
- return (from > to);
-}
-
-static inline sysv_zone_t *block_end(struct buffer_head *bh)
-{
- return (sysv_zone_t*)((char*)bh->b_data + bh->b_size);
-}
-
-static Indirect *get_branch(struct inode *inode,
- int depth,
- int offsets[],
- Indirect chain[],
- int *err)
-{
- struct super_block *sb = inode->i_sb;
- Indirect *p = chain;
- struct buffer_head *bh;
-
- *err = 0;
- add_chain(chain, NULL, SYSV_I(inode)->i_data + *offsets);
- if (!p->key)
- goto no_block;
- while (--depth) {
- int block = block_to_cpu(SYSV_SB(sb), p->key);
- bh = sb_bread(sb, block);
- if (!bh)
- goto failure;
- read_lock(&pointers_lock);
- if (!verify_chain(chain, p))
- goto changed;
- add_chain(++p, bh, (sysv_zone_t*)bh->b_data + *++offsets);
- read_unlock(&pointers_lock);
- if (!p->key)
- goto no_block;
- }
- return NULL;
-
-changed:
- read_unlock(&pointers_lock);
- brelse(bh);
- *err = -EAGAIN;
- goto no_block;
-failure:
- *err = -EIO;
-no_block:
- return p;
-}
-
-static int alloc_branch(struct inode *inode,
- int num,
- int *offsets,
- Indirect *branch)
-{
- int blocksize = inode->i_sb->s_blocksize;
- int n = 0;
- int i;
-
- branch[0].key = sysv_new_block(inode->i_sb);
- if (branch[0].key) for (n = 1; n < num; n++) {
- struct buffer_head *bh;
- int parent;
- /* Allocate the next block */
- branch[n].key = sysv_new_block(inode->i_sb);
- if (!branch[n].key)
- break;
- /*
- * Get buffer_head for parent block, zero it out and set
- * the pointer to new one, then send parent to disk.
- */
- parent = block_to_cpu(SYSV_SB(inode->i_sb), branch[n-1].key);
- bh = sb_getblk(inode->i_sb, parent);
- if (!bh) {
- sysv_free_block(inode->i_sb, branch[n].key);
- break;
- }
- lock_buffer(bh);
- memset(bh->b_data, 0, blocksize);
- branch[n].bh = bh;
- branch[n].p = (sysv_zone_t*) bh->b_data + offsets[n];
- *branch[n].p = branch[n].key;
- set_buffer_uptodate(bh);
- unlock_buffer(bh);
- dirty_indirect(bh, inode);
- }
- if (n == num)
- return 0;
-
- /* Allocation failed, free what we already allocated */
- for (i = 1; i < n; i++)
- bforget(branch[i].bh);
- for (i = 0; i < n; i++)
- sysv_free_block(inode->i_sb, branch[i].key);
- return -ENOSPC;
-}
-
-static inline int splice_branch(struct inode *inode,
- Indirect chain[],
- Indirect *where,
- int num)
-{
- int i;
-
- /* Verify that place we are splicing to is still there and vacant */
- write_lock(&pointers_lock);
- if (!verify_chain(chain, where-1) || *where->p)
- goto changed;
- *where->p = where->key;
- write_unlock(&pointers_lock);
-
- inode_set_ctime_current(inode);
-
- /* had we spliced it onto indirect block? */
- if (where->bh)
- dirty_indirect(where->bh, inode);
-
- if (IS_SYNC(inode))
- sysv_sync_inode(inode);
- else
- mark_inode_dirty(inode);
- return 0;
-
-changed:
- write_unlock(&pointers_lock);
- for (i = 1; i < num; i++)
- bforget(where[i].bh);
- for (i = 0; i < num; i++)
- sysv_free_block(inode->i_sb, where[i].key);
- return -EAGAIN;
-}
-
-static int get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
-{
- int err = -EIO;
- int offsets[DEPTH];
- Indirect chain[DEPTH];
- struct super_block *sb = inode->i_sb;
- Indirect *partial;
- int left;
- int depth = block_to_path(inode, iblock, offsets);
-
- if (depth == 0)
- goto out;
-
-reread:
- partial = get_branch(inode, depth, offsets, chain, &err);
-
- /* Simplest case - block found, no allocation needed */
- if (!partial) {
-got_it:
- map_bh(bh_result, sb, block_to_cpu(SYSV_SB(sb),
- chain[depth-1].key));
- /* Clean up and exit */
- partial = chain+depth-1; /* the whole chain */
- goto cleanup;
- }
-
- /* Next simple case - plain lookup or failed read of indirect block */
- if (!create || err == -EIO) {
-cleanup:
- while (partial > chain) {
- brelse(partial->bh);
- partial--;
- }
-out:
- return err;
- }
-
- /*
- * Indirect block might be removed by truncate while we were
- * reading it. Handling of that case (forget what we've got and
- * reread) is taken out of the main path.
- */
- if (err == -EAGAIN)
- goto changed;
-
- left = (chain + depth) - partial;
- err = alloc_branch(inode, left, offsets+(partial-chain), partial);
- if (err)
- goto cleanup;
-
- if (splice_branch(inode, chain, partial, left) < 0)
- goto changed;
-
- set_buffer_new(bh_result);
- goto got_it;
-
-changed:
- while (partial > chain) {
- brelse(partial->bh);
- partial--;
- }
- goto reread;
-}
-
-static inline int all_zeroes(sysv_zone_t *p, sysv_zone_t *q)
-{
- while (p < q)
- if (*p++)
- return 0;
- return 1;
-}
-
-static Indirect *find_shared(struct inode *inode,
- int depth,
- int offsets[],
- Indirect chain[],
- sysv_zone_t *top)
-{
- Indirect *partial, *p;
- int k, err;
-
- *top = 0;
- for (k = depth; k > 1 && !offsets[k-1]; k--)
- ;
- partial = get_branch(inode, k, offsets, chain, &err);
-
- write_lock(&pointers_lock);
- if (!partial)
- partial = chain + k-1;
- /*
- * If the branch acquired continuation since we've looked at it -
- * fine, it should all survive and (new) top doesn't belong to us.
- */
- if (!partial->key && *partial->p) {
- write_unlock(&pointers_lock);
- goto no_top;
- }
- for (p=partial; p>chain && all_zeroes((sysv_zone_t*)p->bh->b_data,p->p); p--)
- ;
- /*
- * OK, we've found the last block that must survive. The rest of our
- * branch should be detached before unlocking. However, if that rest
- * of branch is all ours and does not grow immediately from the inode
- * it's easier to cheat and just decrement partial->p.
- */
- if (p == chain + k - 1 && p > chain) {
- p->p--;
- } else {
- *top = *p->p;
- *p->p = 0;
- }
- write_unlock(&pointers_lock);
-
- while (partial > p) {
- brelse(partial->bh);
- partial--;
- }
-no_top:
- return partial;
-}
-
-static inline void free_data(struct inode *inode, sysv_zone_t *p, sysv_zone_t *q)
-{
- for ( ; p < q ; p++) {
- sysv_zone_t nr = *p;
- if (nr) {
- *p = 0;
- sysv_free_block(inode->i_sb, nr);
- mark_inode_dirty(inode);
- }
- }
-}
-
-static void free_branches(struct inode *inode, sysv_zone_t *p, sysv_zone_t *q, int depth)
-{
- struct buffer_head * bh;
- struct super_block *sb = inode->i_sb;
-
- if (depth--) {
- for ( ; p < q ; p++) {
- int block;
- sysv_zone_t nr = *p;
- if (!nr)
- continue;
- *p = 0;
- block = block_to_cpu(SYSV_SB(sb), nr);
- bh = sb_bread(sb, block);
- if (!bh)
- continue;
- free_branches(inode, (sysv_zone_t*)bh->b_data,
- block_end(bh), depth);
- bforget(bh);
- sysv_free_block(sb, nr);
- mark_inode_dirty(inode);
- }
- } else
- free_data(inode, p, q);
-}
-
-void sysv_truncate (struct inode * inode)
-{
- sysv_zone_t *i_data = SYSV_I(inode)->i_data;
- int offsets[DEPTH];
- Indirect chain[DEPTH];
- Indirect *partial;
- sysv_zone_t nr = 0;
- int n;
- long iblock;
- unsigned blocksize;
-
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)))
- return;
-
- blocksize = inode->i_sb->s_blocksize;
- iblock = (inode->i_size + blocksize-1)
- >> inode->i_sb->s_blocksize_bits;
-
- block_truncate_page(inode->i_mapping, inode->i_size, get_block);
-
- n = block_to_path(inode, iblock, offsets);
- if (n == 0)
- return;
-
- if (n == 1) {
- free_data(inode, i_data+offsets[0], i_data + DIRECT);
- goto do_indirects;
- }
-
- partial = find_shared(inode, n, offsets, chain, &nr);
- /* Kill the top of shared branch (already detached) */
- if (nr) {
- if (partial == chain)
- mark_inode_dirty(inode);
- else
- dirty_indirect(partial->bh, inode);
- free_branches(inode, &nr, &nr+1, (chain+n-1) - partial);
- }
- /* Clear the ends of indirect blocks on the shared branch */
- while (partial > chain) {
- free_branches(inode, partial->p + 1, block_end(partial->bh),
- (chain+n-1) - partial);
- dirty_indirect(partial->bh, inode);
- brelse (partial->bh);
- partial--;
- }
-do_indirects:
- /* Kill the remaining (whole) subtrees (== subtrees deeper than...) */
- while (n < DEPTH) {
- nr = i_data[DIRECT + n - 1];
- if (nr) {
- i_data[DIRECT + n - 1] = 0;
- mark_inode_dirty(inode);
- free_branches(inode, &nr, &nr+1, n);
- }
- n++;
- }
- inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
- if (IS_SYNC(inode))
- sysv_sync_inode (inode);
- else
- mark_inode_dirty(inode);
-}
-
-static unsigned sysv_nblocks(struct super_block *s, loff_t size)
-{
- struct sysv_sb_info *sbi = SYSV_SB(s);
- int ptrs_bits = sbi->s_ind_per_block_bits;
- unsigned blocks, res, direct = DIRECT, i = DEPTH;
- blocks = (size + s->s_blocksize - 1) >> s->s_blocksize_bits;
- res = blocks;
- while (--i && blocks > direct) {
- blocks = ((blocks - direct - 1) >> ptrs_bits) + 1;
- res += blocks;
- direct = 1;
- }
- return res;
-}
-
-int sysv_getattr(struct mnt_idmap *idmap, const struct path *path,
- struct kstat *stat, u32 request_mask, unsigned int flags)
-{
- struct super_block *s = path->dentry->d_sb;
- generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry),
- stat);
- stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size);
- stat->blksize = s->s_blocksize;
- return 0;
-}
-
-static int sysv_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- return mpage_writepages(mapping, wbc, get_block);
-}
-
-static int sysv_read_folio(struct file *file, struct folio *folio)
-{
- return block_read_full_folio(folio, get_block);
-}
-
-int sysv_prepare_chunk(struct folio *folio, loff_t pos, unsigned len)
-{
- return __block_write_begin(folio, pos, len, get_block);
-}
-
-static void sysv_write_failed(struct address_space *mapping, loff_t to)
-{
- struct inode *inode = mapping->host;
-
- if (to > inode->i_size) {
- truncate_pagecache(inode, inode->i_size);
- sysv_truncate(inode);
- }
-}
-
-static int sysv_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
-{
- int ret;
-
- ret = block_write_begin(mapping, pos, len, foliop, get_block);
- if (unlikely(ret))
- sysv_write_failed(mapping, pos + len);
-
- return ret;
-}
-
-static sector_t sysv_bmap(struct address_space *mapping, sector_t block)
-{
- return generic_block_bmap(mapping,block,get_block);
-}
-
-const struct address_space_operations sysv_aops = {
- .dirty_folio = block_dirty_folio,
- .invalidate_folio = block_invalidate_folio,
- .read_folio = sysv_read_folio,
- .writepages = sysv_writepages,
- .write_begin = sysv_write_begin,
- .write_end = generic_write_end,
- .migrate_folio = buffer_migrate_folio,
- .bmap = sysv_bmap
-};
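
itree.c's block_to_path() is the heart of the deleted indirection code: it turns a file-relative block number into up to DEPTH offsets — 10 direct slots, then single, double and triple indirect blocks. A user-space sketch of the same arithmetic, assuming a 512-byte block (128 zone pointers per indirect block, ptrs_bits = 7):

/* Sketch of the block_to_path() mapping above. Purely illustrative
 * arithmetic; no kernel structures involved. */
#include <stdio.h>

#define DIRECT 10
#define DEPTH 4

static int block_to_path(long block, int ptrs_bits, int offsets[DEPTH])
{
        long ptrs = 1L << ptrs_bits;
        int n = 0;

        if (block < 0)
                return 0;
        if (block < DIRECT) {
                offsets[n++] = block;
        } else if ((block -= DIRECT) < ptrs) {
                offsets[n++] = DIRECT;
                offsets[n++] = block;
        } else if ((block -= ptrs) < ptrs * ptrs) {
                offsets[n++] = DIRECT + 1;
                offsets[n++] = block >> ptrs_bits;
                offsets[n++] = block & (ptrs - 1);
        } else if (((block -= ptrs * ptrs) >> (2 * ptrs_bits)) < ptrs) {
                offsets[n++] = DIRECT + 2;
                offsets[n++] = block >> (2 * ptrs_bits);
                offsets[n++] = (block >> ptrs_bits) & (ptrs - 1);
                offsets[n++] = block & (ptrs - 1);
        }
        return n;       /* 0 means out of range */
}

int main(void)
{
        int offs[DEPTH];
        long samples[] = { 3, 10, 137, 2000000, 9999999 };

        for (int i = 0; i < 5; i++) {
                int n = block_to_path(samples[i], 7, offs);

                printf("block %7ld -> depth %d:", samples[i], n);
                for (int j = 0; j < n; j++)
                        printf(" %d", offs[j]);
                printf("\n");
        }
        return 0;
}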
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
deleted file mode 100644
index fb8bd8437872..000000000000
--- a/fs/sysv/namei.c
+++ /dev/null
@@ -1,280 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/fs/sysv/namei.c
- *
- * minix/namei.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * coh/namei.c
- * Copyright (C) 1993 Pascal Haible, Bruno Haible
- *
- * sysv/namei.c
- * Copyright (C) 1993 Bruno Haible
- * Copyright (C) 1997, 1998 Krzysztof G. Baranowski
- */
-
-#include <linux/pagemap.h>
-#include "sysv.h"
-
-static int add_nondir(struct dentry *dentry, struct inode *inode)
-{
- int err = sysv_add_link(dentry, inode);
- if (!err) {
- d_instantiate(dentry, inode);
- return 0;
- }
- inode_dec_link_count(inode);
- iput(inode);
- return err;
-}
-
-static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags)
-{
- struct inode * inode = NULL;
- ino_t ino;
-
- if (dentry->d_name.len > SYSV_NAMELEN)
- return ERR_PTR(-ENAMETOOLONG);
- ino = sysv_inode_by_name(dentry);
- if (ino)
- inode = sysv_iget(dir->i_sb, ino);
- return d_splice_alias(inode, dentry);
-}
-
-static int sysv_mknod(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode, dev_t rdev)
-{
- struct inode * inode;
- int err;
-
- if (!old_valid_dev(rdev))
- return -EINVAL;
-
- inode = sysv_new_inode(dir, mode);
- err = PTR_ERR(inode);
-
- if (!IS_ERR(inode)) {
- sysv_set_inode(inode, rdev);
- mark_inode_dirty(inode);
- err = add_nondir(dentry, inode);
- }
- return err;
-}
-
-static int sysv_create(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode, bool excl)
-{
- return sysv_mknod(&nop_mnt_idmap, dir, dentry, mode, 0);
-}
-
-static int sysv_symlink(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, const char *symname)
-{
- int err = -ENAMETOOLONG;
- int l = strlen(symname)+1;
- struct inode * inode;
-
- if (l > dir->i_sb->s_blocksize)
- goto out;
-
- inode = sysv_new_inode(dir, S_IFLNK|0777);
- err = PTR_ERR(inode);
- if (IS_ERR(inode))
- goto out;
-
- sysv_set_inode(inode, 0);
- err = page_symlink(inode, symname, l);
- if (err)
- goto out_fail;
-
- mark_inode_dirty(inode);
- err = add_nondir(dentry, inode);
-out:
- return err;
-
-out_fail:
- inode_dec_link_count(inode);
- iput(inode);
- goto out;
-}
-
-static int sysv_link(struct dentry * old_dentry, struct inode * dir,
- struct dentry * dentry)
-{
- struct inode *inode = d_inode(old_dentry);
-
- inode_set_ctime_current(inode);
- inode_inc_link_count(inode);
- ihold(inode);
-
- return add_nondir(dentry, inode);
-}
-
-static int sysv_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
-{
- struct inode * inode;
- int err;
-
- inode_inc_link_count(dir);
-
- inode = sysv_new_inode(dir, S_IFDIR|mode);
- err = PTR_ERR(inode);
- if (IS_ERR(inode))
- goto out_dir;
-
- sysv_set_inode(inode, 0);
-
- inode_inc_link_count(inode);
-
- err = sysv_make_empty(inode, dir);
- if (err)
- goto out_fail;
-
- err = sysv_add_link(dentry, inode);
- if (err)
- goto out_fail;
-
- d_instantiate(dentry, inode);
-out:
- return err;
-
-out_fail:
- inode_dec_link_count(inode);
- inode_dec_link_count(inode);
- iput(inode);
-out_dir:
- inode_dec_link_count(dir);
- goto out;
-}
-
-static int sysv_unlink(struct inode * dir, struct dentry * dentry)
-{
- struct inode * inode = d_inode(dentry);
- struct folio *folio;
- struct sysv_dir_entry * de;
- int err;
-
- de = sysv_find_entry(dentry, &folio);
- if (!de)
- return -ENOENT;
-
- err = sysv_delete_entry(de, folio);
- if (!err) {
- inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
- inode_dec_link_count(inode);
- }
- folio_release_kmap(folio, de);
- return err;
-}
-
-static int sysv_rmdir(struct inode * dir, struct dentry * dentry)
-{
- struct inode *inode = d_inode(dentry);
- int err = -ENOTEMPTY;
-
- if (sysv_empty_dir(inode)) {
- err = sysv_unlink(dir, dentry);
- if (!err) {
- inode->i_size = 0;
- inode_dec_link_count(inode);
- inode_dec_link_count(dir);
- }
- }
- return err;
-}
-
-/*
- * Anybody can rename anything with this: the permission checks are left to the
- * higher-level routines.
- */
-static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir,
- struct dentry *old_dentry, struct inode *new_dir,
- struct dentry *new_dentry, unsigned int flags)
-{
- struct inode * old_inode = d_inode(old_dentry);
- struct inode * new_inode = d_inode(new_dentry);
- struct folio *dir_folio;
- struct sysv_dir_entry * dir_de = NULL;
- struct folio *old_folio;
- struct sysv_dir_entry * old_de;
- int err = -ENOENT;
-
- if (flags & ~RENAME_NOREPLACE)
- return -EINVAL;
-
- old_de = sysv_find_entry(old_dentry, &old_folio);
- if (!old_de)
- goto out;
-
- if (S_ISDIR(old_inode->i_mode)) {
- err = -EIO;
- dir_de = sysv_dotdot(old_inode, &dir_folio);
- if (!dir_de)
- goto out_old;
- }
-
- if (new_inode) {
- struct folio *new_folio;
- struct sysv_dir_entry * new_de;
-
- err = -ENOTEMPTY;
- if (dir_de && !sysv_empty_dir(new_inode))
- goto out_dir;
-
- err = -ENOENT;
- new_de = sysv_find_entry(new_dentry, &new_folio);
- if (!new_de)
- goto out_dir;
- err = sysv_set_link(new_de, new_folio, old_inode);
- folio_release_kmap(new_folio, new_de);
- if (err)
- goto out_dir;
- inode_set_ctime_current(new_inode);
- if (dir_de)
- drop_nlink(new_inode);
- inode_dec_link_count(new_inode);
- } else {
- err = sysv_add_link(new_dentry, old_inode);
- if (err)
- goto out_dir;
- if (dir_de)
- inode_inc_link_count(new_dir);
- }
-
- err = sysv_delete_entry(old_de, old_folio);
- if (err)
- goto out_dir;
-
- mark_inode_dirty(old_inode);
-
- if (dir_de) {
- err = sysv_set_link(dir_de, dir_folio, new_dir);
- if (!err)
- inode_dec_link_count(old_dir);
- }
-
-out_dir:
- if (dir_de)
- folio_release_kmap(dir_folio, dir_de);
-out_old:
- folio_release_kmap(old_folio, old_de);
-out:
- return err;
-}
-
-/*
- * directories can handle most operations...
- */
-const struct inode_operations sysv_dir_inode_operations = {
- .create = sysv_create,
- .lookup = sysv_lookup,
- .link = sysv_link,
- .unlink = sysv_unlink,
- .symlink = sysv_symlink,
- .mkdir = sysv_mkdir,
- .rmdir = sysv_rmdir,
- .mknod = sysv_mknod,
- .rename = sysv_rename,
- .getattr = sysv_getattr,
-};
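
The mkdir/rmdir/rename paths above maintain SysV link counts by hand: a fresh directory ends up with i_nlink == 2 (its name plus its own ".") and the parent gains one link for the child's "..". A toy model that makes the arithmetic visible; the structs are hypothetical:

/* Toy model of the nlink bookkeeping in sysv_mkdir()/sysv_rmdir()
 * above: a directory counts its name, its own "." entry, and one
 * ".." per subdirectory. Invented types, just to show the arithmetic. */
#include <assert.h>

struct dir { int nlink; };

static void mkdir_child(struct dir *parent, struct dir *child)
{
        parent->nlink++;        /* child's ".." points back at parent */
        child->nlink = 2;       /* its name in parent + its own "." */
}

static void rmdir_child(struct dir *parent, struct dir *child)
{
        child->nlink = 0;       /* name, ".", and ".." all gone */
        parent->nlink--;
}

int main(void)
{
        struct dir root = { .nlink = 2 };       /* "/" plus "/." */
        struct dir sub;

        mkdir_child(&root, &sub);
        assert(root.nlink == 3 && sub.nlink == 2);
        rmdir_child(&root, &sub);
        assert(root.nlink == 2 && sub.nlink == 0);
        return 0;
}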
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
deleted file mode 100644
index 5c0d07ddbda2..000000000000
--- a/fs/sysv/super.c
+++ /dev/null
@@ -1,595 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/fs/sysv/inode.c
- *
- * minix/inode.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * xenix/inode.c
- * Copyright (C) 1992 Doug Evans
- *
- * coh/inode.c
- * Copyright (C) 1993 Pascal Haible, Bruno Haible
- *
- * sysv/inode.c
- * Copyright (C) 1993 Paul B. Monday
- *
- * sysv/inode.c
- * Copyright (C) 1993 Bruno Haible
- * Copyright (C) 1997, 1998 Krzysztof G. Baranowski
- *
- * This file contains code for read/parsing the superblock.
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/buffer_head.h>
-#include "sysv.h"
-
-/*
- * The following functions try to recognize specific filesystems.
- *
- * We recognize:
- * - Xenix FS by its magic number.
- * - SystemV FS by its magic number.
- * - Coherent FS by its funny fname/fpack field.
- * - SCO AFS by s_nfree == 0xffff
- * - V7 FS has no distinguishing features.
- *
- * We discriminate among SystemV4 and SystemV2 FS by the assumption that
- * the time stamp is not < 01-01-1980.
- */
-
-enum {
- JAN_1_1980 = (10*365 + 2) * 24 * 60 * 60
-};
-
-static void detected_xenix(struct sysv_sb_info *sbi, unsigned *max_links)
-{
- struct buffer_head *bh1 = sbi->s_bh1;
- struct buffer_head *bh2 = sbi->s_bh2;
- struct xenix_super_block * sbd1;
- struct xenix_super_block * sbd2;
-
- if (bh1 != bh2)
- sbd1 = sbd2 = (struct xenix_super_block *) bh1->b_data;
- else {
- /* block size = 512, so bh1 != bh2 */
- sbd1 = (struct xenix_super_block *) bh1->b_data;
- sbd2 = (struct xenix_super_block *) (bh2->b_data - 512);
- }
-
- *max_links = XENIX_LINK_MAX;
- sbi->s_fic_size = XENIX_NICINOD;
- sbi->s_flc_size = XENIX_NICFREE;
- sbi->s_sbd1 = (char *)sbd1;
- sbi->s_sbd2 = (char *)sbd2;
- sbi->s_sb_fic_count = &sbd1->s_ninode;
- sbi->s_sb_fic_inodes = &sbd1->s_inode[0];
- sbi->s_sb_total_free_inodes = &sbd2->s_tinode;
- sbi->s_bcache_count = &sbd1->s_nfree;
- sbi->s_bcache = &sbd1->s_free[0];
- sbi->s_free_blocks = &sbd2->s_tfree;
- sbi->s_sb_time = &sbd2->s_time;
- sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd1->s_isize);
- sbi->s_nzones = fs32_to_cpu(sbi, sbd1->s_fsize);
-}
-
-static void detected_sysv4(struct sysv_sb_info *sbi, unsigned *max_links)
-{
- struct sysv4_super_block * sbd;
- struct buffer_head *bh1 = sbi->s_bh1;
- struct buffer_head *bh2 = sbi->s_bh2;
-
- if (bh1 == bh2)
- sbd = (struct sysv4_super_block *) (bh1->b_data + BLOCK_SIZE/2);
- else
- sbd = (struct sysv4_super_block *) bh2->b_data;
-
- *max_links = SYSV_LINK_MAX;
- sbi->s_fic_size = SYSV_NICINOD;
- sbi->s_flc_size = SYSV_NICFREE;
- sbi->s_sbd1 = (char *)sbd;
- sbi->s_sbd2 = (char *)sbd;
- sbi->s_sb_fic_count = &sbd->s_ninode;
- sbi->s_sb_fic_inodes = &sbd->s_inode[0];
- sbi->s_sb_total_free_inodes = &sbd->s_tinode;
- sbi->s_bcache_count = &sbd->s_nfree;
- sbi->s_bcache = &sbd->s_free[0];
- sbi->s_free_blocks = &sbd->s_tfree;
- sbi->s_sb_time = &sbd->s_time;
- sbi->s_sb_state = &sbd->s_state;
- sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize);
- sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize);
-}
-
-static void detected_sysv2(struct sysv_sb_info *sbi, unsigned *max_links)
-{
- struct sysv2_super_block *sbd;
- struct buffer_head *bh1 = sbi->s_bh1;
- struct buffer_head *bh2 = sbi->s_bh2;
-
- if (bh1 == bh2)
- sbd = (struct sysv2_super_block *) (bh1->b_data + BLOCK_SIZE/2);
- else
- sbd = (struct sysv2_super_block *) bh2->b_data;
-
- *max_links = SYSV_LINK_MAX;
- sbi->s_fic_size = SYSV_NICINOD;
- sbi->s_flc_size = SYSV_NICFREE;
- sbi->s_sbd1 = (char *)sbd;
- sbi->s_sbd2 = (char *)sbd;
- sbi->s_sb_fic_count = &sbd->s_ninode;
- sbi->s_sb_fic_inodes = &sbd->s_inode[0];
- sbi->s_sb_total_free_inodes = &sbd->s_tinode;
- sbi->s_bcache_count = &sbd->s_nfree;
- sbi->s_bcache = &sbd->s_free[0];
- sbi->s_free_blocks = &sbd->s_tfree;
- sbi->s_sb_time = &sbd->s_time;
- sbi->s_sb_state = &sbd->s_state;
- sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize);
- sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize);
-}
-
-static void detected_coherent(struct sysv_sb_info *sbi, unsigned *max_links)
-{
- struct coh_super_block * sbd;
- struct buffer_head *bh1 = sbi->s_bh1;
-
- sbd = (struct coh_super_block *) bh1->b_data;
-
- *max_links = COH_LINK_MAX;
- sbi->s_fic_size = COH_NICINOD;
- sbi->s_flc_size = COH_NICFREE;
- sbi->s_sbd1 = (char *)sbd;
- sbi->s_sbd2 = (char *)sbd;
- sbi->s_sb_fic_count = &sbd->s_ninode;
- sbi->s_sb_fic_inodes = &sbd->s_inode[0];
- sbi->s_sb_total_free_inodes = &sbd->s_tinode;
- sbi->s_bcache_count = &sbd->s_nfree;
- sbi->s_bcache = &sbd->s_free[0];
- sbi->s_free_blocks = &sbd->s_tfree;
- sbi->s_sb_time = &sbd->s_time;
- sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize);
- sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize);
-}
-
-static void detected_v7(struct sysv_sb_info *sbi, unsigned *max_links)
-{
- struct buffer_head *bh2 = sbi->s_bh2;
- struct v7_super_block *sbd = (struct v7_super_block *)bh2->b_data;
-
- *max_links = V7_LINK_MAX;
- sbi->s_fic_size = V7_NICINOD;
- sbi->s_flc_size = V7_NICFREE;
- sbi->s_sbd1 = (char *)sbd;
- sbi->s_sbd2 = (char *)sbd;
- sbi->s_sb_fic_count = &sbd->s_ninode;
- sbi->s_sb_fic_inodes = &sbd->s_inode[0];
- sbi->s_sb_total_free_inodes = &sbd->s_tinode;
- sbi->s_bcache_count = &sbd->s_nfree;
- sbi->s_bcache = &sbd->s_free[0];
- sbi->s_free_blocks = &sbd->s_tfree;
- sbi->s_sb_time = &sbd->s_time;
- sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize);
- sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize);
-}
-
-static int detect_xenix(struct sysv_sb_info *sbi, struct buffer_head *bh)
-{
- struct xenix_super_block *sbd = (struct xenix_super_block *)bh->b_data;
- if (*(__le32 *)&sbd->s_magic == cpu_to_le32(0x2b5544))
- sbi->s_bytesex = BYTESEX_LE;
- else if (*(__be32 *)&sbd->s_magic == cpu_to_be32(0x2b5544))
- sbi->s_bytesex = BYTESEX_BE;
- else
- return 0;
- switch (fs32_to_cpu(sbi, sbd->s_type)) {
- case 1:
- sbi->s_type = FSTYPE_XENIX;
- return 1;
- case 2:
- sbi->s_type = FSTYPE_XENIX;
- return 2;
- default:
- return 0;
- }
-}
-
-static int detect_sysv(struct sysv_sb_info *sbi, struct buffer_head *bh)
-{
- struct super_block *sb = sbi->s_sb;
- /* All relevant fields are at the same offsets in R2 and R4 */
- struct sysv4_super_block * sbd;
- u32 type;
-
- sbd = (struct sysv4_super_block *) (bh->b_data + BLOCK_SIZE/2);
- if (*(__le32 *)&sbd->s_magic == cpu_to_le32(0xfd187e20))
- sbi->s_bytesex = BYTESEX_LE;
- else if (*(__be32 *)&sbd->s_magic == cpu_to_be32(0xfd187e20))
- sbi->s_bytesex = BYTESEX_BE;
- else
- return 0;
-
- type = fs32_to_cpu(sbi, sbd->s_type);
-
- if (fs16_to_cpu(sbi, sbd->s_nfree) == 0xffff) {
- sbi->s_type = FSTYPE_AFS;
- sbi->s_forced_ro = 1;
- if (!sb_rdonly(sb)) {
- printk("SysV FS: SCO EAFS on %s detected, "
- "forcing read-only mode.\n",
- sb->s_id);
- }
- return type;
- }
-
- if (fs32_to_cpu(sbi, sbd->s_time) < JAN_1_1980) {
- /* this is likely to happen on SystemV2 FS */
- if (type > 3 || type < 1)
- return 0;
- sbi->s_type = FSTYPE_SYSV2;
- return type;
- }
- if ((type > 3 || type < 1) && (type > 0x30 || type < 0x10))
- return 0;
-
- /* On Interactive Unix (ISC) Version 4.0/3.x s_type field = 0x10,
- 0x20 or 0x30 indicates that symbolic links and the 14-character
- filename limit is gone. Due to lack of information about this
- feature read-only mode seems to be a reasonable approach... -KGB */
-
- if (type >= 0x10) {
- printk("SysV FS: can't handle long file names on %s, "
- "forcing read-only mode.\n", sb->s_id);
- sbi->s_forced_ro = 1;
- }
-
- sbi->s_type = FSTYPE_SYSV4;
- return type >= 0x10 ? type >> 4 : type;
-}
-
-static int detect_coherent(struct sysv_sb_info *sbi, struct buffer_head *bh)
-{
- struct coh_super_block * sbd;
-
- sbd = (struct coh_super_block *) (bh->b_data + BLOCK_SIZE/2);
- if ((memcmp(sbd->s_fname,"noname",6) && memcmp(sbd->s_fname,"xxxxx ",6))
- || (memcmp(sbd->s_fpack,"nopack",6) && memcmp(sbd->s_fpack,"xxxxx\n",6)))
- return 0;
- sbi->s_bytesex = BYTESEX_PDP;
- sbi->s_type = FSTYPE_COH;
- return 1;
-}
-
-static int detect_sysv_odd(struct sysv_sb_info *sbi, struct buffer_head *bh)
-{
- int size = detect_sysv(sbi, bh);
-
- return size>2 ? 0 : size;
-}
-
-static struct {
- int block;
- int (*test)(struct sysv_sb_info *, struct buffer_head *);
-} flavours[] = {
- {1, detect_xenix},
- {0, detect_sysv},
- {0, detect_coherent},
- {9, detect_sysv_odd},
- {15,detect_sysv_odd},
- {18,detect_sysv},
-};
-
-static char *flavour_names[] = {
- [FSTYPE_XENIX] = "Xenix",
- [FSTYPE_SYSV4] = "SystemV",
- [FSTYPE_SYSV2] = "SystemV Release 2",
- [FSTYPE_COH] = "Coherent",
- [FSTYPE_V7] = "V7",
- [FSTYPE_AFS] = "AFS",
-};
-
-static void (*flavour_setup[])(struct sysv_sb_info *, unsigned *) = {
- [FSTYPE_XENIX] = detected_xenix,
- [FSTYPE_SYSV4] = detected_sysv4,
- [FSTYPE_SYSV2] = detected_sysv2,
- [FSTYPE_COH] = detected_coherent,
- [FSTYPE_V7] = detected_v7,
- [FSTYPE_AFS] = detected_sysv4,
-};
-
-static int complete_read_super(struct super_block *sb, int silent, int size)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- struct inode *root_inode;
- char *found = flavour_names[sbi->s_type];
- u_char n_bits = size+8;
- int bsize = 1 << n_bits;
- int bsize_4 = bsize >> 2;
-
- sbi->s_firstinodezone = 2;
-
- flavour_setup[sbi->s_type](sbi, &sb->s_max_links);
- if (sbi->s_firstdatazone < sbi->s_firstinodezone)
- return 0;
-
- sbi->s_ndatazones = sbi->s_nzones - sbi->s_firstdatazone;
- sbi->s_inodes_per_block = bsize >> 6;
- sbi->s_inodes_per_block_1 = (bsize >> 6)-1;
- sbi->s_inodes_per_block_bits = n_bits-6;
- sbi->s_ind_per_block = bsize_4;
- sbi->s_ind_per_block_2 = bsize_4*bsize_4;
- sbi->s_toobig_block = 10 + bsize_4 * (1 + bsize_4 * (1 + bsize_4));
- sbi->s_ind_per_block_bits = n_bits-2;
-
- sbi->s_ninodes = (sbi->s_firstdatazone - sbi->s_firstinodezone)
- << sbi->s_inodes_per_block_bits;
-
- if (!silent)
- printk("VFS: Found a %s FS (block size = %ld) on device %s\n",
- found, sb->s_blocksize, sb->s_id);
-
- sb->s_magic = SYSV_MAGIC_BASE + sbi->s_type;
- /* set up enough so that it can read an inode */
- sb->s_op = &sysv_sops;
- if (sbi->s_forced_ro)
- sb->s_flags |= SB_RDONLY;
- root_inode = sysv_iget(sb, SYSV_ROOT_INO);
- if (IS_ERR(root_inode)) {
- printk("SysV FS: get root inode failed\n");
- return 0;
- }
- sb->s_root = d_make_root(root_inode);
- if (!sb->s_root) {
- printk("SysV FS: get root dentry failed\n");
- return 0;
- }
- return 1;
-}
-
-static int sysv_fill_super(struct super_block *sb, void *data, int silent)
-{
- struct buffer_head *bh1, *bh = NULL;
- struct sysv_sb_info *sbi;
- unsigned long blocknr;
- int size = 0, i;
-
- BUILD_BUG_ON(1024 != sizeof (struct xenix_super_block));
- BUILD_BUG_ON(512 != sizeof (struct sysv4_super_block));
- BUILD_BUG_ON(512 != sizeof (struct sysv2_super_block));
- BUILD_BUG_ON(500 != sizeof (struct coh_super_block));
- BUILD_BUG_ON(64 != sizeof (struct sysv_inode));
-
- sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL);
- if (!sbi)
- return -ENOMEM;
-
- sbi->s_sb = sb;
- sbi->s_block_base = 0;
- mutex_init(&sbi->s_lock);
- sb->s_fs_info = sbi;
- sb->s_time_min = 0;
- sb->s_time_max = U32_MAX;
- sb_set_blocksize(sb, BLOCK_SIZE);
-
- for (i = 0; i < ARRAY_SIZE(flavours) && !size; i++) {
- brelse(bh);
- bh = sb_bread(sb, flavours[i].block);
- if (!bh)
- continue;
- size = flavours[i].test(SYSV_SB(sb), bh);
- }
-
- if (!size)
- goto Eunknown;
-
- switch (size) {
- case 1:
- blocknr = bh->b_blocknr << 1;
- brelse(bh);
- sb_set_blocksize(sb, 512);
- bh1 = sb_bread(sb, blocknr);
- bh = sb_bread(sb, blocknr + 1);
- break;
- case 2:
- bh1 = bh;
- break;
- case 3:
- blocknr = bh->b_blocknr >> 1;
- brelse(bh);
- sb_set_blocksize(sb, 2048);
- bh1 = bh = sb_bread(sb, blocknr);
- break;
- default:
- goto Ebadsize;
- }
-
- if (bh && bh1) {
- sbi->s_bh1 = bh1;
- sbi->s_bh2 = bh;
- if (complete_read_super(sb, silent, size))
- return 0;
- }
-
- brelse(bh1);
- brelse(bh);
- sb_set_blocksize(sb, BLOCK_SIZE);
- printk("oldfs: cannot read superblock\n");
-failed:
- kfree(sbi);
- return -EINVAL;
-
-Eunknown:
- brelse(bh);
- if (!silent)
- printk("VFS: unable to find oldfs superblock on device %s\n",
- sb->s_id);
- goto failed;
-Ebadsize:
- brelse(bh);
- if (!silent)
- printk("VFS: oldfs: unsupported block size (%dKb)\n",
- 1<<(size-2));
- goto failed;
-}
-
-static int v7_sanity_check(struct super_block *sb, struct buffer_head *bh)
-{
- struct v7_super_block *v7sb;
- struct sysv_inode *v7i;
- struct buffer_head *bh2;
- struct sysv_sb_info *sbi;
-
- sbi = sb->s_fs_info;
-
- /* plausibility check on superblock */
- v7sb = (struct v7_super_block *) bh->b_data;
- if (fs16_to_cpu(sbi, v7sb->s_nfree) > V7_NICFREE ||
- fs16_to_cpu(sbi, v7sb->s_ninode) > V7_NICINOD ||
- fs32_to_cpu(sbi, v7sb->s_fsize) > V7_MAXSIZE)
- return 0;
-
- /* plausibility check on root inode: it is a directory,
- with a nonzero size that is a multiple of 16 */
- bh2 = sb_bread(sb, 2);
- if (bh2 == NULL)
- return 0;
-
- v7i = (struct sysv_inode *)(bh2->b_data + 64);
- if ((fs16_to_cpu(sbi, v7i->i_mode) & ~0777) != S_IFDIR ||
- (fs32_to_cpu(sbi, v7i->i_size) == 0) ||
- (fs32_to_cpu(sbi, v7i->i_size) & 017) ||
- (fs32_to_cpu(sbi, v7i->i_size) > V7_NFILES *
- sizeof(struct sysv_dir_entry))) {
- brelse(bh2);
- return 0;
- }
-
- brelse(bh2);
- return 1;
-}
-
-static int v7_fill_super(struct super_block *sb, void *data, int silent)
-{
- struct sysv_sb_info *sbi;
- struct buffer_head *bh;
-
- BUILD_BUG_ON(sizeof(struct v7_super_block) != 440);
- BUILD_BUG_ON(sizeof(struct sysv_inode) != 64);
-
- sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL);
- if (!sbi)
- return -ENOMEM;
-
- sbi->s_sb = sb;
- sbi->s_block_base = 0;
- sbi->s_type = FSTYPE_V7;
- mutex_init(&sbi->s_lock);
- sb->s_fs_info = sbi;
- sb->s_time_min = 0;
- sb->s_time_max = U32_MAX;
-
- sb_set_blocksize(sb, 512);
-
- if ((bh = sb_bread(sb, 1)) == NULL) {
- if (!silent)
- printk("VFS: unable to read V7 FS superblock on "
- "device %s.\n", sb->s_id);
- goto failed;
- }
-
- /* Try PDP-11 UNIX */
- sbi->s_bytesex = BYTESEX_PDP;
- if (v7_sanity_check(sb, bh))
- goto detected;
-
- /* Try PC/IX, v7/x86 */
- sbi->s_bytesex = BYTESEX_LE;
- if (v7_sanity_check(sb, bh))
- goto detected;
-
- goto failed;
-
-detected:
- sbi->s_bh1 = bh;
- sbi->s_bh2 = bh;
- if (complete_read_super(sb, silent, 1))
- return 0;
-
-failed:
- printk(KERN_ERR "VFS: could not find a valid V7 on %s.\n",
- sb->s_id);
- brelse(bh);
- kfree(sbi);
- return -EINVAL;
-}
-
-/* Every kernel module contains stuff like this. */
-
-static struct dentry *sysv_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
-{
- return mount_bdev(fs_type, flags, dev_name, data, sysv_fill_super);
-}
-
-static struct dentry *v7_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
-{
- return mount_bdev(fs_type, flags, dev_name, data, v7_fill_super);
-}
-
-static struct file_system_type sysv_fs_type = {
- .owner = THIS_MODULE,
- .name = "sysv",
- .mount = sysv_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
-};
-MODULE_ALIAS_FS("sysv");
-
-static struct file_system_type v7_fs_type = {
- .owner = THIS_MODULE,
- .name = "v7",
- .mount = v7_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
-};
-MODULE_ALIAS_FS("v7");
-MODULE_ALIAS("v7");
-
-static int __init init_sysv_fs(void)
-{
- int error;
-
- error = sysv_init_icache();
- if (error)
- goto out;
- error = register_filesystem(&sysv_fs_type);
- if (error)
- goto destroy_icache;
- error = register_filesystem(&v7_fs_type);
- if (error)
- goto unregister;
- return 0;
-
-unregister:
- unregister_filesystem(&sysv_fs_type);
-destroy_icache:
- sysv_destroy_icache();
-out:
- return error;
-}
-
-static void __exit exit_sysv_fs(void)
-{
- unregister_filesystem(&sysv_fs_type);
- unregister_filesystem(&v7_fs_type);
- sysv_destroy_icache();
-}
-
-module_init(init_sysv_fs)
-module_exit(exit_sysv_fs)
-MODULE_DESCRIPTION("SystemV Filesystem");
-MODULE_LICENSE("GPL");
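
detect_sysv() above settles the byte sex by probing the magic number (0xfd187e20) in both encodings, then tells SysV2 from SysV4 by whether the superblock timestamp predates 1980. A standalone sketch of that probe; constants are taken from the deleted code, helper names are invented:

/* Standalone sketch of the detection logic above. */
#include <stdio.h>
#include <stdint.h>

#define SYSV_MAGIC 0xfd187e20u
#define JAN_1_1980 ((10 * 365 + 2) * 24 * 60 * 60)      /* 2 leap days: 1972, 1976 */

static uint32_t get_le32(const unsigned char *p)
{
        return p[0] | p[1] << 8 | (uint32_t)p[2] << 16 | (uint32_t)p[3] << 24;
}

static uint32_t get_be32(const unsigned char *p)
{
        return (uint32_t)p[0] << 24 | p[1] << 16 | p[2] << 8 | p[3];
}

/* Returns 'l', 'b', or 0; *swap32 is set to the matching decoder. */
static int probe_magic(const unsigned char *raw,
                       uint32_t (**swap32)(const unsigned char *))
{
        if (get_le32(raw) == SYSV_MAGIC) {
                *swap32 = get_le32;
                return 'l';
        }
        if (get_be32(raw) == SYSV_MAGIC) {
                *swap32 = get_be32;
                return 'b';
        }
        return 0;
}

int main(void)
{
        /* A big-endian magic followed by a post-1980 timestamp. */
        unsigned char sb[8] = { 0xfd, 0x18, 0x7e, 0x20, 0x2a, 0x33, 0x41, 0x00 };
        uint32_t (*swap32)(const unsigned char *);
        int sex = probe_magic(sb, &swap32);

        if (!sex)
                return 1;
        printf("byte sex: %cE, flavour: %s\n", sex - 32,
               swap32(sb + 4) < JAN_1_1980 ? "SysV2 (pre-1980 stamp)" : "SysV4");
        return 0;
}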
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
deleted file mode 100644
index 0a48b2e7edb1..000000000000
--- a/fs/sysv/sysv.h
+++ /dev/null
@@ -1,245 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _SYSV_H
-#define _SYSV_H
-
-#include <linux/buffer_head.h>
-
-typedef __u16 __bitwise __fs16;
-typedef __u32 __bitwise __fs32;
-
-#include <linux/sysv_fs.h>
-
-/*
- * SystemV/V7/Coherent super-block data in memory
- *
- * The SystemV/V7/Coherent superblock contains dynamic data (it gets modified
- * while the system is running). This is in contrast to the Minix and Berkeley
- * filesystems (where the superblock is never modified). This affects the
- * sync() operation: we must keep the superblock in a disk buffer and use this
- * one as our "working copy".
- */
-
-struct sysv_sb_info {
- struct super_block *s_sb; /* VFS superblock */
- int s_type; /* file system type: FSTYPE_{XENIX|SYSV|COH} */
- char s_bytesex; /* bytesex (le/be/pdp) */
- unsigned int s_inodes_per_block; /* number of inodes per block */
- unsigned int s_inodes_per_block_1; /* inodes_per_block - 1 */
- unsigned int s_inodes_per_block_bits; /* log2(inodes_per_block) */
- unsigned int s_ind_per_block; /* number of indirections per block */
- unsigned int s_ind_per_block_bits; /* log2(ind_per_block) */
- unsigned int s_ind_per_block_2; /* ind_per_block ^ 2 */
- unsigned int s_toobig_block; /* 10 + ipb + ipb^2 + ipb^3 */
- unsigned int s_block_base; /* physical block number of block 0 */
- unsigned short s_fic_size; /* free inode cache size, NICINOD */
- unsigned short s_flc_size; /* free block list chunk size, NICFREE */
- /* The superblock is kept in one or two disk buffers: */
- struct buffer_head *s_bh1;
- struct buffer_head *s_bh2;
- /* These are pointers into the disk buffer, to compensate for
- different superblock layout. */
- char * s_sbd1; /* entire superblock data, for part 1 */
- char * s_sbd2; /* entire superblock data, for part 2 */
- __fs16 *s_sb_fic_count; /* pointer to s_sbd->s_ninode */
- sysv_ino_t *s_sb_fic_inodes; /* pointer to s_sbd->s_inode */
- __fs16 *s_sb_total_free_inodes; /* pointer to s_sbd->s_tinode */
- __fs16 *s_bcache_count; /* pointer to s_sbd->s_nfree */
- sysv_zone_t *s_bcache; /* pointer to s_sbd->s_free */
- __fs32 *s_free_blocks; /* pointer to s_sbd->s_tfree */
- __fs32 *s_sb_time; /* pointer to s_sbd->s_time */
- __fs32 *s_sb_state; /* pointer to s_sbd->s_state, only FSTYPE_SYSV */
- /* We keep those superblock entities that don't change here;
- this saves us an indirection and perhaps a conversion. */
- u32 s_firstinodezone; /* index of first inode zone */
- u32 s_firstdatazone; /* same as s_sbd->s_isize */
- u32 s_ninodes; /* total number of inodes */
- u32 s_ndatazones; /* total number of data zones */
- u32 s_nzones; /* same as s_sbd->s_fsize */
- u16 s_namelen; /* max length of dir entry */
- int s_forced_ro;
- struct mutex s_lock;
-};
-
-/*
- * SystemV/V7/Coherent FS inode data in memory
- */
-struct sysv_inode_info {
- __fs32 i_data[13];
- u32 i_dir_start_lookup;
- struct inode vfs_inode;
-};
-
-
-static inline struct sysv_inode_info *SYSV_I(struct inode *inode)
-{
- return container_of(inode, struct sysv_inode_info, vfs_inode);
-}
-
-static inline struct sysv_sb_info *SYSV_SB(struct super_block *sb)
-{
- return sb->s_fs_info;
-}
-
-
-/* identify the FS in memory */
-enum {
- FSTYPE_NONE = 0,
- FSTYPE_XENIX,
- FSTYPE_SYSV4,
- FSTYPE_SYSV2,
- FSTYPE_COH,
- FSTYPE_V7,
- FSTYPE_AFS,
- FSTYPE_END,
-};
-
-#define SYSV_MAGIC_BASE 0x012FF7B3
-
-#define XENIX_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_XENIX)
-#define SYSV4_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_SYSV4)
-#define SYSV2_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_SYSV2)
-#define COH_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_COH)
-
-
-/* Admissible values for i_nlink: 0.._LINK_MAX */
-enum {
- XENIX_LINK_MAX = 126, /* ?? */
- SYSV_LINK_MAX = 126, /* 127? 251? */
- V7_LINK_MAX = 126, /* ?? */
- COH_LINK_MAX = 10000,
-};
-
-
-static inline void dirty_sb(struct super_block *sb)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
-
- mark_buffer_dirty(sbi->s_bh1);
- if (sbi->s_bh1 != sbi->s_bh2)
- mark_buffer_dirty(sbi->s_bh2);
-}
-
-
-/* ialloc.c */
-extern struct sysv_inode *sysv_raw_inode(struct super_block *, unsigned,
- struct buffer_head **);
-extern struct inode * sysv_new_inode(const struct inode *, umode_t);
-extern void sysv_free_inode(struct inode *);
-extern unsigned long sysv_count_free_inodes(struct super_block *);
-
-/* balloc.c */
-extern sysv_zone_t sysv_new_block(struct super_block *);
-extern void sysv_free_block(struct super_block *, sysv_zone_t);
-extern unsigned long sysv_count_free_blocks(struct super_block *);
-
-/* itree.c */
-void sysv_truncate(struct inode *);
-int sysv_prepare_chunk(struct folio *folio, loff_t pos, unsigned len);
-
-/* inode.c */
-extern struct inode *sysv_iget(struct super_block *, unsigned int);
-extern int sysv_write_inode(struct inode *, struct writeback_control *wbc);
-extern int sysv_sync_inode(struct inode *);
-extern void sysv_set_inode(struct inode *, dev_t);
-extern int sysv_getattr(struct mnt_idmap *, const struct path *,
- struct kstat *, u32, unsigned int);
-extern int sysv_init_icache(void);
-extern void sysv_destroy_icache(void);
-
-
-/* dir.c */
-struct sysv_dir_entry *sysv_find_entry(struct dentry *, struct folio **);
-int sysv_add_link(struct dentry *, struct inode *);
-int sysv_delete_entry(struct sysv_dir_entry *, struct folio *);
-int sysv_make_empty(struct inode *, struct inode *);
-int sysv_empty_dir(struct inode *);
-int sysv_set_link(struct sysv_dir_entry *, struct folio *,
- struct inode *);
-struct sysv_dir_entry *sysv_dotdot(struct inode *, struct folio **);
-ino_t sysv_inode_by_name(struct dentry *);
-
-
-extern const struct inode_operations sysv_file_inode_operations;
-extern const struct inode_operations sysv_dir_inode_operations;
-extern const struct file_operations sysv_file_operations;
-extern const struct file_operations sysv_dir_operations;
-extern const struct address_space_operations sysv_aops;
-extern const struct super_operations sysv_sops;
-
-
-enum {
- BYTESEX_LE,
- BYTESEX_PDP,
- BYTESEX_BE,
-};
-
-static inline u32 PDP_swab(u32 x)
-{
-#ifdef __LITTLE_ENDIAN
- return ((x & 0xffff) << 16) | ((x & 0xffff0000) >> 16);
-#else
-#ifdef __BIG_ENDIAN
- return ((x & 0xff00ff) << 8) | ((x & 0xff00ff00) >> 8);
-#else
-#error BYTESEX
-#endif
-#endif
-}
-
-static inline __u32 fs32_to_cpu(struct sysv_sb_info *sbi, __fs32 n)
-{
- if (sbi->s_bytesex == BYTESEX_PDP)
- return PDP_swab((__force __u32)n);
- else if (sbi->s_bytesex == BYTESEX_LE)
- return le32_to_cpu((__force __le32)n);
- else
- return be32_to_cpu((__force __be32)n);
-}
-
-static inline __fs32 cpu_to_fs32(struct sysv_sb_info *sbi, __u32 n)
-{
- if (sbi->s_bytesex == BYTESEX_PDP)
- return (__force __fs32)PDP_swab(n);
- else if (sbi->s_bytesex == BYTESEX_LE)
- return (__force __fs32)cpu_to_le32(n);
- else
- return (__force __fs32)cpu_to_be32(n);
-}
-
-static inline __fs32 fs32_add(struct sysv_sb_info *sbi, __fs32 *n, int d)
-{
- if (sbi->s_bytesex == BYTESEX_PDP)
- *(__u32*)n = PDP_swab(PDP_swab(*(__u32*)n)+d);
- else if (sbi->s_bytesex == BYTESEX_LE)
- le32_add_cpu((__le32 *)n, d);
- else
- be32_add_cpu((__be32 *)n, d);
- return *n;
-}
-
-static inline __u16 fs16_to_cpu(struct sysv_sb_info *sbi, __fs16 n)
-{
- if (sbi->s_bytesex != BYTESEX_BE)
- return le16_to_cpu((__force __le16)n);
- else
- return be16_to_cpu((__force __be16)n);
-}
-
-static inline __fs16 cpu_to_fs16(struct sysv_sb_info *sbi, __u16 n)
-{
- if (sbi->s_bytesex != BYTESEX_BE)
- return (__force __fs16)cpu_to_le16(n);
- else
- return (__force __fs16)cpu_to_be16(n);
-}
-
-static inline __fs16 fs16_add(struct sysv_sb_info *sbi, __fs16 *n, int d)
-{
- if (sbi->s_bytesex != BYTESEX_BE)
- le16_add_cpu((__le16 *)n, d);
- else
- be16_add_cpu((__be16 *)n, d);
- return *n;
-}
-
-#endif /* _SYSV_H */
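
PDP_swab() above copes with Coherent's PDP-11 "middle-endian" 32-bit layout: each 16-bit half is little-endian internally, but the high half is stored first. A portable illustration that decodes straight from raw bytes, sidestepping the header's host-endianness #ifdefs:

/* Portable illustration of the PDP-11 middle-endian layout handled
 * by PDP_swab() above. */
#include <assert.h>
#include <stdint.h>

static uint32_t pdp_to_cpu(const unsigned char *p)
{
        uint32_t hi = p[0] | (uint32_t)p[1] << 8;       /* high half, LE */
        uint32_t lo = p[2] | (uint32_t)p[3] << 8;       /* low half, LE */

        return hi << 16 | lo;
}

int main(void)
{
        /* 0x01020304 on disk in PDP order: 02 01 04 03 */
        const unsigned char disk[4] = { 0x02, 0x01, 0x04, 0x03 };

        assert(pdp_to_cpu(disk) == 0x01020304);
        return 0;
}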
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 9f7eb451a60f..c68f28d9c426 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -205,9 +205,8 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
ALARM_REALTIME : ALARM_BOOTTIME,
timerfd_alarmproc);
} else {
- hrtimer_init(&ctx->t.tmr, clockid, htmode);
+ hrtimer_setup(&ctx->t.tmr, timerfd_tmrproc, clockid, htmode);
hrtimer_set_expires(&ctx->t.tmr, texp);
- ctx->t.tmr.function = timerfd_tmrproc;
}
if (texp != 0) {
@@ -429,7 +428,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
ALARM_REALTIME : ALARM_BOOTTIME,
timerfd_alarmproc);
else
- hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS);
+ hrtimer_setup(&ctx->t.tmr, timerfd_tmrproc, clockid, HRTIMER_MODE_ABS);
ctx->moffs = ktime_mono_to_real(0);
@@ -439,15 +438,15 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
return ufd;
}
- file = anon_inode_getfile("[timerfd]", &timerfd_fops, ctx,
- O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS));
+ file = anon_inode_getfile_fmode("[timerfd]", &timerfd_fops, ctx,
+ O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS),
+ FMODE_NOWAIT);
if (IS_ERR(file)) {
put_unused_fd(ufd);
kfree(ctx);
return PTR_ERR(file);
}
- file->f_mode |= FMODE_NOWAIT;
fd_install(ufd, file);
return ufd;
}
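
The timerfd changes follow the tree-wide hrtimer_setup() conversion: the callback becomes a constructor argument instead of a separate ->function assignment, and anon_inode_getfile_fmode() folds FMODE_NOWAIT into file creation rather than patching f_mode afterwards. A user-space analogue of why the combined constructor is preferable — it removes the window in which the object exists without its handler (toy types, not the kernel API):

/* User-space analogue of the hrtimer_setup() conversion above. */
#include <assert.h>
#include <stddef.h>

struct toy_timer {
        void (*function)(struct toy_timer *);
        int armed;
};

/* old style: init first, caller must remember to set ->function */
static void toy_timer_init(struct toy_timer *t)
{
        t->function = NULL;
        t->armed = 0;
}

/* new style: callback is a constructor argument, never NULL afterwards */
static void toy_timer_setup(struct toy_timer *t,
                            void (*fn)(struct toy_timer *))
{
        t->function = fn;
        t->armed = 0;
}

static void fired(struct toy_timer *t) { t->armed = 0; }

int main(void)
{
        struct toy_timer a, b;

        toy_timer_init(&a);             /* a.function is NULL here... */
        a.function = fired;             /* ...until this separate step */

        toy_timer_setup(&b, fired);     /* invariant holds immediately */
        assert(b.function != NULL);
        return 0;
}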
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 53214499e384..cb1af30b49f5 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -109,9 +109,9 @@ static char *get_dname(struct dentry *dentry)
return name;
}
-static int tracefs_syscall_mkdir(struct mnt_idmap *idmap,
- struct inode *inode, struct dentry *dentry,
- umode_t mode)
+static struct dentry *tracefs_syscall_mkdir(struct mnt_idmap *idmap,
+ struct inode *inode, struct dentry *dentry,
+ umode_t mode)
{
struct tracefs_inode *ti;
char *name;
@@ -119,7 +119,7 @@ static int tracefs_syscall_mkdir(struct mnt_idmap *idmap,
name = get_dname(dentry);
if (!name)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
/*
* This is a new directory that does not take the default of
@@ -141,7 +141,7 @@ static int tracefs_syscall_mkdir(struct mnt_idmap *idmap,
kfree(name);
- return ret;
+ return ERR_PTR(ret);
}
static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry)
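
tracefs is one of several filesystems in this series whose ->mkdir() now returns struct dentry * instead of int, with errors travelling as ERR_PTR() values. A minimal re-implementation of the kernel's pointer-error encoding (the real one lives in include/linux/err.h; this mirror is for illustration only):

/* Minimal mirror of ERR_PTR()/IS_ERR()/PTR_ERR(): errno values
 * -1..-4095 are squeezed into the top page of the address space,
 * which no valid pointer occupies. */
#include <assert.h>
#include <errno.h>
#include <stddef.h>

#define MAX_ERRNO 4095
#define IS_ERR_VALUE(x) ((unsigned long)(x) >= (unsigned long)-MAX_ERRNO)

static void *ERR_PTR(long error) { return (void *)error; }
static long PTR_ERR(const void *ptr) { return (long)ptr; }
static int IS_ERR(const void *ptr) { return IS_ERR_VALUE((unsigned long)ptr); }

static void *make_dir(int fail)
{
        static int dentry;      /* stand-in for a real struct dentry */

        if (fail)
                return ERR_PTR(-ENOMEM);
        return &dentry;         /* NULL would mean "done, already instantiated" */
}

int main(void)
{
        void *d = make_dir(0);
        void *e = make_dir(1);

        assert(!IS_ERR(d));
        assert(IS_ERR(e) && PTR_ERR(e) == -ENOMEM);
        return 0;
}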
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index fda82f3e16e8..3c3d3ad4fa6c 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1002,8 +1002,8 @@ out_fname:
return err;
}
-static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
struct ubifs_inode *dir_ui = ubifs_inode(dir);
@@ -1023,7 +1023,7 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
err = ubifs_budget_space(c, &req);
if (err)
- return err;
+ return ERR_PTR(err);
err = ubifs_prepare_create(dir, dentry, &nm);
if (err)
@@ -1060,7 +1060,7 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
ubifs_release_budget(c, &req);
d_instantiate(dentry, inode);
fscrypt_free_filename(&nm);
- return 0;
+ return NULL;
out_cancel:
dir->i_size -= sz_change;
@@ -1074,7 +1074,7 @@ out_fname:
fscrypt_free_filename(&nm);
out_budg:
ubifs_release_budget(c, &req);
- return err;
+ return ERR_PTR(err);
}
static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir,
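
Beyond the same ->mkdir() return-type conversion (NULL now signals success with the dentry already instantiated), ubifs_mkdir() is a textbook example of the kernel's goto-unwind ladder: each error label releases exactly what was acquired before the failure, in reverse order. A generic, compilable sketch of the pattern with invented stand-ins for the budget/filename/journal steps:

/* Generic sketch of the goto-unwind ladder used by ubifs_mkdir(). */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int do_three_steps(int fail_at)
{
        char *budget = NULL, *name = NULL;
        int err = -ENOMEM;

        budget = malloc(16);                    /* think ubifs_budget_space() */
        if (!budget)
                goto out;
        if (fail_at == 1)
                goto out_budget;

        name = malloc(16);                      /* think fscrypt filename setup */
        if (!name)
                goto out_budget;
        if (fail_at == 2)
                goto out_name;

        if (fail_at == 3) {                     /* think: journal update failed */
                err = -EIO;
                goto out_name;
        }

        err = 0;        /* as in ubifs_mkdir(), success also frees the temporaries */
out_name:
        free(name);
out_budget:
        free(budget);
out:
        return err;
}

int main(void)
{
        for (int fail_at = 0; fail_at <= 3; fail_at++)
                printf("fail_at=%d -> err=%d\n", fail_at, do_three_steps(fail_at));
        return 0;
}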
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 01d8eb170382..a79f229df475 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -1179,8 +1179,7 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
wbuf->c = c;
wbuf->next_ino = 0;
- hrtimer_init(&wbuf->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- wbuf->timer.function = wbuf_timer_callback_nolock;
+ hrtimer_setup(&wbuf->timer, wbuf_timer_callback_nolock, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
return 0;
}
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 2cb49b6b0716..5f2e9a892bff 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -419,8 +419,8 @@ static int udf_mknod(struct mnt_idmap *idmap, struct inode *dir,
return udf_add_nondir(dentry, inode);
}
-static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *udf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
struct udf_fileident_iter iter;
@@ -430,7 +430,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
inode = udf_new_inode(dir, S_IFDIR | mode);
if (IS_ERR(inode))
- return PTR_ERR(inode);
+ return ERR_CAST(inode);
iinfo = UDF_I(inode);
inode->i_op = &udf_dir_inode_operations;
@@ -439,7 +439,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
if (err) {
clear_nlink(inode);
discard_new_inode(inode);
- return err;
+ return ERR_PTR(err);
}
set_nlink(inode, 2);
iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
@@ -456,7 +456,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
if (err) {
clear_nlink(inode);
discard_new_inode(inode);
- return err;
+ return ERR_PTR(err);
}
iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
iter.fi.icb.extLocation = cpu_to_lelb(iinfo->i_location);
@@ -471,7 +471,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
mark_inode_dirty(dir);
d_instantiate_new(dentry, inode);
- return 0;
+ return NULL;
}
static int empty_dir(struct inode *dir)
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 38a024c8cccd..5b3c85c93242 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -166,8 +166,8 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
return error;
}
-static int ufs_mkdir(struct mnt_idmap * idmap, struct inode * dir,
- struct dentry * dentry, umode_t mode)
+static struct dentry *ufs_mkdir(struct mnt_idmap * idmap, struct inode * dir,
+ struct dentry * dentry, umode_t mode)
{
struct inode * inode;
int err;
@@ -194,7 +194,7 @@ static int ufs_mkdir(struct mnt_idmap * idmap, struct inode * dir,
goto out_fail;
d_instantiate_new(dentry, inode);
- return 0;
+ return NULL;
out_fail:
inode_dec_link_count(inode);
@@ -202,7 +202,7 @@ out_fail:
discard_new_inode(inode);
out_dir:
inode_dec_link_count(dir);
- return err;
+ return ERR_PTR(err);
}
static int ufs_unlink(struct inode *dir, struct dentry *dentry)
diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig
index da786a687fdc..4ad2c36550f1 100644
--- a/fs/unicode/Kconfig
+++ b/fs/unicode/Kconfig
@@ -10,6 +10,7 @@ config UNICODE
be a separate loadable module that gets requested only when a file
system actually use it.
-config UNICODE_NORMALIZATION_SELFTEST
+config UNICODE_NORMALIZATION_KUNIT_TEST
tristate "Test UTF-8 normalization support"
- depends on UNICODE
+ depends on UNICODE && KUNIT
+ default KUNIT_ALL_TESTS
diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
index e309afe2b2bb..d95be7fb9f6b 100644
--- a/fs/unicode/Makefile
+++ b/fs/unicode/Makefile
@@ -4,7 +4,7 @@ ifneq ($(CONFIG_UNICODE),)
obj-y += unicode.o
endif
obj-$(CONFIG_UNICODE) += utf8data.o
-obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o
+obj-$(CONFIG_UNICODE_NORMALIZATION_KUNIT_TEST) += tests/utf8_kunit.o
unicode-y := utf8-norm.o utf8-core.o
diff --git a/fs/unicode/tests/.kunitconfig b/fs/unicode/tests/.kunitconfig
new file mode 100644
index 000000000000..62dd5c171f9c
--- /dev/null
+++ b/fs/unicode/tests/.kunitconfig
@@ -0,0 +1,3 @@
+CONFIG_KUNIT=y
+CONFIG_UNICODE=y
+CONFIG_UNICODE_NORMALIZATION_KUNIT_TEST=y
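
The selftest-to-KUnit conversion below replaces the hand-rolled test()/test_f() macros and their global pass/fail counters with KUNIT_EXPECT_* calls, passing the unicode_map through test->priv. For contrast, here is a compilable user-space distillation of the removed macro scheme — deliberately not the KUnit API, just the shape it supersedes:

/* Distillation of the old test()/test_f() counters being removed
 * below: an expectation macro that records failures with file/line
 * context in two globals. KUnit replaces this with per-test state. */
#include <stdio.h>

static unsigned failed, total;

#define EXPECT_EQ(a, b) do {                                    \
        total++;                                                \
        if ((a) != (b)) {                                       \
                failed++;                                       \
                fprintf(stderr, "%s:%d: %s != %s\n",            \
                        __FILE__, __LINE__, #a, #b);            \
        }                                                       \
} while (0)

int main(void)
{
        EXPECT_EQ(1 + 1, 2);
        EXPECT_EQ(6 * 9, 42);   /* deliberately fails */
        printf("%u/%u expectations failed\n", failed, total);
        return failed != 0;
}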
diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/tests/utf8_kunit.c
index 5ddaf27b21a6..5063e8138aec 100644
--- a/fs/unicode/utf8-selftest.c
+++ b/fs/unicode/tests/utf8_kunit.c
@@ -1,34 +1,14 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * Kernel module for testing utf-8 support.
+ * KUnit tests for utf-8 support.
*
* Copyright 2017 Collabora Ltd.
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/module.h>
-#include <linux/printk.h>
#include <linux/unicode.h>
-#include <linux/dcache.h>
-
-#include "utf8n.h"
-
-static unsigned int failed_tests;
-static unsigned int total_tests;
-
-#define _test(cond, func, line, fmt, ...) do { \
- total_tests++; \
- if (!cond) { \
- failed_tests++; \
- pr_err("test %s:%d Failed: %s%s", \
- func, line, #cond, (fmt?":":".")); \
- if (fmt) \
- pr_err(fmt, ##__VA_ARGS__); \
- } \
- } while (0)
-#define test_f(cond, fmt, ...) _test(cond, __func__, __LINE__, fmt, ##__VA_ARGS__)
-#define test(cond) _test(cond, __func__, __LINE__, "")
+#include <kunit/test.h>
+
+#include "../utf8n.h"
static const struct {
/* UTF-8 strings in this vector _must_ be NULL-terminated. */
@@ -167,69 +147,74 @@ static int utf8cursor(struct utf8cursor *u8c, const struct unicode_map *um,
return utf8ncursor(u8c, um, n, s, (unsigned int)-1);
}
-static void check_utf8_nfdi(struct unicode_map *um)
+static void check_utf8_nfdi(struct kunit *test)
{
int i;
struct utf8cursor u8c;
+ struct unicode_map *um = test->priv;
for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
int len = strlen(nfdi_test_data[i].str);
int nlen = strlen(nfdi_test_data[i].dec);
int j = 0;
unsigned char c;
+ int ret;
+
+ KUNIT_EXPECT_EQ(test, utf8len(um, UTF8_NFDI, nfdi_test_data[i].str), nlen);
+ KUNIT_EXPECT_EQ(test, utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len),
+ nlen);
- test((utf8len(um, UTF8_NFDI, nfdi_test_data[i].str) == nlen));
- test((utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len) ==
- nlen));
- if (utf8cursor(&u8c, um, UTF8_NFDI, nfdi_test_data[i].str) < 0)
- pr_err("can't create cursor\n");
+ ret = utf8cursor(&u8c, um, UTF8_NFDI, nfdi_test_data[i].str);
+ KUNIT_EXPECT_TRUE_MSG(test, ret >= 0, "Can't create cursor\n");
while ((c = utf8byte(&u8c)) > 0) {
- test_f((c == nfdi_test_data[i].dec[j]),
- "Unexpected byte 0x%x should be 0x%x\n",
- c, nfdi_test_data[i].dec[j]);
+ KUNIT_EXPECT_EQ_MSG(test, c, nfdi_test_data[i].dec[j],
+ "Unexpected byte 0x%x should be 0x%x\n",
+ c, nfdi_test_data[i].dec[j]);
j++;
}
- test((j == nlen));
+ KUNIT_EXPECT_EQ(test, j, nlen);
}
}
-static void check_utf8_nfdicf(struct unicode_map *um)
+static void check_utf8_nfdicf(struct kunit *test)
{
int i;
struct utf8cursor u8c;
+ struct unicode_map *um = test->priv;
for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) {
int len = strlen(nfdicf_test_data[i].str);
int nlen = strlen(nfdicf_test_data[i].ncf);
int j = 0;
+ int ret;
unsigned char c;
- test((utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str) ==
- nlen));
- test((utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len) ==
- nlen));
+ KUNIT_EXPECT_EQ(test, utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str),
+ nlen);
+ KUNIT_EXPECT_EQ(test, utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len),
+ nlen);
- if (utf8cursor(&u8c, um, UTF8_NFDICF,
- nfdicf_test_data[i].str) < 0)
- pr_err("can't create cursor\n");
+ ret = utf8cursor(&u8c, um, UTF8_NFDICF, nfdicf_test_data[i].str);
+ KUNIT_EXPECT_TRUE_MSG(test, ret >= 0, "Can't create cursor\n");
while ((c = utf8byte(&u8c)) > 0) {
- test_f((c == nfdicf_test_data[i].ncf[j]),
- "Unexpected byte 0x%x should be 0x%x\n",
- c, nfdicf_test_data[i].ncf[j]);
+ KUNIT_EXPECT_EQ_MSG(test, c, nfdicf_test_data[i].ncf[j],
+ "Unexpected byte 0x%x should be 0x%x\n",
+ c, nfdicf_test_data[i].ncf[j]);
j++;
}
- test((j == nlen));
+ KUNIT_EXPECT_EQ(test, j, nlen);
}
}
-static void check_utf8_comparisons(struct unicode_map *table)
+static void check_utf8_comparisons(struct kunit *test)
{
int i;
+ struct unicode_map *um = test->priv;
for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
const struct qstr s1 = {.name = nfdi_test_data[i].str,
@@ -237,8 +222,9 @@ static void check_utf8_comparisons(struct unicode_map *table)
const struct qstr s2 = {.name = nfdi_test_data[i].dec,
.len = sizeof(nfdi_test_data[i].dec)};
- test_f(!utf8_strncmp(table, &s1, &s2),
- "%s %s comparison mismatch\n", s1.name, s2.name);
+ /* strncmp returns 0 when strings are equal */
+ KUNIT_EXPECT_TRUE_MSG(test, utf8_strncmp(um, &s1, &s2) == 0,
+ "%s %s comparison mismatch\n", s1.name, s2.name);
}
for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) {
@@ -247,62 +233,65 @@ static void check_utf8_comparisons(struct unicode_map *table)
const struct qstr s2 = {.name = nfdicf_test_data[i].ncf,
.len = sizeof(nfdicf_test_data[i].ncf)};
- test_f(!utf8_strncasecmp(table, &s1, &s2),
- "%s %s comparison mismatch\n", s1.name, s2.name);
+ /* strncasecmp returns 0 when strings are equal */
+ KUNIT_EXPECT_TRUE_MSG(test, utf8_strncasecmp(um, &s1, &s2) == 0,
+ "%s %s comparison mismatch\n", s1.name, s2.name);
}
}
-static void check_supported_versions(struct unicode_map *um)
+static void check_supported_versions(struct kunit *test)
{
+ struct unicode_map *um = test->priv;
/* Unicode 7.0.0 should be supported. */
- test(utf8version_is_supported(um, UNICODE_AGE(7, 0, 0)));
+ KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UNICODE_AGE(7, 0, 0)));
/* Unicode 9.0.0 should be supported. */
- test(utf8version_is_supported(um, UNICODE_AGE(9, 0, 0)));
+ KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UNICODE_AGE(9, 0, 0)));
/* Unicode 1x.0.0 (the latest version) should be supported. */
- test(utf8version_is_supported(um, UTF8_LATEST));
+ KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UTF8_LATEST));
/* Next versions don't exist. */
- test(!utf8version_is_supported(um, UNICODE_AGE(13, 0, 0)));
- test(!utf8version_is_supported(um, UNICODE_AGE(0, 0, 0)));
- test(!utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1)));
+ KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(13, 0, 0)));
+ KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(0, 0, 0)));
+ KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1)));
}
-static int __init init_test_ucd(void)
+static struct kunit_case unicode_normalization_test_cases[] = {
+ KUNIT_CASE(check_supported_versions),
+ KUNIT_CASE(check_utf8_comparisons),
+ KUNIT_CASE(check_utf8_nfdicf),
+ KUNIT_CASE(check_utf8_nfdi),
+ {}
+};
+
+static int init_test_ucd(struct kunit *test)
{
- struct unicode_map *um;
+ struct unicode_map *um = utf8_load(UTF8_LATEST);
- failed_tests = 0;
- total_tests = 0;
+ test->priv = um;
- um = utf8_load(UTF8_LATEST);
- if (IS_ERR(um)) {
- pr_err("%s: Unable to load utf8 table.\n", __func__);
- return PTR_ERR(um);
- }
+ KUNIT_EXPECT_EQ_MSG(test, IS_ERR(um), 0,
+ "%s: Unable to load utf8 table.\n", __func__);
- check_supported_versions(um);
- check_utf8_nfdi(um);
- check_utf8_nfdicf(um);
- check_utf8_comparisons(um);
-
- if (!failed_tests)
- pr_info("All %u tests passed\n", total_tests);
- else
- pr_err("%u out of %u tests failed\n", failed_tests,
- total_tests);
- utf8_unload(um);
return 0;
}
-static void __exit exit_test_ucd(void)
+static void exit_test_ucd(struct kunit *test)
{
+ utf8_unload(test->priv);
}
-module_init(init_test_ucd);
-module_exit(exit_test_ucd);
+static struct kunit_suite unicode_normalization_test_suite = {
+ .name = "unicode_normalization",
+ .test_cases = unicode_normalization_test_cases,
+ .init = init_test_ucd,
+ .exit = exit_test_ucd,
+};
+
+kunit_test_suite(unicode_normalization_test_suite);
+
MODULE_AUTHOR("Gabriel Krisman Bertazi <krisman@collabora.co.uk>");
-MODULE_DESCRIPTION("Kernel module for testing utf-8 support");
+MODULE_DESCRIPTION("KUnit tests for utf-8 support.");
MODULE_LICENSE("GPL");
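The conversion above is the standard KUnit shape: each case takes a struct kunit *, per-suite state travels in test->priv via the suite's .init/.exit hooks, and kunit_test_suite() replaces the module_init/module_exit boilerplate. A reduced, self-contained sketch of that pattern (all example_* names are illustrative); with the .kunitconfig added above, the suite can then be driven by tools/testing/kunit/kunit.py:

#include <kunit/test.h>

static int example_init(struct kunit *test)
{
	int *state = kunit_kzalloc(test, sizeof(*state), GFP_KERNEL);

	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, state);
	*state = 42;
	test->priv = state;	/* shared with every case in the suite */
	return 0;
}

static void example_case(struct kunit *test)
{
	int *state = test->priv;

	KUNIT_EXPECT_EQ(test, *state, 42);
}

static struct kunit_case example_cases[] = {
	KUNIT_CASE(example_case),
	{}
};

/* kunit_kzalloc() memory is test-managed, so no .exit hook is needed */
static struct kunit_suite example_suite = {
	.name = "example",
	.init = example_init,
	.test_cases = example_cases,
};

kunit_test_suite(example_suite);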
diff --git a/fs/unicode/utf8-norm.c b/fs/unicode/utf8-norm.c
index 768f8ab448b8..7b998c99c88d 100644
--- a/fs/unicode/utf8-norm.c
+++ b/fs/unicode/utf8-norm.c
@@ -586,7 +586,7 @@ ccc_mismatch:
}
}
-#ifdef CONFIG_UNICODE_NORMALIZATION_SELFTEST_MODULE
+#if IS_MODULE(CONFIG_UNICODE_NORMALIZATION_KUNIT_TEST)
EXPORT_SYMBOL_GPL(utf8version_is_supported);
EXPORT_SYMBOL_GPL(utf8nlen);
EXPORT_SYMBOL_GPL(utf8ncursor);
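The guard above switches from testing the CONFIG_..._MODULE define directly to IS_MODULE() from <linux/kconfig.h>, which evaluates to 0 or 1 and so composes in #if as well as in ordinary C expressions; IS_BUILTIN() and IS_ENABLED() are its siblings for =y and =y-or-m. A small sketch (CONFIG_FOO is a placeholder option, not one from this diff):

#include <linux/kconfig.h>

#if IS_MODULE(CONFIG_FOO)	/* true only for CONFIG_FOO=m */
/* export symbols that a modular consumer needs */
#endif

static bool foo_available(void)
{
	/* true for CONFIG_FOO=y or =m; compiles away when unset */
	return IS_ENABLED(CONFIG_FOO);
}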
diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c
index a859ac9b74ba..770e29ec3557 100644
--- a/fs/vboxsf/dir.c
+++ b/fs/vboxsf/dir.c
@@ -303,11 +303,11 @@ static int vboxsf_dir_mkfile(struct mnt_idmap *idmap,
return vboxsf_dir_create(parent, dentry, mode, false, excl, NULL);
}
-static int vboxsf_dir_mkdir(struct mnt_idmap *idmap,
- struct inode *parent, struct dentry *dentry,
- umode_t mode)
+static struct dentry *vboxsf_dir_mkdir(struct mnt_idmap *idmap,
+ struct inode *parent, struct dentry *dentry,
+ umode_t mode)
{
- return vboxsf_dir_create(parent, dentry, mode, true, true, NULL);
+ return ERR_PTR(vboxsf_dir_create(parent, dentry, mode, true, true, NULL));
}
static int vboxsf_dir_atomic_open(struct inode *parent, struct dentry *dentry,
diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c
index c287c755f2c5..3537f3cca6d5 100644
--- a/fs/xfs/scrub/orphanage.c
+++ b/fs/xfs/scrub/orphanage.c
@@ -167,10 +167,11 @@ xrep_orphanage_create(
* directory to control access to a file we put in here.
*/
if (d_really_is_negative(orphanage_dentry)) {
- error = vfs_mkdir(&nop_mnt_idmap, root_inode, orphanage_dentry,
- 0750);
- if (error)
- goto out_dput_orphanage;
+ orphanage_dentry = vfs_mkdir(&nop_mnt_idmap, root_inode,
+ orphanage_dentry, 0750);
+ error = PTR_ERR(orphanage_dentry);
+ if (IS_ERR(orphanage_dentry))
+ goto out_unlock_root;
}
/* Not a directory? Bail out. */
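On the caller side, vfs_mkdir() now hands back the dentry to keep using, which, as the reassignment above shows, may replace the one passed in; failures arrive as ERR_PTR() rather than a separate int. A hedged sketch of that calling pattern (example_mkdir_caller() is illustrative, and the trailing dput() assumes the usual caller-owns-the-result convention seen in this series):

#include <linux/fs.h>
#include <linux/mnt_idmapping.h>

static int example_mkdir_caller(struct inode *dir, struct dentry *dentry)
{
	dentry = vfs_mkdir(&nop_mnt_idmap, dir, dentry, 0750);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	/* ... use the (possibly replaced) dentry ... */
	dput(dentry);
	return 0;
}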
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 6d9965b546cb..5077d52a775d 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -115,7 +115,7 @@ xfs_end_ioend(
*/
error = blk_status_to_errno(ioend->io_bio.bi_status);
if (unlikely(error)) {
- if (ioend->io_flags & IOMAP_F_SHARED) {
+ if (ioend->io_flags & IOMAP_IOEND_SHARED) {
xfs_reflink_cancel_cow_range(ip, offset, size, true);
xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
offset + size);
@@ -126,9 +126,9 @@ xfs_end_ioend(
/*
* Success: commit the COW or unwritten blocks if needed.
*/
- if (ioend->io_flags & IOMAP_F_SHARED)
+ if (ioend->io_flags & IOMAP_IOEND_SHARED)
error = xfs_reflink_end_cow(ip, offset, size);
- else if (ioend->io_type == IOMAP_UNWRITTEN)
+ else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN)
error = xfs_iomap_write_unwritten(ip, offset, size, false);
if (!error && xfs_ioend_is_append(ioend))
@@ -396,10 +396,11 @@ allocate_blocks:
}
static int
-xfs_prepare_ioend(
- struct iomap_ioend *ioend,
+xfs_submit_ioend(
+ struct iomap_writepage_ctx *wpc,
int status)
{
+ struct iomap_ioend *ioend = wpc->ioend;
unsigned int nofs_flag;
/*
@@ -410,7 +411,7 @@ xfs_prepare_ioend(
nofs_flag = memalloc_nofs_save();
/* Convert CoW extents to regular */
- if (!status && (ioend->io_flags & IOMAP_F_SHARED)) {
+ if (!status && (ioend->io_flags & IOMAP_IOEND_SHARED)) {
status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
ioend->io_offset, ioend->io_size);
}
@@ -418,10 +419,14 @@ xfs_prepare_ioend(
memalloc_nofs_restore(nofs_flag);
/* send ioends that might require a transaction to the completion wq */
- if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
- (ioend->io_flags & IOMAP_F_SHARED))
+ if (xfs_ioend_is_append(ioend) ||
+ (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED)))
ioend->io_bio.bi_end_io = xfs_end_bio;
- return status;
+
+ if (status)
+ return status;
+ submit_bio(&ioend->io_bio);
+ return 0;
}
/*
@@ -463,7 +468,7 @@ xfs_discard_folio(
static const struct iomap_writeback_ops xfs_writeback_ops = {
.map_blocks = xfs_map_blocks,
- .prepare_ioend = xfs_prepare_ioend,
+ .submit_ioend = xfs_submit_ioend,
.discard_folio = xfs_discard_folio,
};
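Two iomap reworks meet in this file: per-ioend state that used to be split between ioend->io_type and overloaded IOMAP_F_* bits now lives in dedicated IOMAP_IOEND_* flags, and the ->prepare_ioend hook has become ->submit_ioend, which must submit the bio itself (hence the added submit_bio() call). A minimal sketch of a ->submit_ioend for a filesystem with simple completion needs (example_* names are illustrative):

#include <linux/iomap.h>
#include <linux/bio.h>

static void example_end_bio(struct bio *bio);

static int example_submit_ioend(struct iomap_writepage_ctx *wpc, int status)
{
	struct iomap_ioend *ioend = wpc->ioend;

	/* completion work is keyed off IOMAP_IOEND_* bits, not io_type */
	if (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED))
		ioend->io_bio.bi_end_io = example_end_bio;

	if (status)
		return status;	/* on error, hand status back unsubmitted */
	submit_bio(&ioend->io_bio);
	return 0;
}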
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 9a435b1ff264..85b857805d6d 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1498,7 +1498,8 @@ xfs_write_fault(
if (IS_DAX(inode))
ret = xfs_dax_fault_locked(vmf, order, true);
else
- ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops);
+ ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
+ NULL);
xfs_iunlock(ip, lock_mode);
sb_end_pagefault(inode->i_sb);
@@ -1613,7 +1614,8 @@ const struct file_operations xfs_file_operations = {
.fadvise = xfs_file_fadvise,
.remap_file_range = xfs_file_remap_range,
.fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
- FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE,
+ FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
+ FOP_DONTCACHE,
};
const struct file_operations xfs_dir_file_operations = {
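FOP_DONTCACHE joins xfs's fop_flags here; per the uncached buffered I/O work in this cycle it advertises that the file supports RWF_DONTCACHE reads and writes (that semantic is an assumption from the flag name and series, not spelled out in this hunk). Declaring it is a one-bit opt-in, sketched below with illustrative example_* names:

#include <linux/fs.h>

/* assumption: RWF_DONTCACHE support per the dontcache series */
static const struct file_operations example_fops = {
	.read_iter	= generic_file_read_iter,
	.write_iter	= generic_file_write_iter,
	.fop_flags	= FOP_BUFFER_RASYNC | FOP_DONTCACHE,
};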
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index d61460309a78..f631177ac320 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -828,6 +828,10 @@ xfs_direct_write_iomap_begin(
if (offset + length > i_size_read(inode))
iomap_flags |= IOMAP_F_DIRTY;
+ /* HW-offload atomics are always used in this path */
+ if (flags & IOMAP_ATOMIC)
+ iomap_flags |= IOMAP_F_ATOMIC_BIO;
+
/*
* COW writes may allocate delalloc space or convert unwritten COW
* extents, so we need to make sure to take the lock exclusively here.
@@ -1495,7 +1499,7 @@ xfs_zero_range(
return dax_zero_range(inode, pos, len, did_zero,
&xfs_dax_write_iomap_ops);
return iomap_zero_range(inode, pos, len, did_zero,
- &xfs_buffered_write_iomap_ops);
+ &xfs_buffered_write_iomap_ops, NULL);
}
int
@@ -1510,5 +1514,5 @@ xfs_truncate_page(
return dax_truncate_page(inode, pos, did_zero,
&xfs_dax_write_iomap_ops);
return iomap_truncate_page(inode, pos, did_zero,
- &xfs_buffered_write_iomap_ops);
+ &xfs_buffered_write_iomap_ops, NULL);
}
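Two things change for xfs's iomap callers here. First, IOMAP_F_ATOMIC_BIO is set whenever the request carries IOMAP_ATOMIC, matching the added comment that this direct-write path always uses hardware-offload atomics (per the atomic-write series the flag steers the write toward a single REQ_ATOMIC bio; that detail is an assumption, not visible in this hunk). Second, iomap_zero_range(), iomap_truncate_page(), and iomap_page_mkwrite() grow a trailing argument that xfs passes as NULL (assumed to be private data threaded through to the iteration). A sketch of an ->iomap_begin requesting the atomic bio path, with example_* names illustrative:

#include <linux/iomap.h>

static int example_iomap_begin(struct inode *inode, loff_t pos,
			       loff_t length, unsigned flags,
			       struct iomap *iomap, struct iomap *srcmap)
{
	unsigned int iomap_flags = 0;

	/* request the bio-based (hardware) atomic write path */
	if (flags & IOMAP_ATOMIC)
		iomap_flags |= IOMAP_F_ATOMIC_BIO;

	/* ... map [pos, pos + length) and fill *iomap here ... */
	iomap->flags |= iomap_flags;
	return 0;
}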
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 40289fe6f5b2..a4480098d2bf 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -298,14 +298,14 @@ xfs_vn_create(
return xfs_generic_create(idmap, dir, dentry, mode, 0, NULL);
}
-STATIC int
+STATIC struct dentry *
xfs_vn_mkdir(
struct mnt_idmap *idmap,
struct inode *dir,
struct dentry *dentry,
umode_t mode)
{
- return xfs_generic_create(idmap, dir, dentry, mode | S_IFDIR, 0, NULL);
+ return ERR_PTR(xfs_generic_create(idmap, dir, dentry, mode | S_IFDIR, 0, NULL));
}
STATIC struct dentry *
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 0055066fb1d9..62d04f4843cf 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -2122,7 +2122,8 @@ static struct file_system_type xfs_fs_type = {
.init_fs_context = xfs_init_fs_context,
.parameters = xfs_fs_parameters,
.kill_sb = xfs_kill_sb,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME |
+ FS_LBS,
};
MODULE_ALIAS_FS("xfs");
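FS_LBS is the opt-in under which a filesystem declares that it can handle block sizes larger than PAGE_SIZE (large block sizes); that semantic is inferred from the flag name and the LBS work around this cycle rather than from the hunk itself. A sketch of a file_system_type carrying the flag, with example_* names illustrative:

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/fs_context.h>

static int example_init_fs_context(struct fs_context *fc);

static struct file_system_type example_fs_type = {
	.owner		 = THIS_MODULE,
	.name		 = "example",
	.init_fs_context = example_init_fs_context,
	.kill_sb	 = kill_block_super,
	.fs_flags	 = FS_REQUIRES_DEV | FS_LBS,
};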
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 35166c92420c..42e2c0065bb3 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -299,7 +299,7 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
/* Serialize against truncates */
filemap_invalidate_lock_shared(inode->i_mapping);
- ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops);
+ ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops, NULL);
filemap_invalidate_unlock_shared(inode->i_mapping);
sb_end_pagefault(inode->i_sb);