diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-09-16 09:14:02 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-09-16 09:14:02 +0200 |
commit | 3352633ce6b221d64bf40644d412d9670e7d56e3 (patch) | |
tree | f74add2d0a46ac33034955c1fa8dcff1cedd7dc0 /fs/fcntl.c | |
parent | 2775df6e5e324be9dc375f7db2c8d3042df72bbf (diff) | |
parent | 24a988f75c8a5f16ef935c51039700e985767eb9 (diff) | |
download | lwn-3352633ce6b221d64bf40644d412d9670e7d56e3.tar.gz lwn-3352633ce6b221d64bf40644d412d9670e7d56e3.zip |
Merge tag 'vfs-6.12.file' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs file updates from Christian Brauner:
"This is the work to cleanup and shrink struct file significantly.
Right now, (focusing on x86) struct file is 232 bytes. After this
series struct file will be 184 bytes aka 3 cacheline and a spare 8
bytes for future extensions at the end of the struct.
With struct file being as ubiquitous as it is this should make a
difference for file heavy workloads and allow further optimizations in
the future.
- struct fown_struct was embedded into struct file letting it take up
32 bytes in total when really it shouldn't even be embedded in
struct file in the first place. Instead, actual users of struct
fown_struct now allocate the struct on demand. This frees up 24
bytes.
- Move struct file_ra_state into the union containg the cleanup hooks
and move f_iocb_flags out of the union. This closes a 4 byte hole
we created earlier and brings struct file to 192 bytes. Which means
struct file is 3 cachelines and we managed to shrink it by 40
bytes.
- Reorder struct file so that nothing crosses a cacheline.
I suspect that in the future we will end up reordering some members
to mitigate false sharing issues or just because someone does
actually provide really good perf data.
- Shrinking struct file to 192 bytes is only part of the work.
Files use a slab that is SLAB_TYPESAFE_BY_RCU and when a kmem cache
is created with SLAB_TYPESAFE_BY_RCU the free pointer must be
located outside of the object because the cache doesn't know what
part of the memory can safely be overwritten as it may be needed to
prevent object recycling.
That has the consequence that SLAB_TYPESAFE_BY_RCU may end up
adding a new cacheline.
So this also contains work to add a new kmem_cache_create_rcu()
function that allows the caller to specify an offset where the
freelist pointer is supposed to be placed. Thus avoiding the
implicit addition of a fourth cacheline.
- And finally this removes the f_version member in struct file.
The f_version member isn't particularly well-defined. It is mainly
used as a cookie to detect concurrent seeks when iterating
directories. But it is also abused by some subsystems for
completely unrelated things.
It is mostly a directory and filesystem specific thing that doesn't
really need to live in struct file and with its wonky semantics it
really lacks a specific function.
For pipes, f_version is (ab)used to defer poll notifications until
a write has happened. And struct pipe_inode_info is used by
multiple struct files in their ->private_data so there's no chance
of pushing that down into file->private_data without introducing
another pointer indirection.
But pipes don't rely on f_pos_lock so this adds a union into struct
file encompassing f_pos_lock and a pipe specific f_pipe member that
pipes can use. This union of course can be extended to other file
types and is similar to what we do in struct inode already"
* tag 'vfs-6.12.file' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (26 commits)
fs: remove f_version
pipe: use f_pipe
fs: add f_pipe
ubifs: store cookie in private data
ufs: store cookie in private data
udf: store cookie in private data
proc: store cookie in private data
ocfs2: store cookie in private data
input: remove f_version abuse
ext4: store cookie in private data
ext2: store cookie in private data
affs: store cookie in private data
fs: add generic_llseek_cookie()
fs: use must_set_pos()
fs: add must_set_pos()
fs: add vfs_setpos_cookie()
s390: remove unused f_version
ceph: remove unused f_version
adi: remove unused f_version
mm: Removed @freeptr_offset to prevent doc warning
...
Diffstat (limited to 'fs/fcntl.c')
-rw-r--r-- | fs/fcntl.c | 166 |
1 files changed, 132 insertions, 34 deletions
diff --git a/fs/fcntl.c b/fs/fcntl.c index 22ec683ad8f8..f6fde75a3bd5 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -33,6 +33,8 @@ #include <asm/siginfo.h> #include <linux/uaccess.h> +#include "internal.h" + #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) static int setfl(int fd, struct file * filp, unsigned int arg) @@ -87,22 +89,64 @@ static int setfl(int fd, struct file * filp, unsigned int arg) return error; } +/* + * Allocate an file->f_owner struct if it doesn't exist, handling racing + * allocations correctly. + */ +int file_f_owner_allocate(struct file *file) +{ + struct fown_struct *f_owner; + + f_owner = file_f_owner(file); + if (f_owner) + return 0; + + f_owner = kzalloc(sizeof(struct fown_struct), GFP_KERNEL); + if (!f_owner) + return -ENOMEM; + + rwlock_init(&f_owner->lock); + f_owner->file = file; + /* If someone else raced us, drop our allocation. */ + if (unlikely(cmpxchg(&file->f_owner, NULL, f_owner))) + kfree(f_owner); + return 0; +} +EXPORT_SYMBOL(file_f_owner_allocate); + +void file_f_owner_release(struct file *file) +{ + struct fown_struct *f_owner; + + f_owner = file_f_owner(file); + if (f_owner) { + put_pid(f_owner->pid); + kfree(f_owner); + } +} + static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, int force) { - write_lock_irq(&filp->f_owner.lock); - if (force || !filp->f_owner.pid) { - put_pid(filp->f_owner.pid); - filp->f_owner.pid = get_pid(pid); - filp->f_owner.pid_type = type; + struct fown_struct *f_owner; + + f_owner = file_f_owner(filp); + if (WARN_ON_ONCE(!f_owner)) + return; + + write_lock_irq(&f_owner->lock); + if (force || !f_owner->pid) { + put_pid(f_owner->pid); + f_owner->pid = get_pid(pid); + f_owner->pid_type = type; if (pid) { const struct cred *cred = current_cred(); - filp->f_owner.uid = cred->uid; - filp->f_owner.euid = cred->euid; + f_owner->uid = cred->uid; + f_owner->euid = cred->euid; } } - write_unlock_irq(&filp->f_owner.lock); + write_unlock_irq(&f_owner->lock); } void __f_setown(struct file *filp, struct pid *pid, enum pid_type type, @@ -119,6 +163,8 @@ int f_setown(struct file *filp, int who, int force) struct pid *pid = NULL; int ret = 0; + might_sleep(); + type = PIDTYPE_TGID; if (who < 0) { /* avoid overflow below */ @@ -129,6 +175,10 @@ int f_setown(struct file *filp, int who, int force) who = -who; } + ret = file_f_owner_allocate(filp); + if (ret) + return ret; + rcu_read_lock(); if (who) { pid = find_vpid(who); @@ -152,16 +202,21 @@ void f_delown(struct file *filp) pid_t f_getown(struct file *filp) { pid_t pid = 0; + struct fown_struct *f_owner; + + f_owner = file_f_owner(filp); + if (!f_owner) + return pid; - read_lock_irq(&filp->f_owner.lock); + read_lock_irq(&f_owner->lock); rcu_read_lock(); - if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) { - pid = pid_vnr(filp->f_owner.pid); - if (filp->f_owner.pid_type == PIDTYPE_PGID) + if (pid_task(f_owner->pid, f_owner->pid_type)) { + pid = pid_vnr(f_owner->pid); + if (f_owner->pid_type == PIDTYPE_PGID) pid = -pid; } rcu_read_unlock(); - read_unlock_irq(&filp->f_owner.lock); + read_unlock_irq(&f_owner->lock); return pid; } @@ -194,6 +249,10 @@ static int f_setown_ex(struct file *filp, unsigned long arg) return -EINVAL; } + ret = file_f_owner_allocate(filp); + if (ret) + return ret; + rcu_read_lock(); pid = find_vpid(owner.pid); if (owner.pid && !pid) @@ -210,13 +269,20 @@ static int f_getown_ex(struct file *filp, unsigned long arg) struct f_owner_ex __user *owner_p = (void __user *)arg; struct f_owner_ex owner = {}; int ret = 0; + struct fown_struct *f_owner; + enum pid_type pid_type = PIDTYPE_PID; - read_lock_irq(&filp->f_owner.lock); - rcu_read_lock(); - if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) - owner.pid = pid_vnr(filp->f_owner.pid); - rcu_read_unlock(); - switch (filp->f_owner.pid_type) { + f_owner = file_f_owner(filp); + if (f_owner) { + read_lock_irq(&f_owner->lock); + rcu_read_lock(); + if (pid_task(f_owner->pid, f_owner->pid_type)) + owner.pid = pid_vnr(f_owner->pid); + rcu_read_unlock(); + pid_type = f_owner->pid_type; + } + + switch (pid_type) { case PIDTYPE_PID: owner.type = F_OWNER_TID; break; @@ -234,7 +300,8 @@ static int f_getown_ex(struct file *filp, unsigned long arg) ret = -EINVAL; break; } - read_unlock_irq(&filp->f_owner.lock); + if (f_owner) + read_unlock_irq(&f_owner->lock); if (!ret) { ret = copy_to_user(owner_p, &owner, sizeof(owner)); @@ -248,14 +315,18 @@ static int f_getown_ex(struct file *filp, unsigned long arg) static int f_getowner_uids(struct file *filp, unsigned long arg) { struct user_namespace *user_ns = current_user_ns(); + struct fown_struct *f_owner; uid_t __user *dst = (void __user *)arg; - uid_t src[2]; + uid_t src[2] = {0, 0}; int err; - read_lock_irq(&filp->f_owner.lock); - src[0] = from_kuid(user_ns, filp->f_owner.uid); - src[1] = from_kuid(user_ns, filp->f_owner.euid); - read_unlock_irq(&filp->f_owner.lock); + f_owner = file_f_owner(filp); + if (f_owner) { + read_lock_irq(&f_owner->lock); + src[0] = from_kuid(user_ns, f_owner->uid); + src[1] = from_kuid(user_ns, f_owner->euid); + read_unlock_irq(&f_owner->lock); + } err = put_user(src[0], &dst[0]); err |= put_user(src[1], &dst[1]); @@ -349,6 +420,30 @@ static long f_created_query(const struct file *filp) return !!(filp->f_mode & FMODE_CREATED); } +static int f_owner_sig(struct file *filp, int signum, bool setsig) +{ + int ret = 0; + struct fown_struct *f_owner; + + might_sleep(); + + if (setsig) { + if (!valid_signal(signum)) + return -EINVAL; + + ret = file_f_owner_allocate(filp); + if (ret) + return ret; + } + + f_owner = file_f_owner(filp); + if (setsig) + f_owner->signum = signum; + else if (f_owner) + ret = f_owner->signum; + return ret; +} + static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) { @@ -430,15 +525,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, err = f_getowner_uids(filp, arg); break; case F_GETSIG: - err = filp->f_owner.signum; + err = f_owner_sig(filp, 0, false); break; case F_SETSIG: - /* arg == 0 restores default behaviour. */ - if (!valid_signal(argi)) { - break; - } - err = 0; - filp->f_owner.signum = argi; + err = f_owner_sig(filp, argi, true); break; case F_GETLEASE: err = fcntl_getlease(filp); @@ -854,14 +944,19 @@ static void send_sigurg_to_task(struct task_struct *p, do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, type); } -int send_sigurg(struct fown_struct *fown) +int send_sigurg(struct file *file) { + struct fown_struct *fown; struct task_struct *p; enum pid_type type; struct pid *pid; unsigned long flags; int ret = 0; + fown = file_f_owner(file); + if (!fown) + return 0; + read_lock_irqsave(&fown->lock, flags); type = fown->pid_type; @@ -1037,13 +1132,16 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) } read_lock_irqsave(&fa->fa_lock, flags); if (fa->fa_file) { - fown = &fa->fa_file->f_owner; + fown = file_f_owner(fa->fa_file); + if (!fown) + goto next; /* Don't send SIGURG to processes which have not set a queued signum: SIGURG has its own default signalling mechanism. */ if (!(sig == SIGURG && fown->signum == 0)) send_sigio(fown, fa->fa_fd, band); } +next: read_unlock_irqrestore(&fa->fa_lock, flags); fa = rcu_dereference(fa->fa_next); } |