summaryrefslogtreecommitdiff
path: root/fs/fcntl.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-09-16 09:14:02 +0200
committerLinus Torvalds <torvalds@linux-foundation.org>2024-09-16 09:14:02 +0200
commit3352633ce6b221d64bf40644d412d9670e7d56e3 (patch)
treef74add2d0a46ac33034955c1fa8dcff1cedd7dc0 /fs/fcntl.c
parent2775df6e5e324be9dc375f7db2c8d3042df72bbf (diff)
parent24a988f75c8a5f16ef935c51039700e985767eb9 (diff)
downloadlwn-3352633ce6b221d64bf40644d412d9670e7d56e3.tar.gz
lwn-3352633ce6b221d64bf40644d412d9670e7d56e3.zip
Merge tag 'vfs-6.12.file' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs file updates from Christian Brauner: "This is the work to cleanup and shrink struct file significantly. Right now, (focusing on x86) struct file is 232 bytes. After this series struct file will be 184 bytes aka 3 cacheline and a spare 8 bytes for future extensions at the end of the struct. With struct file being as ubiquitous as it is this should make a difference for file heavy workloads and allow further optimizations in the future. - struct fown_struct was embedded into struct file letting it take up 32 bytes in total when really it shouldn't even be embedded in struct file in the first place. Instead, actual users of struct fown_struct now allocate the struct on demand. This frees up 24 bytes. - Move struct file_ra_state into the union containg the cleanup hooks and move f_iocb_flags out of the union. This closes a 4 byte hole we created earlier and brings struct file to 192 bytes. Which means struct file is 3 cachelines and we managed to shrink it by 40 bytes. - Reorder struct file so that nothing crosses a cacheline. I suspect that in the future we will end up reordering some members to mitigate false sharing issues or just because someone does actually provide really good perf data. - Shrinking struct file to 192 bytes is only part of the work. Files use a slab that is SLAB_TYPESAFE_BY_RCU and when a kmem cache is created with SLAB_TYPESAFE_BY_RCU the free pointer must be located outside of the object because the cache doesn't know what part of the memory can safely be overwritten as it may be needed to prevent object recycling. That has the consequence that SLAB_TYPESAFE_BY_RCU may end up adding a new cacheline. So this also contains work to add a new kmem_cache_create_rcu() function that allows the caller to specify an offset where the freelist pointer is supposed to be placed. Thus avoiding the implicit addition of a fourth cacheline. - And finally this removes the f_version member in struct file. The f_version member isn't particularly well-defined. It is mainly used as a cookie to detect concurrent seeks when iterating directories. But it is also abused by some subsystems for completely unrelated things. It is mostly a directory and filesystem specific thing that doesn't really need to live in struct file and with its wonky semantics it really lacks a specific function. For pipes, f_version is (ab)used to defer poll notifications until a write has happened. And struct pipe_inode_info is used by multiple struct files in their ->private_data so there's no chance of pushing that down into file->private_data without introducing another pointer indirection. But pipes don't rely on f_pos_lock so this adds a union into struct file encompassing f_pos_lock and a pipe specific f_pipe member that pipes can use. This union of course can be extended to other file types and is similar to what we do in struct inode already" * tag 'vfs-6.12.file' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (26 commits) fs: remove f_version pipe: use f_pipe fs: add f_pipe ubifs: store cookie in private data ufs: store cookie in private data udf: store cookie in private data proc: store cookie in private data ocfs2: store cookie in private data input: remove f_version abuse ext4: store cookie in private data ext2: store cookie in private data affs: store cookie in private data fs: add generic_llseek_cookie() fs: use must_set_pos() fs: add must_set_pos() fs: add vfs_setpos_cookie() s390: remove unused f_version ceph: remove unused f_version adi: remove unused f_version mm: Removed @freeptr_offset to prevent doc warning ...
Diffstat (limited to 'fs/fcntl.c')
-rw-r--r--fs/fcntl.c166
1 files changed, 132 insertions, 34 deletions
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 22ec683ad8f8..f6fde75a3bd5 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -33,6 +33,8 @@
#include <asm/siginfo.h>
#include <linux/uaccess.h>
+#include "internal.h"
+
#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
static int setfl(int fd, struct file * filp, unsigned int arg)
@@ -87,22 +89,64 @@ static int setfl(int fd, struct file * filp, unsigned int arg)
return error;
}
+/*
+ * Allocate an file->f_owner struct if it doesn't exist, handling racing
+ * allocations correctly.
+ */
+int file_f_owner_allocate(struct file *file)
+{
+ struct fown_struct *f_owner;
+
+ f_owner = file_f_owner(file);
+ if (f_owner)
+ return 0;
+
+ f_owner = kzalloc(sizeof(struct fown_struct), GFP_KERNEL);
+ if (!f_owner)
+ return -ENOMEM;
+
+ rwlock_init(&f_owner->lock);
+ f_owner->file = file;
+ /* If someone else raced us, drop our allocation. */
+ if (unlikely(cmpxchg(&file->f_owner, NULL, f_owner)))
+ kfree(f_owner);
+ return 0;
+}
+EXPORT_SYMBOL(file_f_owner_allocate);
+
+void file_f_owner_release(struct file *file)
+{
+ struct fown_struct *f_owner;
+
+ f_owner = file_f_owner(file);
+ if (f_owner) {
+ put_pid(f_owner->pid);
+ kfree(f_owner);
+ }
+}
+
static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
int force)
{
- write_lock_irq(&filp->f_owner.lock);
- if (force || !filp->f_owner.pid) {
- put_pid(filp->f_owner.pid);
- filp->f_owner.pid = get_pid(pid);
- filp->f_owner.pid_type = type;
+ struct fown_struct *f_owner;
+
+ f_owner = file_f_owner(filp);
+ if (WARN_ON_ONCE(!f_owner))
+ return;
+
+ write_lock_irq(&f_owner->lock);
+ if (force || !f_owner->pid) {
+ put_pid(f_owner->pid);
+ f_owner->pid = get_pid(pid);
+ f_owner->pid_type = type;
if (pid) {
const struct cred *cred = current_cred();
- filp->f_owner.uid = cred->uid;
- filp->f_owner.euid = cred->euid;
+ f_owner->uid = cred->uid;
+ f_owner->euid = cred->euid;
}
}
- write_unlock_irq(&filp->f_owner.lock);
+ write_unlock_irq(&f_owner->lock);
}
void __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
@@ -119,6 +163,8 @@ int f_setown(struct file *filp, int who, int force)
struct pid *pid = NULL;
int ret = 0;
+ might_sleep();
+
type = PIDTYPE_TGID;
if (who < 0) {
/* avoid overflow below */
@@ -129,6 +175,10 @@ int f_setown(struct file *filp, int who, int force)
who = -who;
}
+ ret = file_f_owner_allocate(filp);
+ if (ret)
+ return ret;
+
rcu_read_lock();
if (who) {
pid = find_vpid(who);
@@ -152,16 +202,21 @@ void f_delown(struct file *filp)
pid_t f_getown(struct file *filp)
{
pid_t pid = 0;
+ struct fown_struct *f_owner;
+
+ f_owner = file_f_owner(filp);
+ if (!f_owner)
+ return pid;
- read_lock_irq(&filp->f_owner.lock);
+ read_lock_irq(&f_owner->lock);
rcu_read_lock();
- if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) {
- pid = pid_vnr(filp->f_owner.pid);
- if (filp->f_owner.pid_type == PIDTYPE_PGID)
+ if (pid_task(f_owner->pid, f_owner->pid_type)) {
+ pid = pid_vnr(f_owner->pid);
+ if (f_owner->pid_type == PIDTYPE_PGID)
pid = -pid;
}
rcu_read_unlock();
- read_unlock_irq(&filp->f_owner.lock);
+ read_unlock_irq(&f_owner->lock);
return pid;
}
@@ -194,6 +249,10 @@ static int f_setown_ex(struct file *filp, unsigned long arg)
return -EINVAL;
}
+ ret = file_f_owner_allocate(filp);
+ if (ret)
+ return ret;
+
rcu_read_lock();
pid = find_vpid(owner.pid);
if (owner.pid && !pid)
@@ -210,13 +269,20 @@ static int f_getown_ex(struct file *filp, unsigned long arg)
struct f_owner_ex __user *owner_p = (void __user *)arg;
struct f_owner_ex owner = {};
int ret = 0;
+ struct fown_struct *f_owner;
+ enum pid_type pid_type = PIDTYPE_PID;
- read_lock_irq(&filp->f_owner.lock);
- rcu_read_lock();
- if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type))
- owner.pid = pid_vnr(filp->f_owner.pid);
- rcu_read_unlock();
- switch (filp->f_owner.pid_type) {
+ f_owner = file_f_owner(filp);
+ if (f_owner) {
+ read_lock_irq(&f_owner->lock);
+ rcu_read_lock();
+ if (pid_task(f_owner->pid, f_owner->pid_type))
+ owner.pid = pid_vnr(f_owner->pid);
+ rcu_read_unlock();
+ pid_type = f_owner->pid_type;
+ }
+
+ switch (pid_type) {
case PIDTYPE_PID:
owner.type = F_OWNER_TID;
break;
@@ -234,7 +300,8 @@ static int f_getown_ex(struct file *filp, unsigned long arg)
ret = -EINVAL;
break;
}
- read_unlock_irq(&filp->f_owner.lock);
+ if (f_owner)
+ read_unlock_irq(&f_owner->lock);
if (!ret) {
ret = copy_to_user(owner_p, &owner, sizeof(owner));
@@ -248,14 +315,18 @@ static int f_getown_ex(struct file *filp, unsigned long arg)
static int f_getowner_uids(struct file *filp, unsigned long arg)
{
struct user_namespace *user_ns = current_user_ns();
+ struct fown_struct *f_owner;
uid_t __user *dst = (void __user *)arg;
- uid_t src[2];
+ uid_t src[2] = {0, 0};
int err;
- read_lock_irq(&filp->f_owner.lock);
- src[0] = from_kuid(user_ns, filp->f_owner.uid);
- src[1] = from_kuid(user_ns, filp->f_owner.euid);
- read_unlock_irq(&filp->f_owner.lock);
+ f_owner = file_f_owner(filp);
+ if (f_owner) {
+ read_lock_irq(&f_owner->lock);
+ src[0] = from_kuid(user_ns, f_owner->uid);
+ src[1] = from_kuid(user_ns, f_owner->euid);
+ read_unlock_irq(&f_owner->lock);
+ }
err = put_user(src[0], &dst[0]);
err |= put_user(src[1], &dst[1]);
@@ -349,6 +420,30 @@ static long f_created_query(const struct file *filp)
return !!(filp->f_mode & FMODE_CREATED);
}
+static int f_owner_sig(struct file *filp, int signum, bool setsig)
+{
+ int ret = 0;
+ struct fown_struct *f_owner;
+
+ might_sleep();
+
+ if (setsig) {
+ if (!valid_signal(signum))
+ return -EINVAL;
+
+ ret = file_f_owner_allocate(filp);
+ if (ret)
+ return ret;
+ }
+
+ f_owner = file_f_owner(filp);
+ if (setsig)
+ f_owner->signum = signum;
+ else if (f_owner)
+ ret = f_owner->signum;
+ return ret;
+}
+
static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
struct file *filp)
{
@@ -430,15 +525,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
err = f_getowner_uids(filp, arg);
break;
case F_GETSIG:
- err = filp->f_owner.signum;
+ err = f_owner_sig(filp, 0, false);
break;
case F_SETSIG:
- /* arg == 0 restores default behaviour. */
- if (!valid_signal(argi)) {
- break;
- }
- err = 0;
- filp->f_owner.signum = argi;
+ err = f_owner_sig(filp, argi, true);
break;
case F_GETLEASE:
err = fcntl_getlease(filp);
@@ -854,14 +944,19 @@ static void send_sigurg_to_task(struct task_struct *p,
do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, type);
}
-int send_sigurg(struct fown_struct *fown)
+int send_sigurg(struct file *file)
{
+ struct fown_struct *fown;
struct task_struct *p;
enum pid_type type;
struct pid *pid;
unsigned long flags;
int ret = 0;
+ fown = file_f_owner(file);
+ if (!fown)
+ return 0;
+
read_lock_irqsave(&fown->lock, flags);
type = fown->pid_type;
@@ -1037,13 +1132,16 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
}
read_lock_irqsave(&fa->fa_lock, flags);
if (fa->fa_file) {
- fown = &fa->fa_file->f_owner;
+ fown = file_f_owner(fa->fa_file);
+ if (!fown)
+ goto next;
/* Don't send SIGURG to processes which have not set a
queued signum: SIGURG has its own default signalling
mechanism. */
if (!(sig == SIGURG && fown->signum == 0))
send_sigio(fown, fa->fa_fd, band);
}
+next:
read_unlock_irqrestore(&fa->fa_lock, flags);
fa = rcu_dereference(fa->fa_next);
}