summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2026-02-09 15:13:05 -0800
committer Linus Torvalds <torvalds@linux-foundation.org> 2026-02-09 15:13:05 -0800
commit 9e355113f02be17db573d579515dee63621b7c8b (patch)
tree 9d6d6dea7ccae97b5439a91d30eed16b5821cfb9
parent 3304b3fedddfb1357c7f9e25526b5a7899ee1f13 (diff)
parent 6cbfdf89470ef3c2110f376a507d135e7a7a7378 (diff)
download lwn-9e355113f02be17db573d579515dee63621b7c8b.tar.gz
lwn-9e355113f02be17db573d579515dee63621b7c8b.zip
Merge tag 'vfs-7.0-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull misc vfs updates from Christian Brauner:
 "This contains a mix of VFS cleanups, performance improvements, API
  fixes, documentation, and a deprecation notice.

  Scalability and performance:

   - Rework pid allocation to only take pidmap_lock once instead of
     twice during alloc_pid(), improving thread creation/teardown
     throughput by 10-16% depending on false-sharing luck. Pad the
     namespace refcount to reduce false-sharing

   - Track file lock presence via a flag in ->i_opflags instead of
     reading ->i_flctx, avoiding false-sharing with ->i_readcount on
     open/close hot paths. Measured 4-16% improvement on 24-core
     open-in-a-loop benchmarks

   - Use a consume fence in locks_inode_context() to match the
     store-release/load-consume idiom, eliminating a hardware fence on
     some architectures

   - Annotate cdev_lock with __cacheline_aligned_in_smp to prevent
     false-sharing

   - Remove a redundant DCACHE_MANAGED_DENTRY check in
     __follow_mount_rcu() that never fires since the caller already
     verifies it, eliminating a 100% mispredicted branch

   - Fix a 100% mispredicted likely() in devcgroup_inode_permission()
     that became wrong after a prior code reorder

  Bug fixes and correctness:

   - Make insert_inode_locked() wait for inode destruction instead of
     skipping, fixing a corner case where two matching inodes could
     exist in the hash

   - Move f_mode initialization before file_ref_init() in alloc_file()
     to respect the SLAB_TYPESAFE_BY_RCU ordering contract

   - Add a WARN_ON_ONCE guard in try_to_free_buffers() for folios with
     no buffers attached, preventing a null pointer dereference when
     AS_RELEASE_ALWAYS is set but no release_folio op exists

   - Fix select restart_block to store end_time as timespec64, avoiding
     truncation of tv_sec on 32-bit architectures

   - Make dump_inode() use get_kernel_nofault() to safely access inode
     and superblock fields, matching the dump_mapping() pattern

  API modernization:

   - Make posix_acl_to_xattr() allocate the buffer internally since
     every single caller was doing it anyway. Reduces boilerplate and
     unnecessary error checking across ~15 filesystems

   - Replace deprecated simple_strtoul() with kstrtoul() for the
     ihash_entries, dhash_entries, mhash_entries, and mphash_entries
     boot parameters, adding proper error handling

   - Convert chardev code to use guard(mutex) and __free(kfree) cleanup
     patterns

   - Replace min_t() with min() or umin() in VFS code to avoid silently
     truncating unsigned long to unsigned int

   - Gate LOOKUP_RCU assertions behind CONFIG_DEBUG_VFS since callers
     already check the flag

  Deprecation:

   - Begin deprecating legacy BSD process accounting (acct(2)). The
     interface has numerous footguns and better alternatives exist
     (eBPF)

  Documentation:

   - Fix and complete kernel-doc for struct export_operations, removing
     duplicated documentation between ReST and source

   - Fix kernel-doc warnings for __start_dirop() and ilookup5_nowait()

  Testing:

   - Add a kunit test for initramfs cpio handling of entries with
     filesize > PATH_MAX

  Misc:

   - Add missing <linux/init_task.h> include in fs_struct.c"

* tag 'vfs-7.0-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (28 commits)
  posix_acl: make posix_acl_to_xattr() alloc the buffer
  fs: make insert_inode_locked() wait for inode destruction
  initramfs_test: kunit test for cpio.filesize > PATH_MAX
  fs: improve dump_inode() to safely access inode fields
  fs: add <linux/init_task.h> for 'init_fs'
  docs: exportfs: Use source code struct documentation
  fs: move initializing f_mode before file_ref_init()
  exportfs: Complete kernel-doc for struct export_operations
  exportfs: Mark struct export_operations functions at kernel-doc
  exportfs: Fix kernel-doc output for get_name()
  acct(2): begin the deprecation of legacy BSD process accounting
  device_cgroup: remove branch hint after code refactor
  VFS: fix __start_dirop() kernel-doc warnings
  fs: Describe @isnew parameter in ilookup5_nowait()
  fs/namei: Remove redundant DCACHE_MANAGED_DENTRY check in __follow_mount_rcu
  fs: only assert on LOOKUP_RCU when built with CONFIG_DEBUG_VFS
  select: store end_time as timespec64 in restart block
  chardev: Switch to guard(mutex) and __free(kfree)
  namespace: Replace simple_strtoul with kstrtoul to parse boot params
  dcache: Replace simple_strtoul with kstrtoul in set_dhash_entries
  ...
-rw-r--r-- Documentation/filesystems/nfs/exporting.rst 42
-rw-r--r-- fs/9p/acl.c 16
-rw-r--r-- fs/btrfs/acl.c 10
-rw-r--r-- fs/buffer.c 6
-rw-r--r-- fs/ceph/acl.c 50
-rw-r--r-- fs/char_dev.c 19
-rw-r--r-- fs/dcache.c 5
-rw-r--r-- fs/exec.c 2
-rw-r--r-- fs/ext4/mballoc.c 3
-rw-r--r-- fs/ext4/resize.c 2
-rw-r--r-- fs/ext4/super.c 2
-rw-r--r-- fs/fat/dir.c 4
-rw-r--r-- fs/fat/file.c 3
-rw-r--r-- fs/file_table.c 10
-rw-r--r-- fs/fs_struct.c 1
-rw-r--r-- fs/fuse/acl.c 12
-rw-r--r-- fs/fuse/dev.c 2
-rw-r--r-- fs/fuse/file.c 8
-rw-r--r-- fs/gfs2/acl.c 13
-rw-r--r-- fs/inode.c 93
-rw-r--r-- fs/jfs/acl.c 9
-rw-r--r-- fs/locks.c 14
-rw-r--r-- fs/namei.c 8
-rw-r--r-- fs/namespace.c 10
-rw-r--r-- fs/ntfs3/xattr.c 6
-rw-r--r-- fs/orangefs/acl.c 8
-rw-r--r-- fs/posix_acl.c 21
-rw-r--r-- fs/select.c 12
-rw-r--r-- fs/splice.c 2
-rw-r--r-- include/linux/device_cgroup.h 2
-rw-r--r-- include/linux/exportfs.h 33
-rw-r--r-- include/linux/filelock.h 18
-rw-r--r-- include/linux/fs.h 1
-rw-r--r-- include/linux/ns/ns_common_types.h 4
-rw-r--r-- include/linux/posix_acl_xattr.h 5
-rw-r--r-- include/linux/restart_block.h 4
-rw-r--r-- init/Kconfig 7
-rw-r--r-- init/initramfs_test.c 48
-rw-r--r-- kernel/pid.c 131
39 files changed, 352 insertions, 294 deletions
diff --git a/Documentation/filesystems/nfs/exporting.rst b/Documentation/filesystems/nfs/exporting.rst
index de64d2d002a2..a01d9b9b5bc3 100644
--- a/Documentation/filesystems/nfs/exporting.rst
+++ b/Documentation/filesystems/nfs/exporting.rst
@@ -119,43 +119,11 @@ For a filesystem to be exportable it must:
A file system implementation declares that instances of the filesystem
are exportable by setting the s_export_op field in the struct
-super_block. This field must point to a "struct export_operations"
-struct which has the following members:
-
- encode_fh (mandatory)
- Takes a dentry and creates a filehandle fragment which may later be used
- to find or create a dentry for the same object.
-
- fh_to_dentry (mandatory)
- Given a filehandle fragment, this should find the implied object and
- create a dentry for it (possibly with d_obtain_alias).
-
- fh_to_parent (optional but strongly recommended)
- Given a filehandle fragment, this should find the parent of the
- implied object and create a dentry for it (possibly with
- d_obtain_alias). May fail if the filehandle fragment is too small.
-
- get_parent (optional but strongly recommended)
- When given a dentry for a directory, this should return a dentry for
- the parent. Quite possibly the parent dentry will have been allocated
- by d_alloc_anon. The default get_parent function just returns an error
- so any filehandle lookup that requires finding a parent will fail.
- ->lookup("..") is *not* used as a default as it can leave ".." entries
- in the dcache which are too messy to work with.
-
- get_name (optional)
- When given a parent dentry and a child dentry, this should find a name
- in the directory identified by the parent dentry, which leads to the
- object identified by the child dentry. If no get_name function is
- supplied, a default implementation is provided which uses vfs_readdir
- to find potential names, and matches inode numbers to find the correct
- match.
-
- flags
- Some filesystems may need to be handled differently than others. The
- export_operations struct also includes a flags field that allows the
- filesystem to communicate such information to nfsd. See the Export
- Operations Flags section below for more explanation.
+super_block. This field must point to a struct export_operations
+which has the following members:
+
+.. kernel-doc:: include/linux/exportfs.h
+ :identifiers: struct export_operations
A filehandle fragment consists of an array of 1 or more 4byte words,
together with a one byte "type".
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 633da5e37299..ae7e7cf7523a 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -167,17 +167,11 @@ int v9fs_iop_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
if (retval)
goto err_out;
- size = posix_acl_xattr_size(acl->a_count);
-
- value = kzalloc(size, GFP_NOFS);
+ value = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_NOFS);
if (!value) {
retval = -ENOMEM;
goto err_out;
}
-
- retval = posix_acl_to_xattr(&init_user_ns, acl, value, size);
- if (retval < 0)
- goto err_out;
}
/*
@@ -257,13 +251,10 @@ static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl)
return 0;
/* Set a setxattr request to server */
- size = posix_acl_xattr_size(acl->a_count);
- buffer = kmalloc(size, GFP_KERNEL);
+ buffer = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_KERNEL);
if (!buffer)
return -ENOMEM;
- retval = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
- if (retval < 0)
- goto err_free_out;
+
switch (type) {
case ACL_TYPE_ACCESS:
name = XATTR_NAME_POSIX_ACL_ACCESS;
@@ -275,7 +266,6 @@ static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl)
BUG();
}
retval = v9fs_fid_xattr_set(fid, name, buffer, size, 0);
-err_free_out:
kfree(buffer);
return retval;
}
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index c336e2ab7f8a..e55b686fe1ab 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -57,7 +57,8 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu)
int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
struct posix_acl *acl, int type)
{
- int ret, size = 0;
+ int ret;
+ size_t size = 0;
const char *name;
char AUTO_KFREE(value);
@@ -77,20 +78,15 @@ int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
if (acl) {
unsigned int nofs_flag;
- size = posix_acl_xattr_size(acl->a_count);
/*
* We're holding a transaction handle, so use a NOFS memory
* allocation context to avoid deadlock if reclaim happens.
*/
nofs_flag = memalloc_nofs_save();
- value = kmalloc(size, GFP_KERNEL);
+ value = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_KERNEL);
memalloc_nofs_restore(nofs_flag);
if (!value)
return -ENOMEM;
-
- ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
- if (ret < 0)
- return ret;
}
if (trans)
diff --git a/fs/buffer.c b/fs/buffer.c
index 838c0c571022..fd53b806ab7e 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2354,7 +2354,7 @@ bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
if (!head)
return false;
blocksize = head->b_size;
- to = min_t(unsigned, folio_size(folio) - from, count);
+ to = min(folio_size(folio) - from, count);
to = from + to;
if (from < blocksize && to > folio_size(folio) - blocksize)
return false;
@@ -2948,6 +2948,10 @@ bool try_to_free_buffers(struct folio *folio)
if (folio_test_writeback(folio))
return false;
+ /* Misconfigured folio check */
+ if (WARN_ON_ONCE(!folio_buffers(folio)))
+ return true;
+
if (mapping == NULL) { /* can this still happen? */
ret = drop_buffers(folio, &buffers_to_free);
goto out;
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 1564eacc253d..85d3dd48b167 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -90,7 +90,8 @@ retry:
int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
struct posix_acl *acl, int type)
{
- int ret = 0, size = 0;
+ int ret = 0;
+ size_t size = 0;
const char *name = NULL;
char *value = NULL;
struct iattr newattrs;
@@ -126,16 +127,11 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
}
if (acl) {
- size = posix_acl_xattr_size(acl->a_count);
- value = kmalloc(size, GFP_NOFS);
+ value = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_NOFS);
if (!value) {
ret = -ENOMEM;
goto out;
}
-
- ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
- if (ret < 0)
- goto out_free;
}
if (new_mode != old_mode) {
@@ -172,7 +168,7 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
struct posix_acl *acl, *default_acl;
size_t val_size1 = 0, val_size2 = 0;
struct ceph_pagelist *pagelist = NULL;
- void *tmp_buf = NULL;
+ void *tmp_buf1 = NULL, *tmp_buf2 = NULL;
int err;
err = posix_acl_create(dir, mode, &default_acl, &acl);
@@ -192,15 +188,7 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
if (!default_acl && !acl)
return 0;
- if (acl)
- val_size1 = posix_acl_xattr_size(acl->a_count);
- if (default_acl)
- val_size2 = posix_acl_xattr_size(default_acl->a_count);
-
err = -ENOMEM;
- tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL);
- if (!tmp_buf)
- goto out_err;
pagelist = ceph_pagelist_alloc(GFP_KERNEL);
if (!pagelist)
goto out_err;
@@ -213,34 +201,39 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
if (acl) {
size_t len = strlen(XATTR_NAME_POSIX_ACL_ACCESS);
+
+ err = -ENOMEM;
+ tmp_buf1 = posix_acl_to_xattr(&init_user_ns, acl,
+ &val_size1, GFP_KERNEL);
+ if (!tmp_buf1)
+ goto out_err;
err = ceph_pagelist_reserve(pagelist, len + val_size1 + 8);
if (err)
goto out_err;
ceph_pagelist_encode_string(pagelist, XATTR_NAME_POSIX_ACL_ACCESS,
len);
- err = posix_acl_to_xattr(&init_user_ns, acl,
- tmp_buf, val_size1);
- if (err < 0)
- goto out_err;
ceph_pagelist_encode_32(pagelist, val_size1);
- ceph_pagelist_append(pagelist, tmp_buf, val_size1);
+ ceph_pagelist_append(pagelist, tmp_buf1, val_size1);
}
if (default_acl) {
size_t len = strlen(XATTR_NAME_POSIX_ACL_DEFAULT);
+
+ err = -ENOMEM;
+ tmp_buf2 = posix_acl_to_xattr(&init_user_ns, default_acl,
+ &val_size2, GFP_KERNEL);
+ if (!tmp_buf2)
+ goto out_err;
err = ceph_pagelist_reserve(pagelist, len + val_size2 + 8);
if (err)
goto out_err;
ceph_pagelist_encode_string(pagelist,
XATTR_NAME_POSIX_ACL_DEFAULT, len);
- err = posix_acl_to_xattr(&init_user_ns, default_acl,
- tmp_buf, val_size2);
- if (err < 0)
- goto out_err;
ceph_pagelist_encode_32(pagelist, val_size2);
- ceph_pagelist_append(pagelist, tmp_buf, val_size2);
+ ceph_pagelist_append(pagelist, tmp_buf2, val_size2);
}
- kfree(tmp_buf);
+ kfree(tmp_buf1);
+ kfree(tmp_buf2);
as_ctx->acl = acl;
as_ctx->default_acl = default_acl;
@@ -250,7 +243,8 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
out_err:
posix_acl_release(acl);
posix_acl_release(default_acl);
- kfree(tmp_buf);
+ kfree(tmp_buf1);
+ kfree(tmp_buf2);
if (pagelist)
ceph_pagelist_release(pagelist);
return err;
diff --git a/fs/char_dev.c b/fs/char_dev.c
index c2ddb998f3c9..bf7b32650e54 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -10,6 +10,7 @@
#include <linux/kdev_t.h>
#include <linux/slab.h>
#include <linux/string.h>
+#include <linux/cleanup.h>
#include <linux/major.h>
#include <linux/errno.h>
@@ -97,7 +98,8 @@ static struct char_device_struct *
__register_chrdev_region(unsigned int major, unsigned int baseminor,
int minorct, const char *name)
{
- struct char_device_struct *cd, *curr, *prev = NULL;
+ struct char_device_struct *cd __free(kfree) = NULL;
+ struct char_device_struct *curr, *prev = NULL;
int ret;
int i;
@@ -117,14 +119,14 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor,
if (cd == NULL)
return ERR_PTR(-ENOMEM);
- mutex_lock(&chrdevs_lock);
+ guard(mutex)(&chrdevs_lock);
if (major == 0) {
ret = find_dynamic_major();
if (ret < 0) {
pr_err("CHRDEV \"%s\" dynamic allocation region is full\n",
name);
- goto out;
+ return ERR_PTR(ret);
}
major = ret;
}
@@ -144,7 +146,7 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor,
if (curr->baseminor >= baseminor + minorct)
break;
- goto out;
+ return ERR_PTR(ret);
}
cd->major = major;
@@ -160,12 +162,7 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor,
prev->next = cd;
}
- mutex_unlock(&chrdevs_lock);
- return cd;
-out:
- mutex_unlock(&chrdevs_lock);
- kfree(cd);
- return ERR_PTR(ret);
+ return_ptr(cd);
}
static struct char_device_struct *
@@ -343,7 +340,7 @@ void __unregister_chrdev(unsigned int major, unsigned int baseminor,
kfree(cd);
}
-static DEFINE_SPINLOCK(cdev_lock);
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(cdev_lock);
static struct kobject *cdev_get(struct cdev *p)
{
diff --git a/fs/dcache.c b/fs/dcache.c
index 66dd1bb830d1..7088df2d042c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -3237,10 +3237,7 @@ EXPORT_SYMBOL(d_parent_ino);
static __initdata unsigned long dhash_entries;
static int __init set_dhash_entries(char *str)
{
- if (!str)
- return 0;
- dhash_entries = simple_strtoul(str, &str, 0);
- return 1;
+ return kstrtoul(str, 0, &dhash_entries) == 0;
}
__setup("dhash_entries=", set_dhash_entries);
diff --git a/fs/exec.c b/fs/exec.c
index 9d5ebc9d15b0..d0606e53376f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -555,7 +555,7 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm)
return -E2BIG;
while (len > 0) {
- unsigned int bytes_to_copy = min_t(unsigned int, len,
+ unsigned int bytes_to_copy = min(len,
min_not_zero(offset_in_page(pos), PAGE_SIZE));
struct page *page;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 56d50fd3310b..e817a758801d 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4276,8 +4276,7 @@ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
* get the corresponding group metadata to work with.
* For this we have goto again loop.
*/
- thisgrp_len = min_t(unsigned int, (unsigned int)len,
- EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff));
+ thisgrp_len = min(len, EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff));
clen = EXT4_NUM_B2C(sbi, thisgrp_len);
if (!ext4_sb_block_valid(sb, NULL, block, thisgrp_len)) {
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 050f26168d97..76842f0957b5 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1479,7 +1479,7 @@ static void ext4_update_super(struct super_block *sb,
/* Update the global fs size fields */
sbi->s_groups_count += flex_gd->count;
- sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
+ sbi->s_blockfile_groups = min(sbi->s_groups_count,
(EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
/* Update the reserved block counts only once the new group is
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a6241ffb8639..a8d2460b527a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4837,7 +4837,7 @@ static int ext4_check_geometry(struct super_block *sb,
return -EINVAL;
}
sbi->s_groups_count = blocks_count;
- sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
+ sbi->s_blockfile_groups = min(sbi->s_groups_count,
(EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) !=
le32_to_cpu(es->s_inodes_count)) {
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 0e97ef6c2327..07d95f1442c8 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -1355,7 +1355,7 @@ found:
/* Fill the long name slots. */
for (i = 0; i < long_bhs; i++) {
- int copy = min_t(int, sb->s_blocksize - offset, size);
+ int copy = umin(sb->s_blocksize - offset, size);
memcpy(bhs[i]->b_data + offset, slots, copy);
mark_buffer_dirty_inode(bhs[i], dir);
offset = 0;
@@ -1366,7 +1366,7 @@ found:
err = fat_sync_bhs(bhs, long_bhs);
if (!err && i < nr_bhs) {
/* Fill the short name slot. */
- int copy = min_t(int, sb->s_blocksize - offset, size);
+ int copy = umin(sb->s_blocksize - offset, size);
memcpy(bhs[i]->b_data + offset, slots, copy);
mark_buffer_dirty_inode(bhs[i], dir);
if (IS_DIRSYNC(dir))
diff --git a/fs/fat/file.c b/fs/fat/file.c
index afc0e3ad6536..124d9c5431c8 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -141,8 +141,7 @@ static int fat_ioctl_fitrim(struct inode *inode, unsigned long arg)
if (copy_from_user(&range, user_range, sizeof(range)))
return -EFAULT;
- range.minlen = max_t(unsigned int, range.minlen,
- bdev_discard_granularity(sb->s_bdev));
+ range.minlen = max(range.minlen, bdev_discard_granularity(sb->s_bdev));
err = fat_trim_fs(inode, &range);
if (err < 0)
diff --git a/fs/file_table.c b/fs/file_table.c
index cd4a3db4659a..34244fccf2ed 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -176,6 +176,11 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
f->f_flags = flags;
f->f_mode = OPEN_FMODE(flags);
+ /*
+ * Disable permission and pre-content events for all files by default.
+ * They may be enabled later by fsnotify_open_perm_and_set_mode().
+ */
+ file_set_fsnotify_mode(f, FMODE_NONOTIFY_PERM);
f->f_op = NULL;
f->f_mapping = NULL;
@@ -197,11 +202,6 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
* refcount bumps we should reinitialize the reused file first.
*/
file_ref_init(&f->f_ref, 1);
- /*
- * Disable permission and pre-content events for all files by default.
- * They may be enabled later by fsnotify_open_perm_and_set_mode().
- */
- file_set_fsnotify_mode(f, FMODE_NONOTIFY_PERM);
return 0;
}
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index b8c46c5a38a0..394875d06fd6 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -6,6 +6,7 @@
#include <linux/path.h>
#include <linux/slab.h>
#include <linux/fs_struct.h>
+#include <linux/init_task.h>
#include "internal.h"
/*
diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c
index 8f484b105f13..cbde6ac1add3 100644
--- a/fs/fuse/acl.c
+++ b/fs/fuse/acl.c
@@ -122,20 +122,16 @@ int fuse_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
* them to be refreshed the next time they are used,
* and it also updates i_ctime.
*/
- size_t size = posix_acl_xattr_size(acl->a_count);
+ size_t size;
void *value;
- if (size > PAGE_SIZE)
- return -E2BIG;
-
- value = kmalloc(size, GFP_KERNEL);
+ value = posix_acl_to_xattr(fc->user_ns, acl, &size, GFP_KERNEL);
if (!value)
return -ENOMEM;
- ret = posix_acl_to_xattr(fc->user_ns, acl, value, size);
- if (ret < 0) {
+ if (size > PAGE_SIZE) {
kfree(value);
- return ret;
+ return -E2BIG;
}
/*
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 6d59cbc877c6..a30c8b57d478 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1813,7 +1813,7 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
goto out_iput;
folio_offset = ((index - folio->index) << PAGE_SHIFT) + offset;
- nr_bytes = min_t(unsigned, num, folio_size(folio) - folio_offset);
+ nr_bytes = min(num, folio_size(folio) - folio_offset);
nr_pages = (offset + nr_bytes + PAGE_SIZE - 1) >> PAGE_SHIFT;
err = fuse_copy_folio(cs, &folio, folio_offset, nr_bytes, 0);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 78fa46cfc636..dffd454e30e2 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1323,10 +1323,8 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
static inline unsigned int fuse_wr_pages(loff_t pos, size_t len,
unsigned int max_pages)
{
- return min_t(unsigned int,
- ((pos + len - 1) >> PAGE_SHIFT) -
- (pos >> PAGE_SHIFT) + 1,
- max_pages);
+ return min(((pos + len - 1) >> PAGE_SHIFT) - (pos >> PAGE_SHIFT) + 1,
+ max_pages);
}
static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii)
@@ -1607,7 +1605,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
struct folio *folio = page_folio(pages[i]);
unsigned int offset = start +
(folio_page_idx(folio, pages[i]) << PAGE_SHIFT);
- unsigned int len = min_t(unsigned int, ret, PAGE_SIZE - start);
+ unsigned int len = umin(ret, PAGE_SIZE - start);
ap->descs[ap->num_folios].offset = offset;
ap->descs[ap->num_folios].length = len;
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 443640e6fb9c..a5b60778b91c 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -83,21 +83,14 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu)
int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
int error;
- size_t len;
- char *data;
+ size_t len = 0;
+ char *data = NULL;
const char *name = gfs2_acl_name(type);
if (acl) {
- len = posix_acl_xattr_size(acl->a_count);
- data = kmalloc(len, GFP_NOFS);
+ data = posix_acl_to_xattr(&init_user_ns, acl, &len, GFP_NOFS);
if (data == NULL)
return -ENOMEM;
- error = posix_acl_to_xattr(&init_user_ns, acl, data, len);
- if (error < 0)
- goto out;
- } else {
- data = NULL;
- len = 0;
}
error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS);
diff --git a/fs/inode.c b/fs/inode.c
index 1d0474745e77..dae43a8de7e0 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1028,19 +1028,20 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
return freed;
}
-static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked);
+static void __wait_on_freeing_inode(struct inode *inode, bool hash_locked, bool rcu_locked);
+
/*
* Called with the inode lock held.
*/
static struct inode *find_inode(struct super_block *sb,
struct hlist_head *head,
int (*test)(struct inode *, void *),
- void *data, bool is_inode_hash_locked,
+ void *data, bool hash_locked,
bool *isnew)
{
struct inode *inode = NULL;
- if (is_inode_hash_locked)
+ if (hash_locked)
lockdep_assert_held(&inode_hash_lock);
else
lockdep_assert_not_held(&inode_hash_lock);
@@ -1054,7 +1055,7 @@ repeat:
continue;
spin_lock(&inode->i_lock);
if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) {
- __wait_on_freeing_inode(inode, is_inode_hash_locked);
+ __wait_on_freeing_inode(inode, hash_locked, true);
goto repeat;
}
if (unlikely(inode_state_read(inode) & I_CREATING)) {
@@ -1078,11 +1079,11 @@ repeat:
*/
static struct inode *find_inode_fast(struct super_block *sb,
struct hlist_head *head, unsigned long ino,
- bool is_inode_hash_locked, bool *isnew)
+ bool hash_locked, bool *isnew)
{
struct inode *inode = NULL;
- if (is_inode_hash_locked)
+ if (hash_locked)
lockdep_assert_held(&inode_hash_lock);
else
lockdep_assert_not_held(&inode_hash_lock);
@@ -1096,7 +1097,7 @@ repeat:
continue;
spin_lock(&inode->i_lock);
if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) {
- __wait_on_freeing_inode(inode, is_inode_hash_locked);
+ __wait_on_freeing_inode(inode, hash_locked, true);
goto repeat;
}
if (unlikely(inode_state_read(inode) & I_CREATING)) {
@@ -1832,16 +1833,13 @@ int insert_inode_locked(struct inode *inode)
while (1) {
struct inode *old = NULL;
spin_lock(&inode_hash_lock);
+repeat:
hlist_for_each_entry(old, head, i_hash) {
if (old->i_ino != ino)
continue;
if (old->i_sb != sb)
continue;
spin_lock(&old->i_lock);
- if (inode_state_read(old) & (I_FREEING | I_WILL_FREE)) {
- spin_unlock(&old->i_lock);
- continue;
- }
break;
}
if (likely(!old)) {
@@ -1852,6 +1850,11 @@ int insert_inode_locked(struct inode *inode)
spin_unlock(&inode_hash_lock);
return 0;
}
+ if (inode_state_read(old) & (I_FREEING | I_WILL_FREE)) {
+ __wait_on_freeing_inode(old, true, false);
+ old = NULL;
+ goto repeat;
+ }
if (unlikely(inode_state_read(old) & I_CREATING)) {
spin_unlock(&old->i_lock);
spin_unlock(&inode_hash_lock);
@@ -2522,16 +2525,18 @@ EXPORT_SYMBOL(inode_needs_sync);
* wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
* will DTRT.
*/
-static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked)
+static void __wait_on_freeing_inode(struct inode *inode, bool hash_locked, bool rcu_locked)
{
struct wait_bit_queue_entry wqe;
struct wait_queue_head *wq_head;
+ VFS_BUG_ON(!hash_locked && !rcu_locked);
+
/*
* Handle racing against evict(), see that routine for more details.
*/
if (unlikely(inode_unhashed(inode))) {
- WARN_ON(is_inode_hash_locked);
+ WARN_ON(hash_locked);
spin_unlock(&inode->i_lock);
return;
}
@@ -2539,23 +2544,22 @@ static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_lock
wq_head = inode_bit_waitqueue(&wqe, inode, __I_NEW);
prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE);
spin_unlock(&inode->i_lock);
- rcu_read_unlock();
- if (is_inode_hash_locked)
+ if (rcu_locked)
+ rcu_read_unlock();
+ if (hash_locked)
spin_unlock(&inode_hash_lock);
schedule();
finish_wait(wq_head, &wqe.wq_entry);
- if (is_inode_hash_locked)
+ if (hash_locked)
spin_lock(&inode_hash_lock);
- rcu_read_lock();
+ if (rcu_locked)
+ rcu_read_lock();
}
static __initdata unsigned long ihash_entries;
static int __init set_ihash_entries(char *str)
{
- if (!str)
- return 0;
- ihash_entries = simple_strtoul(str, &str, 0);
- return 1;
+ return kstrtoul(str, 0, &ihash_entries) == 0;
}
__setup("ihash_entries=", set_ihash_entries);
@@ -3005,24 +3009,45 @@ umode_t mode_strip_sgid(struct mnt_idmap *idmap,
EXPORT_SYMBOL(mode_strip_sgid);
#ifdef CONFIG_DEBUG_VFS
-/*
- * Dump an inode.
- *
- * TODO: add a proper inode dumping routine, this is a stub to get debug off the
- * ground.
+/**
+ * dump_inode - dump an inode.
+ * @inode: inode to dump
+ * @reason: reason for dumping
*
- * TODO: handle getting to fs type with get_kernel_nofault()?
- * See dump_mapping() above.
+ * If inode is an invalid pointer, we don't want to crash accessing it,
+ * so probe everything depending on it carefully with get_kernel_nofault().
*/
void dump_inode(struct inode *inode, const char *reason)
{
- struct super_block *sb = inode->i_sb;
+ struct super_block *sb;
+ struct file_system_type *s_type;
+ const char *fs_name_ptr;
+ char fs_name[32] = {};
+ umode_t mode;
+ unsigned short opflags;
+ unsigned int flags;
+ unsigned int state;
+ int count;
+
+ if (get_kernel_nofault(sb, &inode->i_sb) ||
+ get_kernel_nofault(mode, &inode->i_mode) ||
+ get_kernel_nofault(opflags, &inode->i_opflags) ||
+ get_kernel_nofault(flags, &inode->i_flags)) {
+ pr_warn("%s: unreadable inode:%px\n", reason, inode);
+ return;
+ }
- pr_warn("%s encountered for inode %px\n"
- "fs %s mode %ho opflags 0x%hx flags 0x%x state 0x%x count %d\n",
- reason, inode, sb->s_type->name, inode->i_mode, inode->i_opflags,
- inode->i_flags, inode_state_read_once(inode), atomic_read(&inode->i_count));
-}
+ state = inode_state_read_once(inode);
+ count = atomic_read(&inode->i_count);
+ if (!sb ||
+ get_kernel_nofault(s_type, &sb->s_type) || !s_type ||
+ get_kernel_nofault(fs_name_ptr, &s_type->name) || !fs_name_ptr ||
+ strncpy_from_kernel_nofault(fs_name, fs_name_ptr, sizeof(fs_name) - 1) < 0)
+ strscpy(fs_name, "<unknown, sb unreadable>");
+
+ pr_warn("%s: inode:%px fs:%s mode:%ho opflags:%#x flags:%#x state:%#x count:%d\n",
+ reason, inode, fs_name, mode, opflags, flags, state, count);
+}
EXPORT_SYMBOL(dump_inode);
#endif
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 1de3602c98de..16b71a23ff1e 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -61,7 +61,7 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type,
{
char *ea_name;
int rc;
- int size = 0;
+ size_t size = 0;
char *value = NULL;
switch (type) {
@@ -76,16 +76,11 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type,
}
if (acl) {
- size = posix_acl_xattr_size(acl->a_count);
- value = kmalloc(size, GFP_KERNEL);
+ value = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_KERNEL);
if (!value)
return -ENOMEM;
- rc = posix_acl_to_xattr(&init_user_ns, acl, value, size);
- if (rc < 0)
- goto out;
}
rc = __jfs_setxattr(tid, inode, ea_name, value, size, 0);
-out:
kfree(value);
if (!rc)
diff --git a/fs/locks.c b/fs/locks.c
index cf1968b01bcb..3ea25d3a780f 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -178,7 +178,6 @@ locks_get_lock_context(struct inode *inode, int type)
{
struct file_lock_context *ctx;
- /* paired with cmpxchg() below */
ctx = locks_inode_context(inode);
if (likely(ctx) || type == F_UNLCK)
goto out;
@@ -196,7 +195,18 @@ locks_get_lock_context(struct inode *inode, int type)
* Assign the pointer if it's not already assigned. If it is, then
* free the context we just allocated.
*/
- if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
+ spin_lock(&inode->i_lock);
+ if (!(inode->i_opflags & IOP_FLCTX)) {
+ VFS_BUG_ON_INODE(inode->i_flctx, inode);
+ WRITE_ONCE(inode->i_flctx, ctx);
+ /*
+ * Paired with locks_inode_context().
+ */
+ smp_store_release(&inode->i_opflags, inode->i_opflags | IOP_FLCTX);
+ spin_unlock(&inode->i_lock);
+ } else {
+ VFS_BUG_ON_INODE(!inode->i_flctx, inode);
+ spin_unlock(&inode->i_lock);
kmem_cache_free(flctx_cache, ctx);
ctx = locks_inode_context(inode);
}
diff --git a/fs/namei.c b/fs/namei.c
index 76bc569ace8e..b28ecb699f32 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -879,7 +879,7 @@ static bool try_to_unlazy(struct nameidata *nd)
{
struct dentry *parent = nd->path.dentry;
- BUG_ON(!(nd->flags & LOOKUP_RCU));
+ VFS_BUG_ON(!(nd->flags & LOOKUP_RCU));
if (unlikely(nd->flags & LOOKUP_CACHED)) {
drop_links(nd);
@@ -919,7 +919,8 @@ out:
static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry)
{
int res;
- BUG_ON(!(nd->flags & LOOKUP_RCU));
+
+ VFS_BUG_ON(!(nd->flags & LOOKUP_RCU));
if (unlikely(nd->flags & LOOKUP_CACHED)) {
drop_links(nd);
@@ -1631,9 +1632,6 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path)
struct dentry *dentry = path->dentry;
unsigned int flags = dentry->d_flags;
- if (likely(!(flags & DCACHE_MANAGED_DENTRY)))
- return true;
-
if (unlikely(nd->flags & LOOKUP_NO_XDEV))
return false;
diff --git a/fs/namespace.c b/fs/namespace.c
index 1d2089ffb6ab..0cc8c2757500 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -49,20 +49,14 @@ static unsigned int mp_hash_shift __ro_after_init;
static __initdata unsigned long mhash_entries;
static int __init set_mhash_entries(char *str)
{
- if (!str)
- return 0;
- mhash_entries = simple_strtoul(str, &str, 0);
- return 1;
+ return kstrtoul(str, 0, &mhash_entries) == 0;
}
__setup("mhash_entries=", set_mhash_entries);
static __initdata unsigned long mphash_entries;
static int __init set_mphash_entries(char *str)
{
- if (!str)
- return 0;
- mphash_entries = simple_strtoul(str, &str, 0);
- return 1;
+ return kstrtoul(str, 0, &mphash_entries) == 0;
}
__setup("mphash_entries=", set_mphash_entries);
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
index c93df55e98d0..37a69a75ce68 100644
--- a/fs/ntfs3/xattr.c
+++ b/fs/ntfs3/xattr.c
@@ -641,13 +641,9 @@ static noinline int ntfs_set_acl_ex(struct mnt_idmap *idmap,
value = NULL;
flags = XATTR_REPLACE;
} else {
- size = posix_acl_xattr_size(acl->a_count);
- value = kmalloc(size, GFP_NOFS);
+ value = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_NOFS);
if (!value)
return -ENOMEM;
- err = posix_acl_to_xattr(&init_user_ns, acl, value, size);
- if (err < 0)
- goto out;
flags = 0;
}
diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c
index 5aefb705bcc8..a01ef0c1b1bf 100644
--- a/fs/orangefs/acl.c
+++ b/fs/orangefs/acl.c
@@ -90,14 +90,9 @@ int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
type);
if (acl) {
- size = posix_acl_xattr_size(acl->a_count);
- value = kmalloc(size, GFP_KERNEL);
+ value = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_KERNEL);
if (!value)
return -ENOMEM;
-
- error = posix_acl_to_xattr(&init_user_ns, acl, value, size);
- if (error < 0)
- goto out;
}
gossip_debug(GOSSIP_ACL_DEBUG,
@@ -111,7 +106,6 @@ int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
*/
error = orangefs_inode_setxattr(inode, name, value, size, 0);
-out:
kfree(value);
if (!error)
set_cached_acl(inode, type, acl);
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 768f027c1428..4ef6f9d2b8d6 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -829,19 +829,19 @@ EXPORT_SYMBOL (posix_acl_from_xattr);
/*
* Convert from in-memory to extended attribute representation.
*/
-int
+void *
posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
- void *buffer, size_t size)
+ size_t *sizep, gfp_t gfp)
{
- struct posix_acl_xattr_header *ext_acl = buffer;
+ struct posix_acl_xattr_header *ext_acl;
struct posix_acl_xattr_entry *ext_entry;
- int real_size, n;
+ size_t size;
+ int n;
- real_size = posix_acl_xattr_size(acl->a_count);
- if (!buffer)
- return real_size;
- if (real_size > size)
- return -ERANGE;
+ size = posix_acl_xattr_size(acl->a_count);
+ ext_acl = kmalloc(size, gfp);
+ if (!ext_acl)
+ return NULL;
ext_entry = (void *)(ext_acl + 1);
ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
@@ -864,7 +864,8 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
break;
}
}
- return real_size;
+ *sizep = size;
+ return ext_acl;
}
EXPORT_SYMBOL (posix_acl_to_xattr);
diff --git a/fs/select.c b/fs/select.c
index 65019b8ba3f7..78a1508c84d3 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -1038,14 +1038,11 @@ static long do_restart_poll(struct restart_block *restart_block)
{
struct pollfd __user *ufds = restart_block->poll.ufds;
int nfds = restart_block->poll.nfds;
- struct timespec64 *to = NULL, end_time;
+ struct timespec64 *to = NULL;
int ret;
- if (restart_block->poll.has_timeout) {
- end_time.tv_sec = restart_block->poll.tv_sec;
- end_time.tv_nsec = restart_block->poll.tv_nsec;
- to = &end_time;
- }
+ if (restart_block->poll.has_timeout)
+ to = &restart_block->poll.end_time;
ret = do_sys_poll(ufds, nfds, to);
@@ -1077,8 +1074,7 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
restart_block->poll.nfds = nfds;
if (timeout_msecs >= 0) {
- restart_block->poll.tv_sec = end_time.tv_sec;
- restart_block->poll.tv_nsec = end_time.tv_nsec;
+ restart_block->poll.end_time = end_time;
restart_block->poll.has_timeout = 1;
} else
restart_block->poll.has_timeout = 0;
diff --git a/fs/splice.c b/fs/splice.c
index d338fe56b50b..5fb07c01936f 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1467,7 +1467,7 @@ static ssize_t iter_to_pipe(struct iov_iter *from,
n = DIV_ROUND_UP(left + start, PAGE_SIZE);
for (i = 0; i < n; i++) {
- int size = min_t(int, left, PAGE_SIZE - start);
+ int size = umin(left, PAGE_SIZE - start);
buf.page = pages[i];
buf.offset = start;
diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h
index 0864773a57e8..822085bc2d20 100644
--- a/include/linux/device_cgroup.h
+++ b/include/linux/device_cgroup.h
@@ -21,7 +21,7 @@ static inline int devcgroup_inode_permission(struct inode *inode, int mask)
if (likely(!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode)))
return 0;
- if (likely(!inode->i_rdev))
+ if (!inode->i_rdev)
return 0;
if (S_ISBLK(inode->i_mode))
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index f0cf2714ec52..262e24d83313 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -201,9 +201,9 @@ struct handle_to_path_ctx {
* @commit_metadata: commit metadata changes to stable storage
*
* See Documentation/filesystems/nfs/exporting.rst for details on how to use
- * this interface correctly.
+ * this interface correctly and the definition of the flags.
*
- * encode_fh:
+ * @encode_fh:
* @encode_fh should store in the file handle fragment @fh (using at most
* @max_len bytes) information that can be used by @decode_fh to recover the
* file referred to by the &struct dentry @de. If @flag has CONNECTABLE bit
@@ -215,7 +215,7 @@ struct handle_to_path_ctx {
* greater than @max_len*4 bytes). On error @max_len contains the minimum
* size(in 4 byte unit) needed to encode the file handle.
*
- * fh_to_dentry:
+ * @fh_to_dentry:
* @fh_to_dentry is given a &struct super_block (@sb) and a file handle
* fragment (@fh, @fh_len). It should return a &struct dentry which refers
* to the same file that the file handle fragment refers to. If it cannot,
@@ -227,31 +227,44 @@ struct handle_to_path_ctx {
* created with d_alloc_root. The caller can then find any other extant
* dentries by following the d_alias links.
*
- * fh_to_parent:
+ * @fh_to_parent:
* Same as @fh_to_dentry, except that it returns a pointer to the parent
* dentry if it was encoded into the filehandle fragment by @encode_fh.
*
- * get_name:
+ * @get_name:
* @get_name should find a name for the given @child in the given @parent
* directory. The name should be stored in the @name (with the
- * understanding that it is already pointing to a %NAME_MAX+1 sized
+ * understanding that it is already pointing to a %NAME_MAX + 1 sized
* buffer). get_name() should return %0 on success, a negative error code
* on error. @get_name will be called without @parent->i_rwsem held.
*
- * get_parent:
+ * @get_parent:
* @get_parent should find the parent directory for the given @child which
* is also a directory. In the event that it cannot be found, or storage
* space cannot be allocated, a %ERR_PTR should be returned.
*
- * permission:
+ * @permission:
* Allow filesystems to specify a custom permission function.
*
- * open:
+ * @open:
* Allow filesystems to specify a custom open function.
*
- * commit_metadata:
+ * @commit_metadata:
* @commit_metadata should commit metadata changes to stable storage.
*
+ * @get_uuid:
+ * Get a filesystem unique signature exposed to clients.
+ *
+ * @map_blocks:
+ * Map and, if necessary, allocate blocks for a layout.
+ *
+ * @commit_blocks:
+ * Commit blocks in a layout once the client is done with them.
+ *
+ * @flags:
+ * Allows the filesystem to communicate to nfsd that it may want to do things
+ * differently when dealing with it.
+ *
* Locking rules:
* get_parent is called with child->d_inode->i_rwsem down
* get_name is not (which is possibly inconsistent)
diff --git a/include/linux/filelock.h b/include/linux/filelock.h
index 2f5e5588ee07..d2c9740e26a8 100644
--- a/include/linux/filelock.h
+++ b/include/linux/filelock.h
@@ -242,7 +242,14 @@ bool locks_owner_has_blockers(struct file_lock_context *flctx,
static inline struct file_lock_context *
locks_inode_context(const struct inode *inode)
{
- return smp_load_acquire(&inode->i_flctx);
+ /*
+ * Paired with smp_store_release in locks_get_lock_context().
+ *
+ * Ensures ->i_flctx will be visible if we spotted the flag.
+ */
+ if (likely(!(smp_load_acquire(&inode->i_opflags) & IOP_FLCTX)))
+ return NULL;
+ return READ_ONCE(inode->i_flctx);
}
#else /* !CONFIG_FILE_LOCKING */
@@ -469,7 +476,7 @@ static inline int break_lease(struct inode *inode, unsigned int mode)
* could end up racing with tasks trying to set a new lease on this
* file.
*/
- flctx = READ_ONCE(inode->i_flctx);
+ flctx = locks_inode_context(inode);
if (!flctx)
return 0;
smp_mb();
@@ -488,7 +495,7 @@ static inline int break_deleg(struct inode *inode, unsigned int flags)
* could end up racing with tasks trying to set a new lease on this
* file.
*/
- flctx = READ_ONCE(inode->i_flctx);
+ flctx = locks_inode_context(inode);
if (!flctx)
return 0;
smp_mb();
@@ -533,8 +540,11 @@ static inline int break_deleg_wait(struct delegated_inode *di)
static inline int break_layout(struct inode *inode, bool wait)
{
+ struct file_lock_context *flctx;
+
smp_mb();
- if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) {
+ flctx = locks_inode_context(inode);
+ if (flctx && !list_empty_careful(&flctx->flc_lease)) {
unsigned int flags = LEASE_BREAK_LAYOUT;
if (!wait)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ca31bc9308a3..73911f961c7e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -631,6 +631,7 @@ is_uncached_acl(struct posix_acl *acl)
#define IOP_MGTIME 0x0020
#define IOP_CACHED_LINK 0x0040
#define IOP_FASTPERM_MAY_EXEC 0x0080
+#define IOP_FLCTX 0x0100
/*
* Inode state bits. Protected by inode->i_lock
diff --git a/include/linux/ns/ns_common_types.h b/include/linux/ns/ns_common_types.h
index b332b019b29c..0014fbc1c626 100644
--- a/include/linux/ns/ns_common_types.h
+++ b/include/linux/ns/ns_common_types.h
@@ -108,11 +108,13 @@ extern const struct proc_ns_operations utsns_operations;
* @ns_tree: namespace tree nodes and active reference count
*/
struct ns_common {
+ struct {
+ refcount_t __ns_ref; /* do not use directly */
+ } ____cacheline_aligned_in_smp;
u32 ns_type;
struct dentry *stashed;
const struct proc_ns_operations *ops;
unsigned int inum;
- refcount_t __ns_ref; /* do not use directly */
union {
struct ns_tree;
struct rcu_head ns_rcu;
diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h
index e86f3b731da2..9e1892525eac 100644
--- a/include/linux/posix_acl_xattr.h
+++ b/include/linux/posix_acl_xattr.h
@@ -44,8 +44,9 @@ posix_acl_from_xattr(struct user_namespace *user_ns, const void *value,
}
#endif
-int posix_acl_to_xattr(struct user_namespace *user_ns,
- const struct posix_acl *acl, void *buffer, size_t size);
+extern void *posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
+ size_t *sizep, gfp_t gfp);
+
static inline const char *posix_acl_xattr_name(int type)
{
switch (type) {
diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h
index 67d2bf579942..9b262109726d 100644
--- a/include/linux/restart_block.h
+++ b/include/linux/restart_block.h
@@ -6,6 +6,7 @@
#define __LINUX_RESTART_BLOCK_H
#include <linux/compiler.h>
+#include <linux/time64.h>
#include <linux/types.h>
struct __kernel_timespec;
@@ -50,8 +51,7 @@ struct restart_block {
struct pollfd __user *ufds;
int nfds;
int has_timeout;
- unsigned long tv_sec;
- unsigned long tv_nsec;
+ struct timespec64 end_time;
} poll;
};
};
diff --git a/init/Kconfig b/init/Kconfig
index fa79feb8fe57..160c1c4ef253 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -624,8 +624,9 @@ config SCHED_HW_PRESSURE
arch_update_hw_pressure() and arch_scale_thermal_pressure().
config BSD_PROCESS_ACCT
- bool "BSD Process Accounting"
+ bool "BSD Process Accounting (DEPRECATED)"
depends on MULTIUSER
+ default n
help
If you say Y here, a user level program will be able to instruct the
kernel (via a special system call) to write process accounting
@@ -635,7 +636,9 @@ config BSD_PROCESS_ACCT
command name, memory usage, controlling terminal etc. (the complete
list is in the struct acct in <file:include/linux/acct.h>). It is
up to the user level program to do useful things with this
- information. This is generally a good idea, so say Y.
+ information. This mechanism is antiquated and has significant
+ scalability issues. You probably want to use eBPF instead. Say
+ N unless you really need this.
config BSD_PROCESS_ACCT_V3
bool "BSD Process Accounting version 3 file format"
diff --git a/init/initramfs_test.c b/init/initramfs_test.c
index 5d2db455e60c..beb6e3cf7808 100644
--- a/init/initramfs_test.c
+++ b/init/initramfs_test.c
@@ -447,6 +447,53 @@ out:
kfree(tbufs);
}
+static void __init initramfs_test_fname_path_max(struct kunit *test)
+{
+ char *err;
+ size_t len;
+ struct kstat st0, st1;
+ char fdata[] = "this file data will not be unpacked";
+ struct test_fname_path_max {
+ char fname_oversize[PATH_MAX + 1];
+ char fname_ok[PATH_MAX];
+ char cpio_src[(CPIO_HDRLEN + PATH_MAX + 3 + sizeof(fdata)) * 2];
+ } *tbufs = kzalloc(sizeof(struct test_fname_path_max), GFP_KERNEL);
+ struct initramfs_test_cpio c[] = { {
+ .magic = "070701",
+ .ino = 1,
+ .mode = S_IFDIR | 0777,
+ .nlink = 1,
+ .namesize = sizeof(tbufs->fname_oversize),
+ .fname = tbufs->fname_oversize,
+ .filesize = sizeof(fdata),
+ .data = fdata,
+ }, {
+ .magic = "070701",
+ .ino = 2,
+ .mode = S_IFDIR | 0777,
+ .nlink = 1,
+ .namesize = sizeof(tbufs->fname_ok),
+ .fname = tbufs->fname_ok,
+ } };
+
+ memset(tbufs->fname_oversize, '/', sizeof(tbufs->fname_oversize) - 1);
+ memset(tbufs->fname_ok, '/', sizeof(tbufs->fname_ok) - 1);
+ memcpy(tbufs->fname_oversize, "fname_oversize",
+ sizeof("fname_oversize") - 1);
+ memcpy(tbufs->fname_ok, "fname_ok", sizeof("fname_ok") - 1);
+ len = fill_cpio(c, ARRAY_SIZE(c), tbufs->cpio_src);
+
+ /* unpack skips over fname_oversize instead of returning an error */
+ err = unpack_to_rootfs(tbufs->cpio_src, len);
+ KUNIT_EXPECT_NULL(test, err);
+
+ KUNIT_EXPECT_EQ(test, init_stat("fname_oversize", &st0, 0), -ENOENT);
+ KUNIT_EXPECT_EQ(test, init_stat("fname_ok", &st1, 0), 0);
+ KUNIT_EXPECT_EQ(test, init_rmdir("fname_ok"), 0);
+
+ kfree(tbufs);
+}
+
/*
* The kunit_case/_suite struct cannot be marked as __initdata as this will be
* used in debugfs to retrieve results after test has run.
@@ -459,6 +506,7 @@ static struct kunit_case __refdata initramfs_test_cases[] = {
KUNIT_CASE(initramfs_test_hardlink),
KUNIT_CASE(initramfs_test_many),
KUNIT_CASE(initramfs_test_fname_pad),
+ KUNIT_CASE(initramfs_test_fname_path_max),
{},
};
diff --git a/kernel/pid.c b/kernel/pid.c
index a31771bc89c1..f45ae56db7da 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -159,58 +159,86 @@ void free_pids(struct pid **pids)
free_pid(pids[tmp]);
}
-struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
- size_t set_tid_size)
+struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
+ size_t arg_set_tid_size)
{
+ int set_tid[MAX_PID_NS_LEVEL + 1] = {};
+ int pid_max[MAX_PID_NS_LEVEL + 1] = {};
struct pid *pid;
enum pid_type type;
int i, nr;
struct pid_namespace *tmp;
struct upid *upid;
int retval = -ENOMEM;
+ bool retried_preload;
/*
- * set_tid_size contains the size of the set_tid array. Starting at
+ * arg_set_tid_size contains the size of the arg_set_tid array. Starting at
* the most nested currently active PID namespace it tells alloc_pid()
* which PID to set for a process in that most nested PID namespace
- * up to set_tid_size PID namespaces. It does not have to set the PID
- * for a process in all nested PID namespaces but set_tid_size must
+ * up to arg_set_tid_size PID namespaces. It does not have to set the PID
+ * for a process in all nested PID namespaces but arg_set_tid_size must
* never be greater than the current ns->level + 1.
*/
- if (set_tid_size > ns->level + 1)
+ if (arg_set_tid_size > ns->level + 1)
return ERR_PTR(-EINVAL);
+ /*
+ * Prep before we take locks:
+ *
+ * 1. allocate and fill in pid struct
+ */
pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
if (!pid)
return ERR_PTR(retval);
- tmp = ns;
+ get_pid_ns(ns);
pid->level = ns->level;
+ refcount_set(&pid->count, 1);
+ spin_lock_init(&pid->lock);
+ for (type = 0; type < PIDTYPE_MAX; ++type)
+ INIT_HLIST_HEAD(&pid->tasks[type]);
+ init_waitqueue_head(&pid->wait_pidfd);
+ INIT_HLIST_HEAD(&pid->inodes);
- for (i = ns->level; i >= 0; i--) {
- int tid = 0;
- int pid_max = READ_ONCE(tmp->pid_max);
+ /*
+ * 2. perm check checkpoint_restore_ns_capable()
+ *
+ * This records the pid_max found at each level so that any later code
+ * which needs it uses the same value.
+ */
+ for (tmp = ns, i = ns->level; i >= 0; i--) {
+ pid_max[ns->level - i] = READ_ONCE(tmp->pid_max);
- if (set_tid_size) {
- tid = set_tid[ns->level - i];
+ if (arg_set_tid_size) {
+ int tid = set_tid[ns->level - i] = arg_set_tid[ns->level - i];
retval = -EINVAL;
- if (tid < 1 || tid >= pid_max)
- goto out_free;
+ if (tid < 1 || tid >= pid_max[ns->level - i])
+ goto out_abort;
/*
* Also fail if a PID != 1 is requested and
* no PID 1 exists.
*/
if (tid != 1 && !tmp->child_reaper)
- goto out_free;
+ goto out_abort;
retval = -EPERM;
if (!checkpoint_restore_ns_capable(tmp->user_ns))
- goto out_free;
- set_tid_size--;
+ goto out_abort;
+ arg_set_tid_size--;
}
- idr_preload(GFP_KERNEL);
- spin_lock(&pidmap_lock);
+ tmp = tmp->parent;
+ }
+
+ /*
+ * Prep is done, id allocation goes here:
+ */
+ retried_preload = false;
+ idr_preload(GFP_KERNEL);
+ spin_lock(&pidmap_lock);
+ for (tmp = ns, i = ns->level; i >= 0;) {
+ int tid = set_tid[ns->level - i];
if (tid) {
nr = idr_alloc(&tmp->idr, NULL, tid,
@@ -220,6 +248,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
* already in use. Return EEXIST in that case.
*/
if (nr == -ENOSPC)
+
nr = -EEXIST;
} else {
int pid_min = 1;
@@ -235,19 +264,42 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
* a partially initialized PID (see below).
*/
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
- pid_max, GFP_ATOMIC);
+ pid_max[ns->level - i], GFP_ATOMIC);
+ if (nr == -ENOSPC)
+ nr = -EAGAIN;
}
- spin_unlock(&pidmap_lock);
- idr_preload_end();
- if (nr < 0) {
- retval = (nr == -ENOSPC) ? -EAGAIN : nr;
+ if (unlikely(nr < 0)) {
+ /*
+ * Preload more memory if idr_alloc{,_cyclic} failed with -ENOMEM.
+ *
+ * The IDR API only allows us to preload memory for one call, while we may end
+ * up doing several under pidmap_lock with GFP_ATOMIC. The situation may be
+ * salvageable with GFP_KERNEL. But make sure to not loop indefinitely if preload
+ * did not help (the routine unfortunately returns void, so we have no idea
+ * if it got anywhere).
+ *
+ * The lock can be safely dropped and picked up as historically pid allocation
+ * for different namespaces was *not* atomic -- we try to hold on to it the
+ * entire time only for performance reasons.
+ */
+ if (nr == -ENOMEM && !retried_preload) {
+ spin_unlock(&pidmap_lock);
+ idr_preload_end();
+ retried_preload = true;
+ idr_preload(GFP_KERNEL);
+ spin_lock(&pidmap_lock);
+ continue;
+ }
+ retval = nr;
goto out_free;
}
pid->numbers[i].nr = nr;
pid->numbers[i].ns = tmp;
tmp = tmp->parent;
+ i--;
+ retried_preload = false;
}
/*
@@ -257,25 +309,15 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
* is what we have exposed to userspace for a long time and it is
* documented behavior for pid namespaces. So we can't easily
* change it even if there were an error code better suited.
+ *
+ * This can't be done earlier because we need to preserve other
+ * error conditions.
*/
retval = -ENOMEM;
-
- get_pid_ns(ns);
- refcount_set(&pid->count, 1);
- spin_lock_init(&pid->lock);
- for (type = 0; type < PIDTYPE_MAX; ++type)
- INIT_HLIST_HEAD(&pid->tasks[type]);
-
- init_waitqueue_head(&pid->wait_pidfd);
- INIT_HLIST_HEAD(&pid->inodes);
-
- upid = pid->numbers + ns->level;
- idr_preload(GFP_KERNEL);
- spin_lock(&pidmap_lock);
- if (!(ns->pid_allocated & PIDNS_ADDING))
- goto out_unlock;
+ if (unlikely(!(ns->pid_allocated & PIDNS_ADDING)))
+ goto out_free;
pidfs_add_pid(pid);
- for ( ; upid >= pid->numbers; --upid) {
+ for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) {
/* Make the PID visible to find_pid_ns. */
idr_replace(&upid->ns->idr, pid, upid->nr);
upid->ns->pid_allocated++;
@@ -286,13 +328,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
return pid;
-out_unlock:
- spin_unlock(&pidmap_lock);
- idr_preload_end();
- put_pid_ns(ns);
-
out_free:
- spin_lock(&pidmap_lock);
while (++i <= ns->level) {
upid = pid->numbers + i;
idr_remove(&upid->ns->idr, upid->nr);
@@ -303,7 +339,10 @@ out_free:
idr_set_cursor(&ns->idr, 0);
spin_unlock(&pidmap_lock);
+ idr_preload_end();
+out_abort:
+ put_pid_ns(ns);
kmem_cache_free(ns->pid_cachep, pid);
return ERR_PTR(retval);
}