diff options
Diffstat (limited to 'include/linux')
95 files changed, 1376 insertions, 598 deletions
diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h index 3ffa5341dce2..2014288ca2f4 100644 --- a/include/linux/alarmtimer.h +++ b/include/linux/alarmtimer.h @@ -42,11 +42,14 @@ struct alarm { void *data; }; +static __always_inline ktime_t alarm_get_expires(struct alarm *alarm) +{ + return alarm->node.expires; +} + void alarm_init(struct alarm *alarm, enum alarmtimer_type type, void (*function)(struct alarm *, ktime_t)); -void alarm_start(struct alarm *alarm, ktime_t start); -void alarm_start_relative(struct alarm *alarm, ktime_t start); -void alarm_restart(struct alarm *alarm); +bool alarm_start_timer(struct alarm *alarm, ktime_t expires, bool relative); int alarm_try_to_cancel(struct alarm *alarm); int alarm_cancel(struct alarm *alarm); diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h index 9946276aff73..6c54d5c0d21f 100644 --- a/include/linux/amba/bus.h +++ b/include/linux/amba/bus.h @@ -71,11 +71,6 @@ struct amba_device { unsigned int cid; struct amba_cs_uci_id uci; unsigned int irq[AMBA_NR_IRQS]; - /* - * Driver name to force a match. Do not set directly, because core - * frees it. Use driver_set_override() to set or clear it. - */ - const char *driver_override; }; struct amba_driver { diff --git a/include/linux/auxiliary_bus.h b/include/linux/auxiliary_bus.h index bc09b55e3682..4e1ad8ccbcdd 100644 --- a/include/linux/auxiliary_bus.h +++ b/include/linux/auxiliary_bus.h @@ -62,6 +62,9 @@ * @sysfs.irqs: irqs xarray contains irq indices which are used by the device, * @sysfs.lock: Synchronize irq sysfs creation, * @sysfs.irq_dir_exists: whether "irqs" directory exists, + * @registration_data_rust: private data owned by the registering (parent) + * driver; valid for as long as the device is + * registered with the driver core, * * An auxiliary_device represents a part of its parent device's functionality. 
* It is given a name that, combined with the registering drivers @@ -148,6 +151,7 @@ struct auxiliary_device { struct mutex lock; /* Synchronize irq sysfs creation */ bool irq_dir_exists; } sysfs; + void *registration_data_rust; }; /** diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index a06b93446d10..4f1084937315 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -26,6 +26,7 @@ enum wb_state { WB_writeback_running, /* Writeback is in progress */ WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */ WB_start_all, /* nr_pages == 0 (all) work pending */ + WB_start_dontcache, /* dontcache writeback pending */ }; enum wb_stat_item { @@ -33,6 +34,7 @@ enum wb_stat_item { WB_WRITEBACK, WB_DIRTIED, WB_WRITTEN, + WB_DONTCACHE_DIRTY, NR_WB_STAT_ITEMS }; @@ -55,6 +57,7 @@ enum wb_reason { */ WB_REASON_FORKER_THREAD, WB_REASON_FOREIGN_FLUSH, + WB_REASON_DONTCACHE, WB_REASON_MAX, }; diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 65abd5ab8836..2c77e383e737 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -25,6 +25,9 @@ struct linux_binprm { struct page *page[MAX_ARG_PAGES]; #endif struct mm_struct *mm; + struct mm_struct *old_mm; /* replaced address space, freed by setup_new_exec() */ + /* user_ns published to task->exec_state at execve, narrowed by would_dump(). */ + struct user_namespace *user_ns; unsigned long p; /* current top of mem */ unsigned int /* Should an execfd be passed to userspace? */ diff --git a/include/linux/bio.h b/include/linux/bio.h index dc17780d6c1e..8300d5565e36 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -703,20 +703,6 @@ static inline bool bioset_initialized(struct bio_set *bs) return bs->bio_slab != NULL; } -/* - * Mark a bio as polled. Note that for async polled IO, the caller must - * expect -EWOULDBLOCK if we cannot allocate a request (or other resources). 
- * We cannot block waiting for requests on polled IO, as those completions - * must be found by the caller. This is different than IRQ driven IO, where - * it's safe to wait for IO to complete. - */ -static inline void bio_set_polled(struct bio *bio, struct kiocb *kiocb) -{ - bio->bi_opf |= REQ_POLLED; - if (kiocb->ki_flags & IOCB_NOWAIT) - bio->bi_opf |= REQ_NOWAIT; -} - static inline void bio_clear_polled(struct bio *bio) { bio->bi_opf &= ~REQ_POLLED; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cd191c5fdb0a..64efc3fdb716 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -31,6 +31,7 @@ #include <linux/static_call.h> #include <linux/memcontrol.h> #include <linux/cfi.h> +#include <linux/xattr.h> #include <asm/rqspinlock.h> struct bpf_verifier_env; @@ -1918,6 +1919,8 @@ struct bpf_mount_opts { u64 delegate_maps; u64 delegate_progs; u64 delegate_attachs; + + struct simple_xattr_cache xa_cache; }; struct bpf_token { diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index e4939e33b4b5..8b23bc9a244c 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -46,7 +46,6 @@ enum bh_state_bits { struct page; struct buffer_head; struct address_space; -typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate); /* * Historically, a buffer_head was used to map a single block @@ -55,7 +54,7 @@ typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate); * is the bio, and buffer_heads are used for extracting block * mappings (via a get_block_t call), for tracking state within * a folio (via a folio_mapping) and for wrapping bio submission - * for backward compatibility reasons (e.g. submit_bh). + * for backward compatibility reasons (e.g. bh_submit). 
*/ struct buffer_head { unsigned long b_state; /* buffer state bitmap (see above) */ @@ -70,8 +69,7 @@ struct buffer_head { char *b_data; /* pointer to data within the page */ struct block_device *b_bdev; - bh_end_io_t *b_end_io; /* I/O completion */ - void *b_private; /* reserved for b_end_io */ + void *b_private; /* reserved for bio_end_io */ struct list_head b_assoc_buffers; /* associated with another mapping */ struct mapping_metadata_bhs *b_mmb; /* head of the list of metadata bhs * this buffer is associated with */ @@ -203,7 +201,12 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size); struct buffer_head *create_empty_buffers(struct folio *folio, unsigned long blocksize, unsigned long b_state); void end_buffer_read_sync(struct buffer_head *bh, int uptodate); -void end_buffer_write_sync(struct buffer_head *bh, int uptodate); +bool bio_endio_bh(struct bio *bio, struct buffer_head **bhp); + +/* Completion routines suitable for passing to bh_submit() */ +void bh_end_read(struct bio *bio); +void bh_end_write(struct bio *bio); +void bh_end_async_write(struct bio *bio); /* Things to do with metadata buffers list */ void mmb_mark_buffer_dirty(struct buffer_head *bh, struct mapping_metadata_bhs *mmb); @@ -218,7 +221,6 @@ static inline void clean_bdev_bh_alias(struct buffer_head *bh) clean_bdev_aliases(bh->b_bdev, bh->b_blocknr, 1); } -void mark_buffer_async_write(struct buffer_head *bh); void __wait_on_buffer(struct buffer_head *); wait_queue_head_t *bh_waitq_head(struct buffer_head *bh); struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block, @@ -239,7 +241,7 @@ void __lock_buffer(struct buffer_head *bh); int sync_dirty_buffer(struct buffer_head *bh); int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags); void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags); -void submit_bh(blk_opf_t, struct buffer_head *); +void bh_submit(struct buffer_head *, blk_opf_t, bio_end_io_t); void 
write_boundary_block(struct block_device *bdev, sector_t bblock, unsigned blocksize); int bh_uptodate_or_lock(struct buffer_head *bh); diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index c8f4f0a0b874..fc879ac4cc4f 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -89,6 +89,7 @@ int populate_cache_leaves(unsigned int cpu); int cache_setup_acpi(unsigned int cpu); bool last_level_cache_is_valid(unsigned int cpu); bool last_level_cache_is_shared(unsigned int cpu_x, unsigned int cpu_y); +struct cacheinfo *get_cpu_cacheinfo_llc(unsigned int cpu); int fetch_cache_info(unsigned int cpu); int detect_cache_attributes(unsigned int cpu); #ifndef CONFIG_ACPI_PPTT diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h index b1ba97f6c9ad..f54770f110bc 100644 --- a/include/linux/cdx/cdx_bus.h +++ b/include/linux/cdx/cdx_bus.h @@ -137,9 +137,6 @@ struct cdx_controller { * @enabled: is this bus enabled * @msi_dev_id: MSI Device ID associated with CDX device * @num_msi: Number of MSI's supported by the device - * @driver_override: driver name to force a match; do not set directly, - * because core frees it; use driver_set_override() to - * set or clear it. 
* @irqchip_lock: lock to synchronize irq/msi configuration * @msi_write_pending: MSI write pending for this device */ @@ -165,7 +162,6 @@ struct cdx_device { bool enabled; u32 msi_dev_id; u32 num_msi; - const char *driver_override; struct mutex irqchip_lock; bool msi_write_pending; }; diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index ea95ca4bc11c..b1b5698cbf1b 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -397,7 +397,8 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond __DEFINE_GUARD_LOCK_PTR(_name, _T) #define DEFINE_GUARD(_name, _type, _lock, _unlock) \ - DEFINE_CLASS(_name, _type, if (_T) { _unlock; }, ({ _lock; _T; }), _type _T); \ + static __always_inline __nonnull_args(1) _type class_##_name##_constructor(_type _T); \ + DEFINE_CLASS(_name, _type, _unlock, ({ _lock; _T; }), _type _T); \ DEFINE_CLASS_IS_GUARD(_name) #define DEFINE_GUARD_COND_4(_name, _ext, _lock, _cond) \ @@ -491,13 +492,14 @@ typedef struct { \ static __always_inline void class_##_name##_destructor(class_##_name##_t *_T) \ __no_context_analysis \ { \ - if (_T->lock) { _unlock; } \ + _unlock; \ } \ \ __DEFINE_GUARD_LOCK_PTR(_name, &_T->lock) #define __DEFINE_LOCK_GUARD_1(_name, _type, ...) 
\ -static __always_inline class_##_name##_t class_##_name##_constructor(_type *l) \ +static __always_inline __nonnull_args(1) \ +class_##_name##_t class_##_name##_constructor(_type *l) \ __no_context_analysis \ { \ class_##_name##_t _t = { .lock = l }, *_T = &_t; \ diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 7c38190b10bf..283d7297aa79 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -32,6 +32,21 @@ struct module; #include <vdso/clocksource.h> /** + * struct clocksource_hw_snapshot - Snapshot for the underlying hardware counter of derived + * clocksources like kvmclock or Hyper-V scaled TSC + * @hw_cycles: The hardware counter value + * @hw_csid: Clocksource ID of the hardware counter + * + * Such clocksources must implement the read_snapshot() callback and fill in the + * hardware counter value, the clocksource ID of the hardware counter and derive + * the actual clocksource cycles from @hw_cycles to provide an atomic snapshot + */ +struct clocksource_hw_snapshot { + u64 hw_cycles; + enum clocksource_ids hw_csid; +}; + +/** * struct clocksource - hardware abstraction for a free running counter * Provides mostly state-free accessors to the underlying hardware. * This is the structure used for system time. @@ -72,6 +87,14 @@ struct module; * @flags: Flags describing special properties * @base: Hardware abstraction for clock on which a clocksource * is based + * @read_snapshot: Extended @read() function for clocksources such as + * kvmclock or the Hyper-V scaled TSC where the actual + * clocksource value for timekeeping is calculated from an + * underlying hardware counter. Returns the timekeeping + * relevant cycle value and stores the raw value of the + * underlying counter from which it was calculated + * including the clocksource ID of that counter in the + * clocksource hardware snapshot. 
* @enable: Optional function to enable the clocksource * @disable: Optional function to disable the clocksource * @suspend: Optional suspend function for the clocksource @@ -113,6 +136,7 @@ struct clocksource { unsigned long flags; struct clocksource_base *base; + u64 (*read_snapshot)(struct clocksource *cs, struct clocksource_hw_snapshot *chs); int (*enable)(struct clocksource *cs); void (*disable)(struct clocksource *cs); void (*suspend)(struct clocksource *cs); @@ -236,8 +260,9 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec); */ extern int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq); -extern void -__clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq); +extern int +__devm_clocksource_register_scale(struct device *dev, struct clocksource *cs, + u32 scale, u32 freq); /* * Don't call this unless you are a default clocksource @@ -258,14 +283,16 @@ static inline int clocksource_register_khz(struct clocksource *cs, u32 khz) return __clocksource_register_scale(cs, 1000, khz); } -static inline void __clocksource_update_freq_hz(struct clocksource *cs, u32 hz) +static inline int devm_clocksource_register_hz(struct device *dev, + struct clocksource *cs, u32 hz) { - __clocksource_update_freq_scale(cs, 1, hz); + return __devm_clocksource_register_scale(dev, cs, 1, hz); } -static inline void __clocksource_update_freq_khz(struct clocksource *cs, u32 khz) +static inline int devm_clocksource_register_khz(struct device *dev, + struct clocksource *cs, u32 khz) { - __clocksource_update_freq_scale(cs, 1000, khz); + return __devm_clocksource_register_scale(dev, cs, 1000, khz); } #ifdef CONFIG_ARCH_CLOCKSOURCE_INIT diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index 527e4e136020..e606ed6c7539 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -5,15 +5,6 @@ /* Compiler specific definitions for Clang compiler */ -/* - * Clang prior to 17 is 
being silly and considers many __cleanup() variables - * as unused (because they are, their sole purpose is to go out of scope). - * - * https://github.com/llvm/llvm-project/commit/877210faa447f4cc7db87812f8ed80e398fedd61 - */ -#undef __cleanup -#define __cleanup(func) __maybe_unused __attribute__((__cleanup__(func))) - /* all clang versions usable with the kernel support KASAN ABI version 5 */ #define KASAN_ABI_VERSION 5 @@ -137,10 +128,10 @@ #define __diag_clang_23(s) #endif -#define __diag_clang_13(s) __diag(s) +#define __diag_clang_all(s) __diag(s) #define __diag_ignore_all(option, comment) \ - __diag_clang(13, ignore, option) + __diag_clang(all, ignore, option) /* * clang has horrible behavior with "g" or "rm" constraints for asm diff --git a/include/linux/compiler-context-analysis.h b/include/linux/compiler-context-analysis.h index a9317571e6af..8302ebc2ea8c 100644 --- a/include/linux/compiler-context-analysis.h +++ b/include/linux/compiler-context-analysis.h @@ -39,12 +39,14 @@ # define __assumes_shared_ctx_lock(...) __attribute__((assert_shared_capability(__VA_ARGS__))) /** - * __guarded_by - struct member and globals attribute, declares variable - * only accessible within active context + * __guarded_by() - struct member and globals attribute, declares variable + * only accessible within active context + * @...: context lock instance pointer(s) * * Declares that the struct member or global variable is only accessible within - * the context entered by the given context lock. Read operations on the data - * require shared access, while write operations require exclusive access. + * the context entered by the given context lock(s). Read operations on the data + * require shared access to at least one of the context locks, while write + * operations require exclusive access to all listed context locks. * * .. 
code-block:: c * @@ -52,17 +54,24 @@ * spinlock_t lock; * long counter __guarded_by(&lock); * }; + * + * struct some_state { + * spinlock_t lock1, lock2; + * long counter __guarded_by(&lock1, &lock2); + * }; */ # define __guarded_by(...) __attribute__((guarded_by(__VA_ARGS__))) /** - * __pt_guarded_by - struct member and globals attribute, declares pointed-to - * data only accessible within active context + * __pt_guarded_by() - struct member and globals attribute, declares pointed-to + * data only accessible within active context + * @...: context lock instance pointer(s) * * Declares that the data pointed to by the struct member pointer or global * pointer is only accessible within the context entered by the given context - * lock. Read operations on the data require shared access, while write - * operations require exclusive access. + * lock(s). Read operations on the data require shared access to at least one + * of the context locks, while write operations require exclusive access to all + * listed context locks. * * .. code-block:: c * @@ -70,6 +79,11 @@ * spinlock_t lock; * long *counter __pt_guarded_by(&lock); * }; + * + * struct some_state { + * spinlock_t lock1, lock2; + * long *counter __pt_guarded_by(&lock1, &lock2); + * }; */ # define __pt_guarded_by(...) __attribute__((pt_guarded_by(__VA_ARGS__))) diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index 836a50f5917a..476c4c560d17 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -231,6 +231,15 @@ #define noinline __attribute__((__noinline__)) /* + * Note: deliberately not named '__nonnull', to avoid clashing with glibc's + * __nonnull() when kernel and userspace headers are combined. + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Attributes.html#index-nonnull + * clang: https://clang.llvm.org/docs/AttributeReference.html#nonnull + */ +#define __nonnull_args(x...) 
__attribute__((__nonnull__(x))) + +/* * Optional: only supported since gcc >= 8 * Optional: not supported by clang * diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 369966598a2c..c5921f139007 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -634,6 +634,9 @@ struct ftrace_likely_data { #else #define __unqual_scalar_typeof(x) __typeof_unqual__(x) #endif + +#include <asm/percpu_types.h> + #endif /* !__ASSEMBLY__ */ /* diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 68861da4cf7c..7b38ee2e7913 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -5,6 +5,7 @@ #include <linux/types.h> #include <linux/mm.h> #include <linux/fs.h> +#include <linux/sched/coredump.h> #include <asm/siginfo.h> #ifdef CONFIG_COREDUMP @@ -20,7 +21,10 @@ struct coredump_params { const kernel_siginfo_t *siginfo; struct file *file; unsigned long limit; + /* MMF_DUMP_FILTER_* bits, snapshot of mm->flags at dump start. */ unsigned long mm_flags; + /* Snapshot of dumpable at dump start. 
*/ + enum task_dumpable dumpable; int cpu; loff_t written; loff_t pos; diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 2131febebee9..1cf85d772bea 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -691,8 +691,11 @@ coresight_find_output_type(struct coresight_platform_data *pdata, enum coresight_dev_type type, union coresight_dev_subtype subtype); -int coresight_init_driver(const char *drv, struct amba_driver *amba_drv, - struct platform_driver *pdev_drv, struct module *owner); +int coresight_init_driver_with_owner(const char *drv, struct amba_driver *amba_drv, + struct platform_driver *pdev_drv, struct module *owner, + const char *mod_name); +#define coresight_init_driver(drv, amba_drv, pdev_drv) \ + coresight_init_driver_with_owner(drv, amba_drv, pdev_drv, THIS_MODULE, KBUILD_MODNAME) void coresight_remove_driver(struct amba_driver *amba_drv, struct platform_driver *pdev_drv); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 2ab691828e48..ae9d1ce4f49c 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -146,6 +146,9 @@ struct cpufreq_policy { /* Per policy boost supported flag. */ bool boost_supported; + /* Pending policy->min/max update for the driver */ + bool update_limits; + /* Cached frequency lookup from cpufreq_driver_resolve_freq. */ unsigned int cached_target_freq; unsigned int cached_resolved_idx; @@ -434,7 +437,7 @@ struct cpufreq_driver { /* * Set by drivers that need to update internal upper and lower boundaries along * with the target frequency and so the core and governors should also invoke - * the diver if the target frequency does not change, but the policy min or max + * the driver if the target frequency does not change, but the policy min or max * may have changed. 
*/ #define CPUFREQ_NEED_UPDATE_LIMITS BIT(0) diff --git a/include/linux/cpuhplock.h b/include/linux/cpuhplock.h index 286b3ab92e15..42f6a095ba5b 100644 --- a/include/linux/cpuhplock.h +++ b/include/linux/cpuhplock.h @@ -12,9 +12,6 @@ struct device; -extern int lockdep_is_cpus_held(void); -extern int lockdep_is_cpus_write_held(void); - #ifdef CONFIG_HOTPLUG_CPU void cpus_write_lock(void); void cpus_write_unlock(void); @@ -22,6 +19,8 @@ void cpus_read_lock(void); void cpus_read_unlock(void); int cpus_read_trylock(void); void lockdep_assert_cpus_held(void); +int lockdep_is_cpus_held(void); +int lockdep_is_cpus_write_held(void); void cpu_hotplug_disable_offlining(void); void cpu_hotplug_disable(void); void cpu_hotplug_enable(void); @@ -38,6 +37,8 @@ static inline void cpus_read_lock(void) { } static inline void cpus_read_unlock(void) { } static inline int cpus_read_trylock(void) { return true; } static inline void lockdep_assert_cpus_held(void) { } +static inline int lockdep_is_cpus_held(void) { return 1; } +static inline int lockdep_is_cpus_write_held(void) { return 1; } static inline void cpu_hotplug_disable_offlining(void) { } static inline void cpu_hotplug_disable(void) { } static inline void cpu_hotplug_enable(void) { } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 2577c05f84ec..4b1ff99608e0 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -116,10 +116,7 @@ struct dentry { * possible! */ - union { - struct list_head d_lru; /* LRU list */ - wait_queue_head_t *d_wait; /* in-lookup ones only */ - }; + struct list_head d_lru; /* LRU list */ struct hlist_node d_sib; /* child of parent list */ struct hlist_head d_children; /* our children */ /* @@ -210,6 +207,9 @@ enum dentry_flags { DCACHE_REFERENCED = BIT(6), /* Recently used, don't discard. 
*/ DCACHE_DONTCACHE = BIT(7), /* Purge from memory on final dput() */ DCACHE_CANT_MOUNT = BIT(8), + DCACHE_LOOKUP_WAITERS = BIT(9), /* A thread is waiting for + * PAR_LOOKUP to clear + */ DCACHE_SHRINK_LIST = BIT(10), DCACHE_OP_WEAK_REVALIDATE = BIT(11), /* @@ -256,8 +256,7 @@ extern void d_delete(struct dentry *); /* allocate/de-allocate */ extern struct dentry * d_alloc(struct dentry *, const struct qstr *); extern struct dentry * d_alloc_anon(struct super_block *); -extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *, - wait_queue_head_t *); +extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *); extern struct dentry * d_splice_alias(struct inode *, struct dentry *); /* weird procfs mess; *NOT* exported */ extern struct dentry * d_splice_alias_ops(struct inode *, struct dentry *, @@ -281,7 +280,7 @@ extern void d_tmpfile(struct file *, struct inode *); extern struct dentry *d_find_alias(struct inode *); extern void d_prune_aliases(struct inode *); -extern void d_dispose_if_unused(struct dentry *, struct list_head *); +extern bool __move_to_shrink_list(struct dentry *, struct list_head *); extern void shrink_dentry_list(struct list_head *); extern struct dentry *d_find_alias_rcu(struct inode *); @@ -366,6 +365,24 @@ static inline struct dentry *dget(struct dentry *dentry) return dentry; } +/* dentry->d_inode->i_lock must be held by caller */ +static inline bool dget_alias_ilocked(struct dentry *dentry) +{ + if (likely(!(READ_ONCE(dentry->d_flags) & DCACHE_NORCU))) { + lockref_get(&dentry->d_lockref); + return true; + } + // NORCU dentries with zero refcount MUST NOT be grabbed + spin_lock(&dentry->d_lock); + if (dentry->d_lockref.count > 0) { + dget_dlock(dentry); + spin_unlock(&dentry->d_lock); + return true; + } + spin_unlock(&dentry->d_lock); + return false; +} + extern struct dentry *dget_parent(struct dentry *dentry); /** diff --git a/include/linux/delay.h b/include/linux/delay.h index 
46412c00033a..68b2a69dd24d 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -110,7 +110,7 @@ static const unsigned int max_slack_shift = 2; * fsleep - flexible sleep which autoselects the best mechanism * @usecs: requested sleep duration in microseconds * - * flseep() selects the best mechanism that will provide maximum 25% slack + * fsleep() selects the best mechanism that will provide maximum 25% slack * to the requested sleep duration. Therefore it uses: * * * udelay() loop for sleep durations <= 10 microseconds to avoid hrtimer diff --git a/include/linux/device.h b/include/linux/device.h index 9c8fde6a3d86..7b2baffdd2f5 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -38,7 +38,6 @@ struct device_private; struct device_driver; struct driver_private; struct module; -struct class; struct subsys_private; struct device_node; struct fwnode_handle; @@ -104,10 +103,18 @@ struct device_type { */ struct device_attribute { struct attribute attr; - ssize_t (*show)(struct device *dev, struct device_attribute *attr, - char *buf); - ssize_t (*store)(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count); + __SYSFS_FUNCTION_ALTERNATIVE( + ssize_t (*show)(struct device *dev, struct device_attribute *attr, + char *buf); + ssize_t (*show_const)(struct device *dev, const struct device_attribute *attr, + char *buf); + ); + __SYSFS_FUNCTION_ALTERNATIVE( + ssize_t (*store)(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count); + ssize_t (*store_const)(struct device *dev, const struct device_attribute *attr, + const char *buf, size_t count); + ); }; /** @@ -135,6 +142,77 @@ ssize_t device_store_bool(struct device *dev, struct device_attribute *attr, ssize_t device_show_string(struct device *dev, struct device_attribute *attr, char *buf); +typedef ssize_t __device_show_handler_const(struct device *dev, const struct device_attribute *attr, + char *buf); +typedef ssize_t 
__device_store_handler_const(struct device *dev, const struct device_attribute *attr, + const char *buf, size_t count); + +#ifdef CONFIG_CFI + +#define __DEVICE_ATTR_SHOW_STORE(_show, _store) \ + .show = _Generic(_show, \ + __device_show_handler_const * : NULL, \ + default : _show \ + ), \ + .show_const = _Generic(_show, \ + __device_show_handler_const * : _show, \ + default : NULL \ + ), \ + .store = _Generic(_store, \ + __device_store_handler_const * : NULL, \ + default : _store \ + ), \ + .store_const = _Generic(_store, \ + __device_store_handler_const * : _store, \ + default : NULL \ + ), + +#else + +#define __DEVICE_ATTR_SHOW_STORE(_show, _store) \ + .show = _Generic(_show, \ + __device_show_handler_const * : (void *)_show, \ + default : _show \ + ), \ + .store = _Generic(_store, \ + __device_store_handler_const * : (void *)_store, \ + default : _store \ + ), \ + +#endif + + +#define __DEVICE_ATTR(_name, _mode, _show, _store) { \ + .attr = {.name = __stringify(_name), \ + .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ + __DEVICE_ATTR_SHOW_STORE(_show, _store) \ +} + +#define __DEVICE_ATTR_RO_MODE(_name, _mode) \ + __DEVICE_ATTR(_name, _mode, _name##_show, NULL) + +#define __DEVICE_ATTR_RO(_name) \ + __DEVICE_ATTR_RO_MODE(_name, 0444) + +#define __DEVICE_ATTR_WO(_name) \ + __DEVICE_ATTR(_name, 0200, NULL, _name##_store) + +#define __DEVICE_ATTR_RW_MODE(_name, _mode) \ + __DEVICE_ATTR(_name, _mode, _name##_show, _name##_store) + +#define __DEVICE_ATTR_RW(_name) \ + __DEVICE_ATTR_RW_MODE(_name, 0644) + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +#define __DEVICE_ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) { \ + .attr = {.name = __stringify(_name), .mode = _mode, \ + .ignore_lockdep = true }, \ + __DEVICE_ATTR_SHOW_STORE(_show, _store) \ +} +#else +#define __DEVICE_ATTR_IGNORE_LOCKDEP __DEVICE_ATTR +#endif + /** * DEVICE_ATTR - Define a device attribute. * @_name: Attribute name. 
@@ -155,20 +233,7 @@ ssize_t device_show_string(struct device *dev, struct device_attribute *attr, * }; */ #define DEVICE_ATTR(_name, _mode, _show, _store) \ - struct device_attribute dev_attr_##_name = __ATTR(_name, _mode, _show, _store) - -/** - * DEVICE_ATTR_PREALLOC - Define a preallocated device attribute. - * @_name: Attribute name. - * @_mode: File mode. - * @_show: Show handler. Optional, but mandatory if attribute is readable. - * @_store: Store handler. Optional, but mandatory if attribute is writable. - * - * Like DEVICE_ATTR(), but ``SYSFS_PREALLOC`` is set on @_mode. - */ -#define DEVICE_ATTR_PREALLOC(_name, _mode, _show, _store) \ - struct device_attribute dev_attr_##_name = \ - __ATTR_PREALLOC(_name, _mode, _show, _store) + struct device_attribute dev_attr_##_name = __DEVICE_ATTR(_name, _mode, _show, _store) /** * DEVICE_ATTR_RW - Define a read-write device attribute. @@ -178,7 +243,7 @@ ssize_t device_show_string(struct device *dev, struct device_attribute *attr, * and @_store is <_name>_store. */ #define DEVICE_ATTR_RW(_name) \ - struct device_attribute dev_attr_##_name = __ATTR_RW(_name) + struct device_attribute dev_attr_##_name = __DEVICE_ATTR_RW(_name) /** * DEVICE_ATTR_ADMIN_RW - Define an admin-only read-write device attribute. @@ -187,7 +252,7 @@ ssize_t device_show_string(struct device *dev, struct device_attribute *attr, * Like DEVICE_ATTR_RW(), but @_mode is 0600. 
*/ #define DEVICE_ATTR_ADMIN_RW(_name) \ - struct device_attribute dev_attr_##_name = __ATTR_RW_MODE(_name, 0600) + struct device_attribute dev_attr_##_name = __DEVICE_ATTR_RW_MODE(_name, 0600) /** * DEVICE_ATTR_RW_NAMED - Define a read-write device attribute with a sysfs name @@ -201,8 +266,7 @@ ssize_t device_show_string(struct device *dev, struct device_attribute *attr, #define DEVICE_ATTR_RW_NAMED(_name, _attrname) \ struct device_attribute dev_attr_##_name = { \ .attr = { .name = _attrname, .mode = 0644 }, \ - .show = _name##_show, \ - .store = _name##_store, \ + __DEVICE_ATTR_SHOW_STORE(_name##_show, _name##_store) \ } /** @@ -212,7 +276,7 @@ ssize_t device_show_string(struct device *dev, struct device_attribute *attr, * Like DEVICE_ATTR(), but @_mode is 0444 and @_show is <_name>_show. */ #define DEVICE_ATTR_RO(_name) \ - struct device_attribute dev_attr_##_name = __ATTR_RO(_name) + struct device_attribute dev_attr_##_name = __DEVICE_ATTR_RO(_name) /** * DEVICE_ATTR_ADMIN_RO - Define an admin-only readable device attribute. @@ -221,7 +285,7 @@ ssize_t device_show_string(struct device *dev, struct device_attribute *attr, * Like DEVICE_ATTR_RO(), but @_mode is 0400. */ #define DEVICE_ATTR_ADMIN_RO(_name) \ - struct device_attribute dev_attr_##_name = __ATTR_RO_MODE(_name, 0400) + struct device_attribute dev_attr_##_name = __DEVICE_ATTR_RO_MODE(_name, 0400) /** * DEVICE_ATTR_RO_NAMED - Define a read-only device attribute with a sysfs name @@ -235,7 +299,7 @@ ssize_t device_show_string(struct device *dev, struct device_attribute *attr, #define DEVICE_ATTR_RO_NAMED(_name, _attrname) \ struct device_attribute dev_attr_##_name = { \ .attr = { .name = _attrname, .mode = 0444 }, \ - .show = _name##_show, \ + __DEVICE_ATTR_SHOW_STORE(_name##_show, NULL) \ } /** @@ -245,7 +309,7 @@ ssize_t device_show_string(struct device *dev, struct device_attribute *attr, * Like DEVICE_ATTR(), but @_mode is 0200 and @_store is <_name>_store. 
*/ #define DEVICE_ATTR_WO(_name) \ - struct device_attribute dev_attr_##_name = __ATTR_WO(_name) + struct device_attribute dev_attr_##_name = __DEVICE_ATTR_WO(_name) /** * DEVICE_ATTR_WO_NAMED - Define a read-only device attribute with a sysfs name @@ -259,7 +323,7 @@ ssize_t device_show_string(struct device *dev, struct device_attribute *attr, #define DEVICE_ATTR_WO_NAMED(_name, _attrname) \ struct device_attribute dev_attr_##_name = { \ .attr = { .name = _attrname, .mode = 0200 }, \ - .store = _name##_store, \ + __DEVICE_ATTR_SHOW_STORE(NULL, _name##_store) \ } /** @@ -273,7 +337,7 @@ ssize_t device_show_string(struct device *dev, struct device_attribute *attr, */ #define DEVICE_ULONG_ATTR(_name, _mode, _var) \ struct dev_ext_attribute dev_attr_##_name = \ - { __ATTR(_name, _mode, device_show_ulong, device_store_ulong), &(_var) } + { __DEVICE_ATTR(_name, _mode, device_show_ulong, device_store_ulong), &(_var) } /** * DEVICE_INT_ATTR - Define a device attribute backed by an int. @@ -285,7 +349,7 @@ ssize_t device_show_string(struct device *dev, struct device_attribute *attr, */ #define DEVICE_INT_ATTR(_name, _mode, _var) \ struct dev_ext_attribute dev_attr_##_name = \ - { __ATTR(_name, _mode, device_show_int, device_store_int), &(_var) } + { __DEVICE_ATTR(_name, _mode, device_show_int, device_store_int), &(_var) } /** * DEVICE_BOOL_ATTR - Define a device attribute backed by a bool. @@ -297,7 +361,7 @@ ssize_t device_show_string(struct device *dev, struct device_attribute *attr, */ #define DEVICE_BOOL_ATTR(_name, _mode, _var) \ struct dev_ext_attribute dev_attr_##_name = \ - { __ATTR(_name, _mode, device_show_bool, device_store_bool), &(_var) } + { __DEVICE_ATTR(_name, _mode, device_show_bool, device_store_bool), &(_var) } /** * DEVICE_STRING_ATTR_RO - Define a device attribute backed by a r/o string. 
@@ -310,11 +374,11 @@ ssize_t device_show_string(struct device *dev, struct device_attribute *attr, */ #define DEVICE_STRING_ATTR_RO(_name, _mode, _var) \ struct dev_ext_attribute dev_attr_##_name = \ - { __ATTR(_name, (_mode) & ~0222, device_show_string, NULL), (_var) } + { __DEVICE_ATTR(_name, (_mode) & ~0222, device_show_string, NULL), (_var) } #define DEVICE_ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) \ struct device_attribute dev_attr_##_name = \ - __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) + __DEVICE_ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) int device_create_file(struct device *device, const struct device_attribute *entry); @@ -512,10 +576,40 @@ struct device_physical_location { * * @DEV_FLAG_READY_TO_PROBE: If set then device_add() has finished enough * initialization that probe could be called. + * @DEV_FLAG_CAN_MATCH: The device has matched with a driver at least once or it + * is in a bus (like AMBA) which can't check for matching drivers + * until other devices probe successfully. + * @DEV_FLAG_DMA_IOMMU: Device is using default IOMMU implementation for DMA and + * doesn't rely on dma_ops structure. + * @DEV_FLAG_DMA_SKIP_SYNC: DMA sync operations can be skipped for coherent + * buffers. + * @DEV_FLAG_DMA_OPS_BYPASS: If set then the dma_ops are bypassed for the + * streaming DMA operations (->map_* / ->unmap_* / ->sync_*), and + * optionally (if the coherent mask is large enough) also for dma + * allocations. This flag is managed by the dma ops instance from + * ->dma_supported. + * @DEV_FLAG_STATE_SYNCED: The hardware state of this device has been synced to + * match the software state of this device by calling the + * driver/bus sync_state() callback. + * @DEV_FLAG_DMA_COHERENT: This particular device is dma coherent, even if the + * architecture supports non-coherent devices. + * @DEV_FLAG_OF_NODE_REUSED: Set if the device-tree node is shared with an + * ancestor device.
+ * @DEV_FLAG_OFFLINE_DISABLED: If set, the device is permanently online. + * @DEV_FLAG_OFFLINE: Set after successful invocation of bus type's .offline(). * @DEV_FLAG_COUNT: Number of defined struct_device_flags. */ enum struct_device_flags { DEV_FLAG_READY_TO_PROBE = 0, + DEV_FLAG_CAN_MATCH = 1, + DEV_FLAG_DMA_IOMMU = 2, + DEV_FLAG_DMA_SKIP_SYNC = 3, + DEV_FLAG_DMA_OPS_BYPASS = 4, + DEV_FLAG_STATE_SYNCED = 5, + DEV_FLAG_DMA_COHERENT = 6, + DEV_FLAG_OF_NODE_REUSED = 7, + DEV_FLAG_OFFLINE_DISABLED = 8, + DEV_FLAG_OFFLINE = 9, DEV_FLAG_COUNT }; @@ -594,27 +688,6 @@ enum struct_device_flags { * @removable: Whether the device can be removed from the system. This * should be set by the subsystem / bus driver that discovered * the device. - * - * @offline_disabled: If set, the device is permanently online. - * @offline: Set after successful invocation of bus type's .offline(). - * @of_node_reused: Set if the device-tree node is shared with an ancestor - * device. - * @state_synced: The hardware state of this device has been synced to match - * the software state of this device by calling the driver/bus - * sync_state() callback. - * @can_match: The device has matched with a driver at least once or it is in - * a bus (like AMBA) which can't check for matching drivers until - * other devices probe successfully. - * @dma_coherent: this particular device is dma coherent, even if the - * architecture supports non-coherent devices. - * @dma_ops_bypass: If set to %true then the dma_ops are bypassed for the - * streaming DMA operations (->map_* / ->unmap_* / ->sync_*), - * and optionall (if the coherent mask is large enough) also - * for dma allocations. This flag is managed by the dma ops - * instance from ->dma_supported. - * @dma_skip_sync: DMA sync operations can be skipped for coherent buffers. - * @dma_iommu: Device is using default IOMMU implementation for DMA and - * doesn't rely on dma_ops structure. * @flags: DEV_FLAG_XXX flags. 
Use atomic bitfield operations to modify. * * At the lowest level, every device in a Linux system is represented by an @@ -719,26 +792,6 @@ struct device { enum device_removable removable; - bool offline_disabled:1; - bool offline:1; - bool of_node_reused:1; - bool state_synced:1; - bool can_match:1; -#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ - defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ - defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) - bool dma_coherent:1; -#endif -#ifdef CONFIG_DMA_OPS_BYPASS - bool dma_ops_bypass : 1; -#endif -#ifdef CONFIG_DMA_NEED_SYNC - bool dma_skip_sync:1; -#endif -#ifdef CONFIG_IOMMU_DMA - bool dma_iommu:1; -#endif - DECLARE_BITMAP(flags, DEV_FLAG_COUNT); }; @@ -765,6 +818,15 @@ static inline bool dev_test_and_set_##accessor_name(struct device *dev) \ } __create_dev_flag_accessors(ready_to_probe, DEV_FLAG_READY_TO_PROBE); +__create_dev_flag_accessors(can_match, DEV_FLAG_CAN_MATCH); +__create_dev_flag_accessors(dma_iommu, DEV_FLAG_DMA_IOMMU); +__create_dev_flag_accessors(dma_skip_sync, DEV_FLAG_DMA_SKIP_SYNC); +__create_dev_flag_accessors(dma_ops_bypass, DEV_FLAG_DMA_OPS_BYPASS); +__create_dev_flag_accessors(state_synced, DEV_FLAG_STATE_SYNCED); +__create_dev_flag_accessors(dma_coherent, DEV_FLAG_DMA_COHERENT); +__create_dev_flag_accessors(of_node_reused, DEV_FLAG_OF_NODE_REUSED); +__create_dev_flag_accessors(offline_disabled, DEV_FLAG_OFFLINE_DISABLED); +__create_dev_flag_accessors(offline, DEV_FLAG_OFFLINE); #undef __create_dev_flag_accessors @@ -1063,17 +1125,6 @@ static inline void device_lock_assert(struct device *dev) lockdep_assert_held(&dev->mutex); } -static inline bool dev_has_sync_state(struct device *dev) -{ - if (!dev) - return false; - if (dev->driver && dev->driver->sync_state) - return true; - if (dev->bus && dev->bus->sync_state) - return true; - return false; -} - static inline int dev_set_drv_sync_state(struct device *dev, void (*fn)(struct device *dev)) { diff --git a/include/linux/device/bus.h 
b/include/linux/device/bus.h index c1b463cd6464..a38f7229b8f4 100644 --- a/include/linux/device/bus.h +++ b/include/linux/device/bus.h @@ -83,9 +83,9 @@ struct fwnode_handle; struct bus_type { const char *name; const char *dev_name; - const struct attribute_group **bus_groups; - const struct attribute_group **dev_groups; - const struct attribute_group **drv_groups; + const struct attribute_group *const *bus_groups; + const struct attribute_group *const *dev_groups; + const struct attribute_group *const *drv_groups; int (*match)(struct device *dev, const struct device_driver *drv); int (*uevent)(const struct device *dev, struct kobj_uevent_env *env); diff --git a/include/linux/device/class.h b/include/linux/device/class.h index 78ab8d2b3e30..9db1d61ba743 100644 --- a/include/linux/device/class.h +++ b/include/linux/device/class.h @@ -34,7 +34,7 @@ struct fwnode_handle; * @class_release: Called to release this class. * @dev_release: Called to release the device. * @shutdown_pre: Called at shut-down time before driver shutdown. - * @ns_type: Callbacks so sysfs can detemine namespaces. + * @ns_type: Callbacks so sysfs can determine namespaces. * @namespace: Namespace of the device belongs to this class. * @get_ownership: Allows class to specify uid/gid of the sysfs directories * for the devices belonging to the class. 
Usually tied to diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h index bbc67ec513ed..38048e74d10a 100644 --- a/include/linux/device/driver.h +++ b/include/linux/device/driver.h @@ -114,8 +114,8 @@ struct device_driver { void (*shutdown) (struct device *dev); int (*suspend) (struct device *dev, pm_message_t state); int (*resume) (struct device *dev); - const struct attribute_group **groups; - const struct attribute_group **dev_groups; + const struct attribute_group *const *groups; + const struct attribute_group *const *dev_groups; const struct dev_pm_ops *pm; void (*coredump) (struct device *dev); @@ -123,8 +123,8 @@ struct device_driver { struct driver_private *p; struct { /* - * Called after remove() and after all devres entries have been - * processed. This is a Rust only callback. + * Called after remove() but before devres entries are released. + * This is a Rust only callback. */ void (*post_unbind_rust)(struct device *dev); } p_cb; @@ -160,8 +160,6 @@ int __must_check driver_create_file(const struct device_driver *driver, void driver_remove_file(const struct device_driver *driver, const struct driver_attribute *attr); -int driver_set_override(struct device *dev, const char **override, - const char *s, size_t len); int __must_check driver_for_each_device(struct device_driver *drv, struct device *start, void *data, device_iter_t fn); struct device *driver_find_device(const struct device_driver *drv, diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 6a1832a73cad..bcb5b5428aea 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -225,7 +225,7 @@ int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start, extern bool dma_default_coherent; static inline bool dev_is_dma_coherent(struct device *dev) { - return dev->dma_coherent; + return dev_dma_coherent(dev); } #else #define dma_default_coherent true @@ -240,8 +240,8 @@ static inline void dma_reset_need_sync(struct device *dev) { 
#ifdef CONFIG_DMA_NEED_SYNC /* Reset it only once so that the function can be called on hotpath */ - if (unlikely(dev->dma_skip_sync)) - dev->dma_skip_sync = false; + if (unlikely(dev_dma_skip_sync(dev))) + dev_clear_dma_skip_sync(dev); #endif } diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index db8ab24a54f4..cc0823a99cfd 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -429,7 +429,7 @@ bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr); static inline bool dma_dev_need_sync(const struct device *dev) { /* Always call DMA sync operations when debugging is enabled */ - return !dev->dma_skip_sync || IS_ENABLED(CONFIG_DMA_API_DEBUG); + return !dev_dma_skip_sync(dev) || IS_ENABLED(CONFIG_DMA_API_DEBUG); } static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index 728fb5dee5ed..de1c738aa8ad 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -61,8 +61,16 @@ static inline void eventpoll_release(struct file *file) eventpoll_release_file(file); } +struct epoll_key { + struct file *file; + int fd; +} __packed; + +int do_epoll_ctl_file(struct file *f, int op, struct epoll_key *tf, + struct epoll_event *epds, bool nonblock); int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, bool nonblock); +bool is_file_epoll(struct file *f); /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ static inline int ep_op_has_event(int op) diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 8bcdba28b406..c835bc64f4fa 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -6,9 +6,8 @@ #include <linux/path.h> struct dentry; -struct iattr; +struct exportfs_block_ops; struct inode; -struct iomap; struct super_block; struct vfsmount; @@ -260,19 +259,13 @@ struct handle_to_path_ctx { * @commit_metadata: * @commit_metadata should commit metadata 
changes to stable storage. * - * @get_uuid: - * Get a filesystem unique signature exposed to clients. - * - * @map_blocks: - * Map and, if necessary, allocate blocks for a layout. - * - * @commit_blocks: - * Commit blocks in a layout once the client is done with them. - * * @flags: * Allows the filesystem to communicate to nfsd that it may want to do things * differently when dealing with it. * + * @block_ops: + * Operations for layout grants to block on the underlying device. + * * Locking rules: * get_parent is called with child->d_inode->i_rwsem down * get_name is not (which is possibly inconsistent) @@ -290,12 +283,6 @@ struct export_operations { struct dentry * (*get_parent)(struct dentry *child); int (*commit_metadata)(struct inode *inode); - int (*get_uuid)(struct super_block *sb, u8 *buf, u32 *len, u64 *offset); - int (*map_blocks)(struct inode *inode, loff_t offset, - u64 len, struct iomap *iomap, - bool write, u32 *device_generation); - int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, - int nr_iomaps, struct iattr *iattr); int (*permission)(struct handle_to_path_ctx *ctx, unsigned int oflags); struct file * (*open)(const struct path *path, unsigned int oflags); #define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */ @@ -308,6 +295,10 @@ struct export_operations { #define EXPORT_OP_FLUSH_ON_CLOSE (0x20) /* fs flushes file data on close */ #define EXPORT_OP_NOLOCKS (0x40) /* no file locking support */ unsigned long flags; + +#ifdef CONFIG_EXPORTFS_BLOCK_OPS + const struct exportfs_block_ops *block_ops; +#endif }; /** diff --git a/include/linux/exportfs_block.h b/include/linux/exportfs_block.h new file mode 100644 index 000000000000..de519b7b599b --- /dev/null +++ b/include/linux/exportfs_block.h @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2014-2026 Christoph Hellwig. + * + * Support for exportfs-based layout grants for direct block device access. 
+ */ +#ifndef LINUX_EXPORTFS_BLOCK_H +#define LINUX_EXPORTFS_BLOCK_H 1 + +#include <linux/blkdev.h> +#include <linux/exportfs.h> +#include <linux/fs.h> + +struct inode; +struct iomap; +struct super_block; + +/* + * There are two types of block-style layout support: + * - In-band implies a device identified by a unique cookie inside the actual + * device address space checked by the ->get_uuid method as used by the pNFS + * block layout. This is a bit dangerous and deprecated. + * - Out of band implies identification by out of band unique identifiers + * specified by the storage protocol, which is much safer and used by the + * pNFS SCSI/NVMe layouts. + */ +typedef unsigned int __bitwise expfs_block_layouts_t; +#define EXPFS_BLOCK_FLAG(__bit) \ + ((__force expfs_block_layouts_t)(1u << __bit)) +#define EXPFS_BLOCK_IN_BAND_ID EXPFS_BLOCK_FLAG(0) +#define EXPFS_BLOCK_OUT_OF_BAND_ID EXPFS_BLOCK_FLAG(1) + +struct exportfs_block_ops { + /* + * Returns the EXPFS_BLOCK_* bitmap of supported layout types. + */ + expfs_block_layouts_t (*layouts_supported)(struct super_block *sb); + + /* + * Get the in-band device unique signature exposed to clients. + */ + int (*get_uuid)(struct super_block *sb, u8 *buf, u32 *len, u64 *offset); + + /* + * Map blocks for direct block access. + * If @write is %true, also allocate the blocks for the range if needed. + */ + int (*map_blocks)(struct inode *inode, loff_t offset, u64 len, + struct iomap *iomap, bool write, + u32 *device_generation); + + /* + * Commit blocks previously handed out by ->map_blocks and written to by + * the client.
+ */ + int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, + int nr_iomaps, loff_t new_size); +}; + +static inline bool +exportfs_bdev_supports_out_of_band_id(struct block_device *bdev) +{ + return bdev->bd_disk->fops->pr_ops && + bdev->bd_disk->fops->get_unique_id; +} + +#ifdef CONFIG_EXPORTFS_BLOCK_OPS +static inline expfs_block_layouts_t +exportfs_layouts_supported(struct super_block *sb) +{ + const struct exportfs_block_ops *bops = sb->s_export_op->block_ops; + + if (!bops || + !bops->layouts_supported || + WARN_ON_ONCE(!bops->map_blocks) || + WARN_ON_ONCE(!bops->commit_blocks)) + return 0; + return bops->layouts_supported(sb); +} +#else +static inline expfs_block_layouts_t +exportfs_layouts_supported(struct super_block *sb) +{ + return 0; +} +#endif /* CONFIG_EXPORTFS_BLOCK_OPS */ + +#endif /* LINUX_EXPORTFS_BLOCK_H */ diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h index a332e79b3207..6ad6b9e7a226 100644 --- a/include/linux/fcntl.h +++ b/include/linux/fcntl.h @@ -4,13 +4,31 @@ #include <linux/stat.h> #include <uapi/linux/fcntl.h> +#include <uapi/linux/openat2.h> /* List of all valid flags for the open/openat flags argument: */ #define VALID_OPEN_FLAGS \ (O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | \ O_APPEND | O_NDELAY | O_NONBLOCK | __O_SYNC | O_DSYNC | \ FASYNC | O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | \ - O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE) + O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE | O_EMPTYPATH) + +/* List of all valid flags for openat2(2)'s how->flags argument. */ +#define VALID_OPENAT2_FLAGS (VALID_OPEN_FLAGS | OPENAT2_REGULAR) + +/* + * Kernel-internal carrier for OPENAT2_REGULAR. The UAPI bit lives in the + * upper 32 bits of open_how::flags so open()/openat() cannot encode it. + * build_open_flags() translates it to this internal flag, which then + * propagates through op->open_flag and f->f_flags exactly like __FMODE_EXEC. 
+ * do_dentry_open() strips it so userspace cannot observe it via + * fcntl(F_GETFL). + * + * Bit 30 is not claimed by any O_* flag on any architecture and stays clear + * of the sign bit of the int op->open_flag. fcntl_init() enforces that it + * never aliases an open-flag bit. + */ +#define __O_REGULAR (1 << 30) /* List of all valid flags for the how->resolve argument: */ #define VALID_RESOLVE_FLAGS \ diff --git a/include/linux/fileattr.h b/include/linux/fileattr.h index 3780904a63a6..58044b598016 100644 --- a/include/linux/fileattr.h +++ b/include/linux/fileattr.h @@ -16,7 +16,8 @@ /* Read-only inode flags */ #define FS_XFLAG_RDONLY_MASK \ - (FS_XFLAG_PREALLOC | FS_XFLAG_HASATTR | FS_XFLAG_VERITY) + (FS_XFLAG_PREALLOC | FS_XFLAG_HASATTR | FS_XFLAG_VERITY | \ + FS_XFLAG_CASEFOLD | FS_XFLAG_CASENONPRESERVING) /* Flags to indicate valid value of fsx_ fields */ #define FS_XFLAG_VALUES_MASK \ diff --git a/include/linux/filelock.h b/include/linux/filelock.h index 5f0a2fb31450..ec11cc6b4c58 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -4,19 +4,22 @@ #include <linux/fs.h> -#define FL_POSIX 1 -#define FL_FLOCK 2 -#define FL_DELEG 4 /* NFSv4 delegation */ -#define FL_ACCESS 8 /* not trying to lock, just looking */ -#define FL_EXISTS 16 /* when unlocking, test for existence */ -#define FL_LEASE 32 /* lease held on this file */ -#define FL_CLOSE 64 /* unlock on close */ -#define FL_SLEEP 128 /* A blocking lock */ -#define FL_DOWNGRADE_PENDING 256 /* Lease is being downgraded */ -#define FL_UNLOCK_PENDING 512 /* Lease is being broken */ -#define FL_OFDLCK 1024 /* lock is "owned" by struct file */ -#define FL_LAYOUT 2048 /* outstanding pNFS layout */ -#define FL_RECLAIM 4096 /* reclaiming from a reboot server */ +#define FL_POSIX BIT(0) /* POSIX lock */ +#define FL_FLOCK BIT(1) /* BSD lock */ +#define FL_DELEG BIT(2) /* NFSv4 delegation */ +#define FL_ACCESS BIT(3) /* not trying to lock, just looking */ +#define FL_EXISTS BIT(4) /* when unlocking, 
test for existence */ +#define FL_LEASE BIT(5) /* file lease */ +#define FL_CLOSE BIT(6) /* unlock on close */ +#define FL_SLEEP BIT(7) /* A blocking lock */ +#define FL_DOWNGRADE_PENDING BIT(8) /* Lease is being downgraded */ +#define FL_UNLOCK_PENDING BIT(9) /* Lease is being broken */ +#define FL_OFDLCK BIT(10) /* POSIX lock "owned" by struct file */ +#define FL_LAYOUT BIT(11) /* outstanding pNFS layout */ +#define FL_RECLAIM BIT(12) /* reclaiming from a reboot server */ +#define FL_IGN_DIR_CREATE BIT(13) /* ignore DIR_CREATE events */ +#define FL_IGN_DIR_DELETE BIT(14) /* ignore DIR_DELETE events */ +#define FL_IGN_DIR_RENAME BIT(15) /* ignore DIR_RENAME events */ #define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE) @@ -26,6 +29,15 @@ */ #define FILE_LOCK_DEFERRED 1 +#define LEASE_BREAK_LEASE BIT(0) // break leases and delegations +#define LEASE_BREAK_DELEG BIT(1) // break delegations only +#define LEASE_BREAK_LAYOUT BIT(2) // break layouts only +#define LEASE_BREAK_NONBLOCK BIT(3) // non-blocking break +#define LEASE_BREAK_OPEN_RDONLY BIT(4) // readonly open event +#define LEASE_BREAK_DIR_CREATE BIT(5) // dir deleg create event +#define LEASE_BREAK_DIR_DELETE BIT(6) // dir deleg delete event +#define LEASE_BREAK_DIR_RENAME BIT(7) // dir deleg rename event + struct file_lock; struct file_lease; @@ -216,19 +228,13 @@ int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl); void locks_init_lease(struct file_lease *); void locks_free_lease(struct file_lease *fl); struct file_lease *locks_alloc_lease(void); - -#define LEASE_BREAK_LEASE BIT(0) // break leases and delegations -#define LEASE_BREAK_DELEG BIT(1) // break delegations only -#define LEASE_BREAK_LAYOUT BIT(2) // break layouts only -#define LEASE_BREAK_NONBLOCK BIT(3) // non-blocking break -#define LEASE_BREAK_OPEN_RDONLY BIT(4) // readonly open event - int __break_lease(struct inode *inode, unsigned int flags); void lease_get_mtime(struct inode *, struct timespec64 *time); int generic_setlease(struct 
file *, int, struct file_lease **, void **priv); int kernel_setlease(struct file *, int, struct file_lease **, void **); int vfs_setlease(struct file *, int, struct file_lease **, void **); int lease_modify(struct file_lease *, int, struct list_head *); +u32 inode_lease_ignore_mask(struct inode *inode); struct notifier_block; int lease_register_notifier(struct notifier_block *); @@ -516,12 +522,26 @@ static inline bool is_delegated(struct delegated_inode *di) return di->di_inode; } -static inline int try_break_deleg(struct inode *inode, +/** + * try_break_deleg - do a non-blocking delegation break + * @inode: inode that should have its delegations broken + * @flags: extra LEASE_BREAK_* flags to pass to break_deleg() + * @di: returns pointer to delegated inode (may be NULL) + * + * Break delegations in a non-blocking fashion. If there are + * outstanding delegations and @di is set, then an extra reference + * will be taken on @inode and @di->di_inode will be populated so + * that it may be waited upon. + * + * Returns 0 if there is no need to wait or an error. If -EWOULDBLOCK + * is returned, then @di will be populated (if non-NULL). 
+ */ +static inline int try_break_deleg(struct inode *inode, unsigned int flags, struct delegated_inode *di) { int ret; - ret = break_deleg(inode, LEASE_BREAK_NONBLOCK); + ret = break_deleg(inode, flags | LEASE_BREAK_NONBLOCK); if (ret == -EWOULDBLOCK && di) { di->di_inode = inode; ihold(inode); @@ -564,7 +584,7 @@ static inline bool is_delegated(struct delegated_inode *di) return false; } -static inline int break_lease(struct inode *inode, bool wait) +static inline int break_lease(struct inode *inode, unsigned int mode) { return 0; } @@ -574,7 +594,7 @@ static inline int break_deleg(struct inode *inode, unsigned int flags) return 0; } -static inline int try_break_deleg(struct inode *inode, +static inline int try_break_deleg(struct inode *inode, unsigned int flags, struct delegated_inode *delegated_inode) { return 0; diff --git a/include/linux/firmware/meson/meson_sm.h b/include/linux/firmware/meson/meson_sm.h index 8eaf8922ab02..3ebc2bd9a976 100644 --- a/include/linux/firmware/meson/meson_sm.h +++ b/include/linux/firmware/meson/meson_sm.h @@ -12,6 +12,7 @@ enum { SM_EFUSE_WRITE, SM_EFUSE_USER_MAX, SM_GET_CHIP_ID, + SM_THERMAL_CALIB_READ, SM_A1_PWRC_SET, SM_A1_PWRC_GET, }; @@ -27,5 +28,7 @@ int meson_sm_call_read(struct meson_sm_firmware *fw, void *buffer, unsigned int bsize, unsigned int cmd_index, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4); struct meson_sm_firmware *meson_sm_get(struct device_node *firmware_node); +int meson_sm_get_thermal_calib(struct meson_sm_firmware *fw, u32 *trim_info, + u32 tsensor_id); #endif /* _MESON_SM_FW_H_ */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 11559c513dfb..6da44573ce45 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2218,8 +2218,21 @@ static inline void mark_inode_dirty_sync(struct inode *inode) __mark_inode_dirty(inode, I_DIRTY_SYNC); } +/* + * returns the refcount on the inode. it can change arbitrarily. 
+ */ +static inline int icount_read_once(const struct inode *inode) +{ + return atomic_read(&inode->i_count); +} + +/* + * returns the refcount on the inode. The lock guarantees no 0->1 or 1->0 transitions + * of the count are going to take place, otherwise it changes arbitrarily. + */ static inline int icount_read(const struct inode *inode) { + lockdep_assert_held(&inode->i_lock); return atomic_read(&inode->i_count); } @@ -2281,12 +2294,14 @@ struct file_system_type { #define FS_MGTIME 64 /* FS uses multigrain timestamps */ #define FS_LBS 128 /* FS supports LBS */ #define FS_POWER_FREEZE 256 /* Always freeze on suspend/hibernate */ +#define FS_USERNS_MOUNT_RESTRICTED 512 /* Restrict mount in userns if not already visible */ +#define FS_USERNS_DELEGATABLE 1024 /* Can be mounted inside userns from outside */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; void (*kill_sb) (struct super_block *); struct module *owner; - struct file_system_type * next; + struct hlist_node list; struct hlist_head fs_supers; struct lock_class_key s_lock_key; @@ -2327,10 +2342,6 @@ void free_anon_bdev(dev_t); struct super_block *sget_fc(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*set)(struct super_block *, struct fs_context *)); -struct super_block *sget(struct file_system_type *type, - int (*test)(struct super_block *,void *), - int (*set)(struct super_block *,void *), - int flags, void *data); struct super_block *sget_dev(struct fs_context *fc, dev_t dev); /* Alas, no aliases. 
Too much hassle with bringing module.h everywhere */ @@ -2624,6 +2635,7 @@ extern int __must_check file_write_and_wait_range(struct file *file, loff_t start, loff_t end); int filemap_flush_range(struct address_space *mapping, loff_t start, loff_t end); +void filemap_dontcache_kick_writeback(struct address_space *mapping); static inline int file_write_and_wait(struct file *file) { @@ -2657,10 +2669,7 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) if (ret) return ret; } else if (iocb->ki_flags & IOCB_DONTCACHE) { - struct address_space *mapping = iocb->ki_filp->f_mapping; - - filemap_flush_range(mapping, iocb->ki_pos - count, - iocb->ki_pos - 1); + filemap_dontcache_kick_writeback(iocb->ki_filp->f_mapping); } return count; diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h index 383050e7fdf5..ef7941e9dc79 100644 --- a/include/linux/fs/super_types.h +++ b/include/linux/fs/super_types.h @@ -162,7 +162,8 @@ struct super_block { struct unicode_map *s_encoding; __u16 s_encoding_flags; #endif - struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ + struct hlist_head s_roots; /* alternate root dentries for NFS */ + spinlock_t s_roots_lock; struct mount *s_mounts; /* list of mounts; _not_ for fs use */ struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */ struct file *s_bdev_file; @@ -274,6 +275,14 @@ struct super_block { /* number of fserrors that are being sent to fsnotify/filesystems */ refcount_t s_pending_errors; + +#ifdef CONFIG_CGROUP_WRITEBACK + /* + * Number of in-flight inode wb switches for this sb. Drained by + * cgroup_writeback_umount() before tear-down. 
+ */ + atomic_t s_isw_nr_in_flight; +#endif } __randomize_layout; /* @@ -326,7 +335,7 @@ struct super_block { #define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */ /* sb->s_iflags to limit user namespace mounts */ -#define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ +#define SB_I_RESTRICTED_VARIANT 0x00000010 #define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 #define SB_I_UNTRUSTED_MOUNTER 0x00000040 #define SB_I_EVM_HMAC_UNSUPPORTED 0x00000080 diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 079c18bcdbde..bda798bc67bc 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -257,6 +257,10 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, __u32 new_dir_mask = FS_MOVED_TO; __u32 rename_mask = FS_RENAME; const struct qstr *new_name = &moved->d_name; + struct fsnotify_rename_data rd = { + .moved = moved, + .target = target, + }; if (isdir) { old_dir_mask |= FS_ISDIR; @@ -265,12 +269,12 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, } /* Event with information about both old and new parent+name */ - fsnotify_name(rename_mask, moved, FSNOTIFY_EVENT_DENTRY, + fsnotify_name(rename_mask, &rd, FSNOTIFY_EVENT_RENAME, old_dir, old_name, 0); fsnotify_name(old_dir_mask, source, FSNOTIFY_EVENT_INODE, old_dir, old_name, fs_cookie); - fsnotify_name(new_dir_mask, source, FSNOTIFY_EVENT_INODE, + fsnotify_name(new_dir_mask, &rd, FSNOTIFY_EVENT_RENAME, new_dir, new_name, fs_cookie); if (target) diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index e5cde39d6e85..618eed4d6d72 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -311,6 +311,7 @@ enum fsnotify_data_type { FSNOTIFY_EVENT_DENTRY, FSNOTIFY_EVENT_MNT, FSNOTIFY_EVENT_ERROR, + FSNOTIFY_EVENT_RENAME, }; struct fs_error_report { @@ -335,6 +336,11 @@ struct fsnotify_mnt { u64 mnt_id; }; +struct fsnotify_rename_data { + 
struct dentry *moved; /* the dentry that was renamed */ + struct inode *target; /* inode overwritten by rename, or NULL */ +}; + static inline struct inode *fsnotify_data_inode(const void *data, int data_type) { switch (data_type) { @@ -348,6 +354,8 @@ static inline struct inode *fsnotify_data_inode(const void *data, int data_type) return d_inode(file_range_path(data)->dentry); case FSNOTIFY_EVENT_ERROR: return ((struct fs_error_report *)data)->inode; + case FSNOTIFY_EVENT_RENAME: + return d_inode(((const struct fsnotify_rename_data *)data)->moved); default: return NULL; } @@ -363,6 +371,8 @@ static inline struct dentry *fsnotify_data_dentry(const void *data, int data_typ return ((const struct path *)data)->dentry; case FSNOTIFY_EVENT_FILE_RANGE: return file_range_path(data)->dentry; + case FSNOTIFY_EVENT_RENAME: + return ((struct fsnotify_rename_data *)data)->moved; default: return NULL; } @@ -395,6 +405,8 @@ static inline struct super_block *fsnotify_data_sb(const void *data, return file_range_path(data)->dentry->d_sb; case FSNOTIFY_EVENT_ERROR: return ((struct fs_error_report *) data)->sb; + case FSNOTIFY_EVENT_RENAME: + return ((const struct fsnotify_rename_data *)data)->moved->d_sb; default: return NULL; } @@ -430,6 +442,14 @@ static inline struct fs_error_report *fsnotify_data_error_report( } } +static inline struct inode *fsnotify_data_rename_target(const void *data, + int data_type) +{ + if (data_type == FSNOTIFY_EVENT_RENAME) + return ((const struct fsnotify_rename_data *)data)->target; + return NULL; +} + static inline const struct file_range *fsnotify_data_file_range( const void *data, int data_type) @@ -918,6 +938,7 @@ extern void fsnotify_put_mark(struct fsnotify_mark *mark); struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark); extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info); extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info); +extern void fsnotify_modify_mark_mask(struct 
fsnotify_mark *mark, u32 set, u32 clear); static inline void fsnotify_init_event(struct fsnotify_event *event) { diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index a8f9aa75b792..6c467ded9751 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -201,6 +201,8 @@ bool fsverity_verify_blocks(struct fsverity_info *vi, struct folio *folio, size_t len, size_t offset); void fsverity_verify_bio(struct fsverity_info *vi, struct bio *bio); void fsverity_enqueue_verify_work(struct work_struct *work); +void fsverity_fill_zerohash(struct folio *folio, size_t offset, size_t len, + struct fsverity_info *vi); #else /* !CONFIG_FS_VERITY */ @@ -281,6 +283,12 @@ static inline void fsverity_enqueue_verify_work(struct work_struct *work) WARN_ON_ONCE(1); } +static inline void fsverity_fill_zerohash(struct folio *folio, size_t offset, + size_t len, struct fsverity_info *vi) +{ + WARN_ON_ONCE(1); +} + #endif /* !CONFIG_FS_VERITY */ static inline bool fsverity_verify_folio(struct fsverity_info *vi, diff --git a/include/linux/futex.h b/include/linux/futex.h index 9e9750f04980..51f4ccdc9092 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -64,14 +64,10 @@ enum { static inline void futex_init_task(struct task_struct *tsk) { - tsk->robust_list = NULL; -#ifdef CONFIG_COMPAT - tsk->compat_robust_list = NULL; -#endif - INIT_LIST_HEAD(&tsk->pi_state_list); - tsk->pi_state_cache = NULL; - tsk->futex_state = FUTEX_STATE_OK; - mutex_init(&tsk->futex_exit_mutex); + memset(&tsk->futex, 0, sizeof(tsk->futex)); + INIT_LIST_HEAD(&tsk->futex.pi_state_list); + tsk->futex.state = FUTEX_STATE_OK; + mutex_init(&tsk->futex.exit_mutex); } void futex_exit_recursive(struct task_struct *tsk); @@ -85,22 +81,18 @@ int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4) #ifdef CONFIG_FUTEX_PRIVATE_HASH int futex_hash_allocate_default(void); void futex_hash_free(struct mm_struct *mm); -int futex_mm_init(struct mm_struct *mm); - -#else /* 
!CONFIG_FUTEX_PRIVATE_HASH */ +#else /* CONFIG_FUTEX_PRIVATE_HASH */ static inline int futex_hash_allocate_default(void) { return 0; } static inline int futex_hash_free(struct mm_struct *mm) { return 0; } -static inline int futex_mm_init(struct mm_struct *mm) { return 0; } -#endif /* CONFIG_FUTEX_PRIVATE_HASH */ +#endif /* !CONFIG_FUTEX_PRIVATE_HASH */ -#else /* !CONFIG_FUTEX */ +#else /* CONFIG_FUTEX */ static inline void futex_init_task(struct task_struct *tsk) { } static inline void futex_exit_recursive(struct task_struct *tsk) { } static inline void futex_exit_release(struct task_struct *tsk) { } static inline void futex_exec_release(struct task_struct *tsk) { } -static inline long do_futex(u32 __user *uaddr, int op, u32 val, - ktime_t *timeout, u32 __user *uaddr2, - u32 val2, u32 val3) +static inline long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, + u32 __user *uaddr2, u32 val2, u32 val3) { return -EINVAL; } @@ -108,13 +100,63 @@ static inline int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsig { return -EINVAL; } -static inline int futex_hash_allocate_default(void) +static inline int futex_hash_allocate_default(void) { return 0; } +static inline int futex_hash_free(struct mm_struct *mm) { return 0; } +#endif /* !CONFIG_FUTEX */ + +#ifdef CONFIG_FUTEX_ROBUST_UNLOCK +#include <asm/futex_robust.h> + +void futex_reset_cs_ranges(struct futex_mm_data *fd); +void __futex_fixup_robust_unlock(struct pt_regs *regs, struct futex_unlock_cs_range *csr); + +static inline bool futex_within_robust_unlock(struct pt_regs *regs, + struct futex_unlock_cs_range *csr) { - return 0; + unsigned long ip = instruction_pointer(regs); + + return ip >= csr->start_ip && ip < csr->start_ip + csr->len; } -static inline int futex_hash_free(struct mm_struct *mm) { return 0; } -static inline int futex_mm_init(struct mm_struct *mm) { return 0; } -#endif +static inline void futex_fixup_robust_unlock(struct pt_regs *regs) +{ + struct futex_unlock_cs_range *csr; 
+ + /* + * Avoid dereferencing current->mm if not returning from interrupt. + * current->rseq.event is going to be used subsequently, so bringing the + * cache line in is not a big deal. + */ + if (!current->rseq.event.user_irq) + return; + + csr = current->mm->futex.unlock.cs_ranges; + + /* The loop is optimized out for !COMPAT */ + for (int r = 0; r < FUTEX_ROBUST_MAX_CS_RANGES; r++, csr++) { + if (unlikely(futex_within_robust_unlock(regs, csr))) { + __futex_fixup_robust_unlock(regs, csr); + return; + } + } +} + +static inline void futex_set_vdso_cs_range(struct futex_mm_data *fd, unsigned int idx, + unsigned long start, unsigned long end, bool sz32) +{ + fd->unlock.cs_ranges[idx].start_ip = start; + fd->unlock.cs_ranges[idx].len = end - start; + fd->unlock.cs_ranges[idx].pop_size32 = sz32; +} +#else /* CONFIG_FUTEX_ROBUST_UNLOCK */ +static inline void futex_fixup_robust_unlock(struct pt_regs *regs) { } +#endif /* !CONFIG_FUTEX_ROBUST_UNLOCK */ + +#if defined(CONFIG_FUTEX_PRIVATE_HASH) || defined(CONFIG_FUTEX_ROBUST_UNLOCK) +void futex_mm_init(struct mm_struct *mm); +#else +static inline void futex_mm_init(struct mm_struct *mm) { } #endif + +#endif /* _LINUX_FUTEX_H */ diff --git a/include/linux/futex_types.h b/include/linux/futex_types.h new file mode 100644 index 000000000000..d320c0571f0c --- /dev/null +++ b/include/linux/futex_types.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_FUTEX_TYPES_H +#define _LINUX_FUTEX_TYPES_H + +#ifdef CONFIG_FUTEX +#include <linux/compiler_types.h> +#include <linux/mutex_types.h> +#include <linux/types.h> + +struct compat_robust_list_head; +struct futex_pi_state; +struct robust_list_head; + +/** + * struct futex_sched_data - Futex related per task data + * @robust_list: User space registered robust list pointer + * @compat_robust_list: User space registered robust list pointer for compat tasks + * @pi_state_list: List head for Priority Inheritance (PI) state management + * @pi_state_cache: Pointer to 
cache one PI state object per task + * @exit_mutex: Mutex for serializing exit + * @state: Futex handling state to handle exit races correctly + */ +struct futex_sched_data { + struct robust_list_head __user *robust_list; +#ifdef CONFIG_COMPAT + struct compat_robust_list_head __user *compat_robust_list; +#endif + struct list_head pi_state_list; + struct futex_pi_state *pi_state_cache; + struct mutex exit_mutex; + unsigned int state; +}; + +#ifdef CONFIG_FUTEX_PRIVATE_HASH +/** + * struct futex_mm_phash - Futex private hash related per MM data + * @lock: Mutex to protect the private hash operations + * @hash: RCU managed pointer to the private hash + * @hash_new: Pointer to a newly allocated private hash + * @batches: Batch state for RCU synchronization + * @rcu: RCU head for call_rcu() + * @atomic: Aggregate value for @ref + * @ref: Per CPU reference counter for a private hash + */ +struct futex_mm_phash { + struct mutex lock; + struct futex_private_hash __rcu *hash; + struct futex_private_hash *hash_new; + unsigned long batches; + struct rcu_head rcu; + atomic_long_t atomic; + unsigned int __percpu *ref; +}; +#else /* CONFIG_FUTEX_PRIVATE_HASH */ +struct futex_mm_phash { }; +#endif /* !CONFIG_FUTEX_PRIVATE_HASH */ + +#ifdef CONFIG_FUTEX_ROBUST_UNLOCK +/** + * struct futex_unlock_cs_range - Range for the VDSO unlock critical section + * @start_ip: The start IP of the robust futex unlock critical section (inclusive) + * @len: The length of the robust futex unlock critical section + * @pop_size32: Pending OP pointer size indicator. 
0 == 64-bit, 1 == 32-bit + */ +struct futex_unlock_cs_range { + unsigned long start_ip; + unsigned int len; + unsigned int pop_size32; +}; + +#define FUTEX_ROBUST_MAX_CS_RANGES (1 + IS_ENABLED(CONFIG_COMPAT)) + +/** + * struct futex_unlock_cs_ranges - Futex unlock VDSO critical sections + * @cs_ranges: Array of critical section ranges + */ +struct futex_unlock_cs_ranges { + struct futex_unlock_cs_range cs_ranges[FUTEX_ROBUST_MAX_CS_RANGES]; +}; +#else /* CONFIG_FUTEX_ROBUST_UNLOCK */ +struct futex_unlock_cs_ranges { }; +#endif /* !CONFIG_FUTEX_ROBUST_UNLOCK */ + +/** + * struct futex_mm_data - Futex related per MM data + * @phash: Futex private hash related data + * @unlock: Futex unlock VDSO critical sections + */ +struct futex_mm_data { + struct futex_mm_phash phash; + struct futex_unlock_cs_ranges unlock; +}; +#else /* CONFIG_FUTEX */ +struct futex_sched_data { }; +struct futex_mm_data { }; +#endif /* !CONFIG_FUTEX */ + +#endif /* _LINUX_FUTEX_TYPES_H */ diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 31df7608737e..4e86e6990d28 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -210,8 +210,10 @@ static inline void fwnode_init(struct fwnode_handle *fwnode, { fwnode->secondary = NULL; fwnode->ops = ops; + fwnode->dev = NULL; INIT_LIST_HEAD(&fwnode->consumers); INIT_LIST_HEAD(&fwnode->suppliers); + fwnode->flags = 0; } static inline void fwnode_set_flag(struct fwnode_handle *fwnode, @@ -251,6 +253,7 @@ int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup, u8 flags); void fwnode_links_purge(struct fwnode_handle *fwnode); void fw_devlink_purge_absent_suppliers(struct fwnode_handle *fwnode); +void fw_devlink_refresh_fwnode(struct fwnode_handle *fwnode); bool fw_devlink_is_strict(void); #endif diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 9ced498fefaa..6862dea0acc5 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -206,6 +206,9 @@ static inline void 
destroy_hrtimer_on_stack(struct hrtimer *timer) { } extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 range_ns, const enum hrtimer_mode mode); +extern bool hrtimer_start_range_ns_user(struct hrtimer *timer, ktime_t tim, + u64 range_ns, const enum hrtimer_mode mode); + /** * hrtimer_start - (re)start an hrtimer * @timer: the timer to be added @@ -223,17 +226,28 @@ static inline void hrtimer_start(struct hrtimer *timer, ktime_t tim, extern int hrtimer_cancel(struct hrtimer *timer); extern int hrtimer_try_to_cancel(struct hrtimer *timer); -static inline void hrtimer_start_expires(struct hrtimer *timer, - enum hrtimer_mode mode) +static inline void hrtimer_start_expires(struct hrtimer *timer, enum hrtimer_mode mode) { - u64 delta; ktime_t soft, hard; + u64 delta; + soft = hrtimer_get_softexpires(timer); hard = hrtimer_get_expires(timer); delta = ktime_to_ns(ktime_sub(hard, soft)); hrtimer_start_range_ns(timer, soft, delta, mode); } +static inline bool hrtimer_start_expires_user(struct hrtimer *timer, enum hrtimer_mode mode) +{ + ktime_t soft, hard; + u64 delta; + + soft = hrtimer_get_softexpires(timer); + hard = hrtimer_get_expires(timer); + delta = ktime_to_ns(ktime_sub(hard, soft)); + return hrtimer_start_range_ns_user(timer, soft, delta, mode); +} + void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode mode); @@ -254,8 +268,8 @@ static inline ktime_t hrtimer_get_remaining(const struct hrtimer *timer) return __hrtimer_get_remaining(timer, false); } -extern u64 hrtimer_get_next_event(void); -extern u64 hrtimer_next_event_without(const struct hrtimer *exclude); +extern ktime_t hrtimer_get_next_event(void); +extern ktime_t hrtimer_next_event_without(const struct hrtimer *exclude); extern bool hrtimer_active(const struct hrtimer *timer); diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index 301a83afbd66..a578a10baff2 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -477,7 +477,8 @@ 
hwmon_device_register_with_info(struct device *dev, const struct attribute_group **extra_groups); struct device * hwmon_device_register_for_thermal(struct device *dev, const char *name, - void *drvdata); + void *drvdata, + const struct attribute_group **extra_groups); struct device * devm_hwmon_device_register_with_info(struct device *dev, const char *name, void *drvdata, diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 734b7ef98f4d..9de2c8d6037a 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1272,11 +1272,6 @@ struct hv_device { u16 device_id; struct device device; - /* - * Driver name to force a match. Do not set directly, because core - * frees it. Use driver_set_override() to set or clear it. - */ - const char *driver_override; struct vmbus_channel *channel; struct kset *channels_kset; diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 6cd26ffb0505..3bf969ad8fe0 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -864,6 +864,7 @@ static inline void init_irq_proc(void) struct seq_file; int show_interrupts(struct seq_file *p, void *v); int arch_show_interrupts(struct seq_file *p, int prec); +void irq_proc_emit_counts(struct seq_file *p, unsigned int __percpu *cnts); extern int early_irq_init(void); extern int arch_probe_nr_irqs(void); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 2c5685adf3a9..3582ed1fe236 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -67,6 +67,9 @@ struct vm_fault; * bio, i.e. set REQ_ATOMIC. * * IOMAP_F_INTEGRITY indicates that the filesystems handles integrity metadata. + * + * IOMAP_F_ZERO_TAIL indicates the remainder of the block after the data + * written should be zeroed. 
*/ #define IOMAP_F_NEW (1U << 0) #define IOMAP_F_DIRTY (1U << 1) @@ -86,6 +89,15 @@ struct vm_fault; #else #define IOMAP_F_INTEGRITY 0 #endif /* CONFIG_BLK_DEV_INTEGRITY */ +#define IOMAP_F_ZERO_TAIL (1U << 10) + +/* + * Indicates reads and writes of fsverity metadata. + * + * Fsverity metadata is stored after the regular file data and thus beyond + * i_size. + */ +#define IOMAP_F_FSVERITY (1U << 11) /* * Flag reserved for file system specific usage @@ -143,16 +155,6 @@ static inline void *iomap_inline_data(const struct iomap *iomap, loff_t pos) } /* - * Check if the mapping's length is within the valid range for inline data. - * This is used to guard against accessing data beyond the page inline_data - * points at. - */ -static inline bool iomap_inline_data_valid(const struct iomap *iomap) -{ - return iomap->length <= PAGE_SIZE - offset_in_page(iomap->inline_data); -} - -/* * When get_folio succeeds, put_folio will always be called to do any * cleanup work necessary. put_folio is responsible for unlocking and putting * @folio. @@ -351,6 +353,9 @@ static inline bool iomap_want_unshare_iter(const struct iomap_iter *iter) ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops, const struct iomap_write_ops *write_ops, void *private); +int iomap_fsverity_write(struct file *file, loff_t pos, size_t length, + const void *buf, const struct iomap_ops *ops, + const struct iomap_write_ops *write_ops); void iomap_read_folio(const struct iomap_ops *ops, struct iomap_read_folio_ctx *ctx, void *private); void iomap_readahead(const struct iomap_ops *ops, @@ -427,6 +432,7 @@ struct iomap_ioend { loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ void *io_private; /* file system private data */ + struct fsverity_info *io_vi; /* fsverity info */ struct bio io_bio; /* MUST BE LAST! 
*/ }; @@ -501,6 +507,7 @@ struct iomap_read_folio_ctx { struct readahead_control *rac; void *read_ctx; loff_t read_ctx_file_offset; + struct fsverity_info *vi; }; struct iomap_read_ops { diff --git a/include/linux/iommu-dma.h b/include/linux/iommu-dma.h index a92b3ff9b934..060f6e23ab3c 100644 --- a/include/linux/iommu-dma.h +++ b/include/linux/iommu-dma.h @@ -7,12 +7,13 @@ #ifndef _LINUX_IOMMU_DMA_H #define _LINUX_IOMMU_DMA_H +#include <linux/device.h> #include <linux/dma-direction.h> #ifdef CONFIG_IOMMU_DMA static inline bool use_dma_iommu(struct device *dev) { - return dev->dma_iommu; + return dev_dma_iommu(dev); } #else static inline bool use_dma_iommu(struct device *dev) diff --git a/include/linux/irq.h b/include/linux/irq.h index efa514ee562f..f485369b1b4f 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -103,6 +103,7 @@ enum { IRQ_DISABLE_UNLAZY = (1 << 19), IRQ_HIDDEN = (1 << 20), IRQ_NO_DEBUG = (1 << 21), + IRQ_RESERVED = (1 << 22), }; #define IRQF_MODIFY_MASK \ diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 0225121f3013..ea5fd2374ebe 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -604,7 +604,7 @@ #include <asm/arch_gicv3.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * We need a value to serve as a irq-type for LPIs. 
Choose one that will diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h index d45fa19f9e47..849386dc5ec8 100644 --- a/include/linux/irqchip/arm-gic.h +++ b/include/linux/irqchip/arm-gic.h @@ -131,7 +131,7 @@ #define GICV_PMR_PRIORITY_SHIFT 3 #define GICV_PMR_PRIORITY_MASK (0x1f << GICV_PMR_PRIORITY_SHIFT) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/irqdomain.h> @@ -162,5 +162,5 @@ int gic_get_cpu_id(unsigned int cpu); void gic_migrate_target(unsigned int new_cpu_id); unsigned long gic_get_sgir_physaddr(void); -#endif /* __ASSEMBLY */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index dae9a9b93665..8080db17c1b1 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -52,8 +52,8 @@ struct irq_redirect { * @depth: disable-depth, for nested irq_disable() calls * @wake_depth: enable depth, for multiple irq_set_irq_wake() callers * @tot_count: stats field for non-percpu irqs - * @irq_count: stats field to detect stalled irqs * @last_unhandled: aging timer for unhandled count + * @irq_count: stats field to detect stalled irqs * @irqs_unhandled: stats field for spurious unhandled interrupts * @threads_handled: stats field for deferred spurious detection of threaded handlers * @threads_handled_last: comparator field for deferred spurious detection of threaded handlers @@ -70,6 +70,7 @@ struct irq_redirect { * IRQF_NO_SUSPEND set * @force_resume_depth: number of irqactions on a irq descriptor with * IRQF_FORCE_RESUME set + * @refcnt: Reference count mainly for /proc/interrupts * @rcu: rcu head for delayed free * @kobj: kobject used to represent this struct in sysfs * @request_mutex: mutex to protect request/free before locking desc->lock @@ -87,9 +88,9 @@ struct irq_desc { unsigned int core_internal_state__do_not_mess_with_it; unsigned int depth; /* nested irq disables */ unsigned int wake_depth; /* nested wake enables */ - unsigned int tot_count; - unsigned int 
irq_count; /* For detecting broken IRQs */ + unsigned long tot_count; unsigned long last_unhandled; /* Aging timer for unhandled count */ + unsigned int irq_count; /* For detecting broken IRQs */ unsigned int irqs_unhandled; atomic_t threads_handled; int threads_handled_last; @@ -119,6 +120,7 @@ struct irq_desc { struct dentry *debugfs_file; const char *dev_name; #endif + rcuref_t refcnt; #ifdef CONFIG_SPARSE_IRQ struct rcu_head rcu; struct kobject kobj; diff --git a/include/linux/kcsan-checks.h b/include/linux/kcsan-checks.h index 92f3843d9ebb..e135dacaa90f 100644 --- a/include/linux/kcsan-checks.h +++ b/include/linux/kcsan-checks.h @@ -282,7 +282,7 @@ static inline void __kcsan_disable_current(void) { } * @size: size of access */ #define __kcsan_check_write(ptr, size) \ - __kcsan_check_access(ptr, size, KCSAN_ACCESS_WRITE) + __kcsan_check_access(absolute_pointer(ptr), size, KCSAN_ACCESS_WRITE) /** * __kcsan_check_read_write - check regular read-write access for races @@ -308,7 +308,7 @@ static inline void __kcsan_disable_current(void) { } * @size: size of access */ #define kcsan_check_write(ptr, size) \ - kcsan_check_access(ptr, size, KCSAN_ACCESS_WRITE) + kcsan_check_access(absolute_pointer(ptr), size, KCSAN_ACCESS_WRITE) /** * kcsan_check_read_write - check regular read-write access for races @@ -331,7 +331,7 @@ static inline void __kcsan_disable_current(void) { } #define kcsan_check_atomic_read(ptr, size) \ kcsan_check_access(ptr, size, KCSAN_ACCESS_ATOMIC) #define kcsan_check_atomic_write(ptr, size) \ - kcsan_check_access(ptr, size, KCSAN_ACCESS_ATOMIC | KCSAN_ACCESS_WRITE) + kcsan_check_access(absolute_pointer(ptr), size, KCSAN_ACCESS_ATOMIC | KCSAN_ACCESS_WRITE) #define kcsan_check_atomic_read_write(ptr, size) \ kcsan_check_access(ptr, size, KCSAN_ACCESS_ATOMIC | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_COMPOUND) #endif diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index b97ce2df376f..fce1392e2140 100644 --- a/include/linux/kernel_stat.h 
+++ b/include/linux/kernel_stat.h @@ -34,7 +34,14 @@ enum cpu_usage_stat { }; struct kernel_cpustat { - u64 cpustat[NR_STATS]; +#ifdef CONFIG_NO_HZ_COMMON + bool idle_dyntick; + bool idle_elapse; + seqcount_t idle_sleeptime_seq; + u64 idle_entrytime; + u64 idle_stealtime[2]; +#endif + u64 cpustat[NR_STATS]; }; struct kernel_stat { @@ -99,23 +106,68 @@ static inline unsigned long kstat_cpu_irqs_sum(unsigned int cpu) return kstat_cpu(cpu).irqs_sum; } +#ifdef CONFIG_NO_HZ_COMMON +extern void kcpustat_dyntick_start(u64 now); +extern void kcpustat_dyntick_stop(u64 now); +extern void kcpustat_irq_enter(u64 now); +extern void kcpustat_irq_exit(u64 now); +extern u64 kcpustat_field_idle(int cpu); +extern u64 kcpustat_field_iowait(int cpu); + +static inline bool kcpustat_idle_dyntick(void) +{ + return __this_cpu_read(kernel_cpustat.idle_dyntick); +} +#else +static inline u64 kcpustat_field_idle(int cpu) +{ + return kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; +} +static inline u64 kcpustat_field_iowait(int cpu) +{ + return kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; +} + +static inline bool kcpustat_idle_dyntick(void) +{ + return false; +} +#endif /* CONFIG_NO_HZ_COMMON */ + +extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); +extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); + +/* Fetch cputime values when vtime is disabled on a CPU */ +static inline u64 kcpustat_field_default(enum cpu_usage_stat usage, int cpu) +{ + if (usage == CPUTIME_IDLE) + return kcpustat_field_idle(cpu); + if (usage == CPUTIME_IOWAIT) + return kcpustat_field_iowait(cpu); + return kcpustat_cpu(cpu).cpustat[usage]; +} + +static inline void kcpustat_cpu_fetch_default(struct kernel_cpustat *dst, int cpu) +{ + *dst = kcpustat_cpu(cpu); + dst->cpustat[CPUTIME_IDLE] = kcpustat_field_idle(cpu); + dst->cpustat[CPUTIME_IOWAIT] = kcpustat_field_iowait(cpu); +} + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -extern u64 kcpustat_field(struct kernel_cpustat *kcpustat, - enum cpu_usage_stat 
usage, int cpu); +extern u64 kcpustat_field(enum cpu_usage_stat usage, int cpu); extern void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu); #else -static inline u64 kcpustat_field(struct kernel_cpustat *kcpustat, - enum cpu_usage_stat usage, int cpu) +static inline u64 kcpustat_field(enum cpu_usage_stat usage, int cpu) { - return kcpustat->cpustat[usage]; + return kcpustat_field_default(usage, cpu); } static inline void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu) { - *dst = kcpustat_cpu(cpu); + kcpustat_cpu_fetch_default(dst, cpu); } - -#endif +#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */ extern void account_user_time(struct task_struct *, u64); extern void account_guest_time(struct task_struct *, u64); @@ -124,19 +176,17 @@ extern void account_system_index_time(struct task_struct *, u64, enum cpu_usage_stat); extern void account_steal_time(u64); extern void account_idle_time(u64); -extern u64 get_idle_time(struct kernel_cpustat *kcs, int cpu); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE static inline void account_process_tick(struct task_struct *tsk, int user) { - vtime_flush(tsk); + if (!kcpustat_idle_dyntick()) + vtime_flush(tsk); } #else extern void account_process_tick(struct task_struct *, int user); #endif -extern void account_idle_ticks(unsigned long ticks); - #ifdef CONFIG_SCHED_CORE extern void __account_forceidle_time(struct task_struct *tsk, u64 delta); #endif diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index e21b2f7f4159..351a5101c862 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -76,20 +76,25 @@ struct kernfs_iattrs; * kernfs_open_file. * * kernfs_open_files are chained at kernfs_open_node->files, which is - * protected by kernfs_global_locks.open_file_mutex[i]. + * protected by kernfs_global_locks.node_mutex[i]. * * To reduce possible contention in sysfs access, arising due to single - * locks, use an array of locks (e.g. 
open_file_mutex) and use kernfs_node + * locks, use an array of locks (e.g. node_mutex) and use kernfs_node * object address as hash keys to get the index of these locks. * * Hashed mutexes are safe to use here because operations using these don't * rely on global exclusion. * + * The hashed mutex array protects per-node data: the kernfs_open_node for + * open file management, and kernfs_node xattr operations (necessary because + * multiple superblocks with different namespaces can share the same + * kernfs_node, making per-inode locking insufficient). + * * In future we intend to replace other global locks with hashed ones as well. * kernfs_global_locks acts as a holder for all such hash tables. */ struct kernfs_global_locks { - struct mutex open_file_mutex[NR_KERNFS_LOCKS]; + struct mutex node_mutex[NR_KERNFS_LOCKS]; }; enum kernfs_node_type { diff --git a/include/linux/kstrtox.h b/include/linux/kstrtox.h index 6ea897222af1..6c9282866770 100644 --- a/include/linux/kstrtox.h +++ b/include/linux/kstrtox.h @@ -142,10 +142,9 @@ static inline int __must_check kstrtos32_from_user(const char __user *s, size_t * Keep in mind above caveat. 
*/ -extern unsigned long simple_strtoul(const char *,char **,unsigned int); -extern unsigned long simple_strntoul(const char *,char **,unsigned int,size_t); -extern long simple_strtol(const char *,char **,unsigned int); -extern unsigned long long simple_strtoull(const char *,char **,unsigned int); -extern long long simple_strtoll(const char *,char **,unsigned int); +unsigned long simple_strtoul(const char *cp, char **endp, unsigned int base); +long simple_strtol(const char *cp, char **endp, unsigned int base); +unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base); +long long simple_strtoll(const char *cp, char **endp, unsigned int base); #endif /* _LINUX_KSTRTOX_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a308e2c23b82..5cadb00d9352 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -20,6 +20,7 @@ #include <linux/seqlock.h> #include <linux/percpu_counter.h> #include <linux/types.h> +#include <linux/futex_types.h> #include <linux/rseq_types.h> #include <linux/bitmap.h> @@ -1222,6 +1223,8 @@ struct mm_struct { /* MM CID related storage */ struct mm_mm_cid mm_cid; + /* sched_cache related statistics */ + struct sched_cache_stat sc_stat; #ifdef CONFIG_MMU atomic_long_t pgtables_bytes; /* size of all page tables */ #endif @@ -1270,16 +1273,7 @@ struct mm_struct { */ seqcount_t mm_lock_seq; #endif -#ifdef CONFIG_FUTEX_PRIVATE_HASH - struct mutex futex_hash_lock; - struct futex_private_hash __rcu *futex_phash; - struct futex_private_hash *futex_phash_new; - /* futex-ref */ - unsigned long futex_batches; - struct rcu_head futex_rcu; - atomic_long_t futex_atomic; - unsigned int __percpu *futex_ref; -#endif + struct futex_mm_data futex; unsigned long hiwater_rss; /* High-watermark of RSS usage */ unsigned long hiwater_vm; /* High-water virtual memory usage */ @@ -1342,7 +1336,6 @@ struct mm_struct { */ struct task_struct __rcu *owner; #endif - struct user_namespace *user_ns; /* store ref to 
file /proc/<pid>/exe symlink points to */ struct file __rcu *exe_file; @@ -1628,6 +1621,36 @@ static inline unsigned int mm_cid_size(void) # define MM_CID_STATIC_SIZE 0 #endif /* CONFIG_SCHED_MM_CID */ +#ifdef CONFIG_SCHED_CACHE +void mm_init_sched(struct mm_struct *mm, + struct sched_cache_time __percpu *pcpu_sched); + +static inline int mm_alloc_sched_noprof(struct mm_struct *mm) +{ + struct sched_cache_time __percpu *pcpu_sched = + alloc_percpu_noprof(struct sched_cache_time); + + if (!pcpu_sched) + return -ENOMEM; + + mm_init_sched(mm, pcpu_sched); + return 0; +} + +#define mm_alloc_sched(...) alloc_hooks(mm_alloc_sched_noprof(__VA_ARGS__)) + +static inline void mm_destroy_sched(struct mm_struct *mm) +{ + free_percpu(mm->sc_stat.pcpu_sched); + mm->sc_stat.pcpu_sched = NULL; +} +#else /* !CONFIG_SCHED_CACHE */ + +static inline int mm_alloc_sched(struct mm_struct *mm) { return 0; } +static inline void mm_destroy_sched(struct mm_struct *mm) { } + +#endif /* CONFIG_SCHED_CACHE */ + struct mmu_gather; extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); @@ -1907,11 +1930,11 @@ enum { /* mm flags */ /* - * The first two bits represent core dump modes for set-user-ID, - * the modes are SUID_DUMP_* defined in linux/sched/coredump.h + * Bits 0 and 1 were dumpability; that moved to task->exec_state. Reserve + * the bits so MMF_DUMP_FILTER_* positions stay stable for the + * /proc/<pid>/coredump_filter ABI. 
*/ #define MMF_DUMPABLE_BITS 2 -#define MMF_DUMPABLE_MASK (BIT(MMF_DUMPABLE_BITS) - 1) /* coredump filter bits */ #define MMF_DUMP_ANON_PRIVATE 2 #define MMF_DUMP_ANON_SHARED 3 @@ -1972,7 +1995,7 @@ enum { #define MMF_TOPDOWN 31 /* mm searches top down by default */ #define MMF_TOPDOWN_MASK BIT(MMF_TOPDOWN) -#define MMF_INIT_LEGACY_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ +#define MMF_INIT_LEGACY_MASK (MMF_DUMP_FILTER_MASK |\ MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\ MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK) diff --git a/include/linux/msi.h b/include/linux/msi.h index fa41eed62868..a4613de11960 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -444,7 +444,7 @@ struct msi_domain_info; * * @domain_alloc_irqs, @domain_free_irqs can be used to override the * default allocation/free functions (__msi_domain_alloc/free_irqs). This - * is initially for a wrapper around XENs seperate MSI universe which can't + * is initially for a wrapper around XEN's separate MSI universe which can't * be wrapped into the regular irq domains concepts by mere mortals. This * allows to universally use msi_domain_alloc/free_irqs without having to * special case XEN all over the place. 
diff --git a/include/linux/namei.h b/include/linux/namei.h index 2ad6dd9987b9..ebe6e29f7e93 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -13,11 +13,6 @@ enum { MAX_NESTED_LINKS = 8 }; #define MAXSYMLINKS 40 -/* - * Type of the last component on LOOKUP_PARENT - */ -enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT}; - /* pathwalk mode */ #define LOOKUP_FOLLOW BIT(0) /* follow links at the end */ #define LOOKUP_DIRECTORY BIT(1) /* require a directory */ @@ -61,13 +56,12 @@ extern struct dentry *start_creating_path(int, const char *, struct path *, unsi extern struct dentry *start_creating_user_path(int, const char __user *, struct path *, unsigned int); extern void end_creating_path(const struct path *, struct dentry *); extern struct dentry *start_removing_path(const char *, struct path *); -extern struct dentry *start_removing_user_path_at(int , const char __user *, struct path *); static inline void end_removing_path(const struct path *path , struct dentry *dentry) { end_creating_path(path, dentry); } int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, - struct path *parent, struct qstr *last, int *type, + struct path *parent, struct qstr *last, const struct path *root); int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 4daee27fa5eb..34d294774f8c 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -306,7 +306,7 @@ struct nfs_server { #define NFS_CAP_ATOMIC_OPEN (1U << 4) #define NFS_CAP_LGOPEN (1U << 5) #define NFS_CAP_CASE_INSENSITIVE (1U << 6) -#define NFS_CAP_CASE_PRESERVING (1U << 7) +#define NFS_CAP_CASE_NONPRESERVING (1U << 7) #define NFS_CAP_REBOOT_LAYOUTRETURN (1U << 8) #define NFS_CAP_OFFLOAD_STATUS (1U << 9) #define NFS_CAP_ZERO_RANGE (1U << 10) diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index fcbd21b5685f..35ea18a40b66 100644 --- 
a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -182,6 +182,8 @@ struct nfs_pathconf { struct nfs_fattr *fattr; /* Post-op attributes */ __u32 max_link; /* max # of hard links */ __u32 max_namelen; /* max name length */ + bool case_insensitive; + bool case_preserving; }; struct nfs4_change_info { @@ -1743,7 +1745,6 @@ struct nfs_unlinkdata { struct nfs_removeargs args; struct nfs_removeres res; struct dentry *dentry; - wait_queue_head_t wq; const struct cred *cred; struct nfs_fattr dir_attr; long timeout; diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index c8cb010d655e..39d5bf8e6562 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -107,6 +107,8 @@ static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem) return ret; } +extern void __percpu_up_read(struct percpu_rw_semaphore *sem); + static inline void percpu_up_read(struct percpu_rw_semaphore *sem) { rwsem_release(&sem->dep_map, _RET_IP_); @@ -118,18 +120,7 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem) if (likely(rcu_sync_is_idle(&sem->rss))) { this_cpu_dec(*sem->read_count); } else { - /* - * slowpath; reader will only ever wake a single blocked - * writer. - */ - smp_mb(); /* B matches C */ - /* - * In other words, if they see our decrement (presumably to - * aggregate zero, as that is the only time it matters) they - * will also see our critical section. 
- */ - this_cpu_dec(*sem->read_count); - rcuwait_wake_up(&sem->writer); + __percpu_up_read(sem); } preempt_enable(); } diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 85bf8dd9f087..2f5a889aa50d 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -3,13 +3,14 @@ #define __LINUX_PERCPU_H #include <linux/alloc_tag.h> +#include <linux/cleanup.h> +#include <linux/compiler_types.h> +#include <linux/init.h> #include <linux/mmdebug.h> -#include <linux/preempt.h> -#include <linux/smp.h> #include <linux/pfn.h> -#include <linux/init.h> -#include <linux/cleanup.h> +#include <linux/preempt.h> #include <linux/sched.h> +#include <linux/smp.h> #include <asm/percpu.h> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index cdd68ed3ae1a..07483ed9b3ce 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -301,6 +301,28 @@ static inline void lazy_mmu_mode_disable(void) } /** + * __task_lazy_mmu_mode_pause() - Pause the lazy MMU mode for a task. + * @tsk: The task to check. + * + * Pauses the lazy MMU mode of @tsk. + * + * This function only operates on the state saved in task_struct; to pause + * current lazy_mmu_mode_pause() should be used instead. + * + * This function is intended for architectures that implement the lazy MMU + * mode; it must not be called from generic code. + */ +static inline void __task_lazy_mmu_mode_pause(struct task_struct *tsk) +{ + struct lazy_mmu_state *state = &tsk->lazy_mmu_state; + + VM_WARN_ON_ONCE(state->pause_count == U8_MAX); + + if (state->pause_count++ == 0 && state->enable_count > 0) + arch_leave_lazy_mmu_mode(); +} + +/** * lazy_mmu_mode_pause() - Pause the lazy MMU mode. 
* * Pauses the lazy MMU mode; if it is currently active, disables it and calls @@ -315,15 +337,32 @@ static inline void lazy_mmu_mode_disable(void) */ static inline void lazy_mmu_mode_pause(void) { - struct lazy_mmu_state *state = &current->lazy_mmu_state; - if (in_interrupt()) return; - VM_WARN_ON_ONCE(state->pause_count == U8_MAX); + __task_lazy_mmu_mode_pause(current); +} - if (state->pause_count++ == 0 && state->enable_count > 0) - arch_leave_lazy_mmu_mode(); +/** + * __task_lazy_mmu_mode_resume() - Resume the lazy MMU mode for a task. + * @tsk: The task to check. + * + * Resumes the lazy MMU mode of @tsk. + * + * This function only operates on the state saved in task_struct; to resume + * current lazy_mmu_mode_resume() should be used instead. + * + * This function is intended for architectures that implement the lazy MMU + * mode; it must not be called from generic code. + */ +static inline void __task_lazy_mmu_mode_resume(struct task_struct *tsk) +{ + struct lazy_mmu_state *state = &tsk->lazy_mmu_state; + + VM_WARN_ON_ONCE(state->pause_count == 0); + + if (--state->pause_count == 0 && state->enable_count > 0) + arch_enter_lazy_mmu_mode(); } /** @@ -341,15 +380,10 @@ static inline void lazy_mmu_mode_pause(void) */ static inline void lazy_mmu_mode_resume(void) { - struct lazy_mmu_state *state = &current->lazy_mmu_state; - if (in_interrupt()) return; - VM_WARN_ON_ONCE(state->pause_count == 0); - - if (--state->pause_count == 0 && state->enable_count > 0) - arch_enter_lazy_mmu_mode(); + __task_lazy_mmu_mode_resume(current); } #else static inline void lazy_mmu_mode_enable(void) {} diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 975400a472e3..26e6a43358e2 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -293,18 +293,19 @@ struct platform_driver { * use a macro to avoid include chaining to get THIS_MODULE */ #define platform_driver_register(drv) \ - __platform_driver_register(drv, THIS_MODULE) + 
__platform_driver_register(drv, THIS_MODULE, KBUILD_MODNAME) extern int __platform_driver_register(struct platform_driver *, - struct module *); + struct module *, const char *mod_name); extern void platform_driver_unregister(struct platform_driver *); /* non-hotpluggable platform devices may use this so that probe() and * its support may live in __init sections, conserving runtime memory. */ #define platform_driver_probe(drv, probe) \ - __platform_driver_probe(drv, probe, THIS_MODULE) + __platform_driver_probe(drv, probe, THIS_MODULE, KBUILD_MODNAME) extern int __platform_driver_probe(struct platform_driver *driver, - int (*probe)(struct platform_device *), struct module *module); + int (*probe)(struct platform_device *), struct module *module, + const char *mod_name); static inline void *platform_get_drvdata(const struct platform_device *pdev) { @@ -368,19 +369,19 @@ static int __init __platform_driver##_init(void) \ device_initcall(__platform_driver##_init); \ #define platform_create_bundle(driver, probe, res, n_res, data, size) \ - __platform_create_bundle(driver, probe, res, n_res, data, size, THIS_MODULE) + __platform_create_bundle(driver, probe, res, n_res, data, size, THIS_MODULE, KBUILD_MODNAME) extern struct platform_device *__platform_create_bundle( struct platform_driver *driver, int (*probe)(struct platform_device *), struct resource *res, unsigned int n_res, - const void *data, size_t size, struct module *module); + const void *data, size_t size, struct module *module, const char *mod_name); int __platform_register_drivers(struct platform_driver * const *drivers, - unsigned int count, struct module *owner); + unsigned int count, struct module *owner, const char *mod_name); void platform_unregister_drivers(struct platform_driver * const *drivers, unsigned int count); #define platform_register_drivers(drivers, count) \ - __platform_register_drivers(drivers, count, THIS_MODULE) + __platform_register_drivers(drivers, count, THIS_MODULE, KBUILD_MODNAME) 
#ifdef CONFIG_SUSPEND extern int platform_pm_suspend(struct device *dev); diff --git a/include/linux/pps_kernel.h b/include/linux/pps_kernel.h index aab0aebb529e..9f088c9023b1 100644 --- a/include/linux/pps_kernel.h +++ b/include/linux/pps_kernel.h @@ -99,12 +99,14 @@ static inline void timespec_to_pps_ktime(struct pps_ktime *kt, static inline void pps_get_ts(struct pps_event_time *ts) { +#ifdef CONFIG_NTP_PPS struct system_time_snapshot snap; - ktime_get_snapshot(&snap); - ts->ts_real = ktime_to_timespec64(snap.real); -#ifdef CONFIG_NTP_PPS - ts->ts_raw = ktime_to_timespec64(snap.raw); + ktime_get_snapshot_id(CLOCK_REALTIME, &snap); + ts->ts_real = ktime_to_timespec64(snap.systime); + ts->ts_raw = ktime_to_timespec64(snap.monoraw); +#else + ktime_get_real_ts64(&ts->ts_real); #endif } diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 19d1c5e5f335..47d7deaeed8f 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -67,6 +67,7 @@ enum proc_pidonly { struct proc_fs_info { struct pid_namespace *pid_ns; kgid_t pid_gid; + const struct cred *mounter_cred; enum proc_hidepid hide_pid; enum proc_pidonly pidonly; struct rcu_head rcu; @@ -248,4 +249,16 @@ static inline struct pid_namespace *proc_pid_ns(struct super_block *sb) bool proc_ns_file(const struct file *file); +#if defined CONFIG_PROC_FS && !defined MODULE +void impl_proc_make_permanent(struct proc_dir_entry *pde); +#endif + +static inline void proc_make_permanent(struct proc_dir_entry *pde) +{ + /* Don't give matches to modules. 
*/ +#if defined CONFIG_PROC_FS && !defined MODULE + impl_proc_make_permanent(pde); +#endif +} + #endif /* _LINUX_PROC_FS_H */ diff --git a/include/linux/property.h b/include/linux/property.h index e30ef23a9af3..14c304db4664 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -471,12 +471,18 @@ struct property_entry { #define PROPERTY_ENTRY_STRING(_name_, _val_) \ __PROPERTY_ENTRY_ELEMENT(_name_, str, STRING, _val_) +#define __PROPERTY_ENTRY_REF_ARGS(_ref_, ...) \ + _Generic(_ref_, \ + const struct software_node_ref_args *: _ref_, \ + struct software_node_ref_args *: _ref_, \ + default: &SOFTWARE_NODE_REFERENCE(_ref_, ##__VA_ARGS__)) + #define PROPERTY_ENTRY_REF(_name_, _ref_, ...) \ (struct property_entry) { \ .name = _name_, \ .length = sizeof(struct software_node_ref_args), \ .type = DEV_PROP_REF, \ - { .pointer = &SOFTWARE_NODE_REFERENCE(_ref_, ##__VA_ARGS__), }, \ + { .pointer = __PROPERTY_ENTRY_REF_ARGS(_ref_, ##__VA_ARGS__) }, \ } #define PROPERTY_ENTRY_BOOL(_name_) \ diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 884364596dd3..36a27a910595 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -12,6 +12,7 @@ #include <linux/pps_kernel.h> #include <linux/ptp_clock.h> #include <linux/timecounter.h> +#include <linux/timekeeping.h> #include <linux/skbuff.h> #define PTP_CLOCK_NAME_LEN 32 @@ -45,13 +46,13 @@ struct system_device_crosststamp; /** * struct ptp_system_timestamp - system time corresponding to a PHC timestamp - * @pre_ts: system timestamp before capturing PHC - * @post_ts: system timestamp after capturing PHC - * @clockid: clock-base used for capturing the system timestamps + * @pre_sts: system time snapshot before capturing PHC + * @post_sts: system time snapshot after capturing PHC + * @clockid: clock-base used for capturing the system timestamps */ struct ptp_system_timestamp { - struct timespec64 pre_ts; - struct timespec64 post_ts; + struct 
system_time_snapshot pre_sts; + struct system_time_snapshot post_sts; clockid_t clockid; }; @@ -510,13 +511,13 @@ static inline ktime_t ptp_convert_timestamp(const ktime_t *hwtstamp, static inline void ptp_read_system_prets(struct ptp_system_timestamp *sts) { if (sts) - ktime_get_clock_ts64(sts->clockid, &sts->pre_ts); + ktime_get_snapshot_id(sts->clockid, &sts->pre_sts); } static inline void ptp_read_system_postts(struct ptp_system_timestamp *sts) { if (sts) - ktime_get_clock_ts64(sts->clockid, &sts->post_ts); + ktime_get_snapshot_id(sts->clockid, &sts->post_sts); } #endif diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 90507d4afcd6..ef314f7a9ecc 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -17,6 +17,7 @@ struct syscall_info { struct seccomp_data data; }; +bool ptracer_access_allowed(struct task_struct *tsk); extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index bfa765132de8..5e95acc33989 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -592,11 +592,13 @@ context_unsafe( \ * lockdep checks for being in an RCU read-side critical section. This is * useful when the value of this pointer is accessed, but the pointer is * not dereferenced, for example, when testing an RCU-protected pointer - * against NULL. Although rcu_access_pointer() may also be used in cases - * where update-side locks prevent the value of the pointer from changing, - * you should instead use rcu_dereference_protected() for this use case. - * Within an RCU read-side critical section, there is little reason to - * use rcu_access_pointer(). + * against NULL. Within an RCU read-side critical section, there is little + * reason to use rcu_access_pointer(). 
Although rcu_access_pointer() may + * also be used in cases where update-side locks prevent the value of the + * pointer from changing, you should instead use rcu_dereference_protected() + * for this use case. It is also permissible to use rcu_access_pointer() + * within lockless updaters to obtain the old value for an atomic operation, + * for example, for cmpxchg(). * * It is usually best to test the rcu_access_pointer() return value * directly in order to avoid accidental dereferences being introduced diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h index fc2f596a6df1..57c11ec9dc64 100644 --- a/include/linux/rhashtable-types.h +++ b/include/linux/rhashtable-types.h @@ -136,12 +136,26 @@ struct rhashtable_iter { bool end_of_table; }; -int rhashtable_init_noprof(struct rhashtable *ht, - const struct rhashtable_params *params); +int __rhashtable_init_noprof(struct rhashtable *ht, + const struct rhashtable_params *params, + struct lock_class_key *key); +#define rhashtable_init_noprof(ht, params) \ +({ \ + static struct lock_class_key __key; \ + \ + __rhashtable_init_noprof(ht, params, &__key); \ +}) #define rhashtable_init(...) alloc_hooks(rhashtable_init_noprof(__VA_ARGS__)) -int rhltable_init_noprof(struct rhltable *hlt, - const struct rhashtable_params *params); +int __rhltable_init_noprof(struct rhltable *hlt, + const struct rhashtable_params *params, + struct lock_class_key *key); +#define rhltable_init_noprof(hlt, params) \ +({ \ + static struct lock_class_key __key; \ + \ + __rhltable_init_noprof(hlt, params, &__key); \ +}) #define rhltable_init(...) 
alloc_hooks(rhltable_init_noprof(__VA_ARGS__)) #endif /* _LINUX_RHASHTABLE_TYPES_H */ diff --git a/include/linux/rpmsg.h b/include/linux/rpmsg.h index 83266ce14642..2e40eb54155e 100644 --- a/include/linux/rpmsg.h +++ b/include/linux/rpmsg.h @@ -41,9 +41,6 @@ struct rpmsg_channel_info { * rpmsg_device - device that belong to the rpmsg bus * @dev: the device struct * @id: device id (used to match between rpmsg drivers and devices) - * @driver_override: driver name to force a match; do not set directly, - * because core frees it; use driver_set_override() to - * set or clear it. * @src: local address * @dst: destination address * @ept: the rpmsg endpoint of this channel @@ -53,7 +50,6 @@ struct rpmsg_channel_info { struct rpmsg_device { struct device dev; struct rpmsg_device_id id; - const char *driver_override; u32 src; u32 dst; struct rpmsg_endpoint *ept; diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 78e7e588817c..9e1f012f89db 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -56,6 +56,8 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock) #endif extern void rt_mutex_base_init(struct rt_mutex_base *rtb); +context_lock_struct(rt_mutex); + /** * The rt_mutex structure * @@ -108,8 +110,10 @@ do { \ extern void __rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key); #ifdef CONFIG_DEBUG_LOCK_ALLOC -extern void rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass); -extern void _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock); +extern void rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass) + __acquires(lock); +extern void _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock) + __acquires(lock); #define rt_mutex_lock(lock) rt_mutex_lock_nested(lock, 0) #define rt_mutex_lock_nest_lock(lock, nest_lock) \ do { \ @@ -118,15 +122,19 @@ extern void _rt_mutex_lock_nest_lock(struct rt_mutex *lock, 
struct lockdep_map * } while (0) #else -extern void rt_mutex_lock(struct rt_mutex *lock); +extern void rt_mutex_lock(struct rt_mutex *lock) __acquires(lock); #define rt_mutex_lock_nested(lock, subclass) rt_mutex_lock(lock) #define rt_mutex_lock_nest_lock(lock, nest_lock) rt_mutex_lock(lock) #endif -extern int rt_mutex_lock_interruptible(struct rt_mutex *lock); -extern int rt_mutex_lock_killable(struct rt_mutex *lock); -extern int rt_mutex_trylock(struct rt_mutex *lock); +extern int rt_mutex_lock_interruptible(struct rt_mutex *lock) + __cond_acquires(0, lock); +extern int rt_mutex_lock_killable(struct rt_mutex *lock) + __cond_acquires(0, lock); +extern int rt_mutex_trylock(struct rt_mutex *lock) + __cond_acquires(true, lock); -extern void rt_mutex_unlock(struct rt_mutex *lock); +extern void rt_mutex_unlock(struct rt_mutex *lock) + __releases(lock); #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index ee06cba5c6f5..b3204a15d512 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -16,6 +16,7 @@ #include <linux/cpumask_types.h> #include <linux/cache.h> +#include <linux/futex_types.h> #include <linux/irqflags_types.h> #include <linux/smp_types.h> #include <linux/pid_types.h> @@ -64,7 +65,6 @@ struct bpf_net_context; struct capture_control; struct cfs_rq; struct fs_struct; -struct futex_pi_state; struct io_context; struct io_uring_task; struct mempolicy; @@ -76,7 +76,6 @@ struct pid_namespace; struct pipe_inode_info; struct rcu_node; struct reclaim_state; -struct robust_list_head; struct root_domain; struct rq; struct sched_attr; @@ -85,6 +84,7 @@ struct seq_file; struct sighand_struct; struct signal_struct; struct task_delay_info; +struct task_exec_state; struct task_group; struct task_struct; struct timespec64; @@ -161,7 +161,7 @@ struct user_event_mm; */ #define is_special_task_state(state) \ ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | \ - TASK_DEAD | TASK_FROZEN)) + TASK_DEAD | TASK_WAKING | TASK_FROZEN)) #ifdef 
CONFIG_DEBUG_ATOMIC_SLEEP # define debug_normal_state_change(state_value) \ @@ -702,6 +702,11 @@ struct sched_dl_entity { * running, skipping the defer phase. * * @dl_defer_idle tracks idle state + * + * @dl_bw_attached tells if this server's bandwidth currently + * contributes to the root domain's total_bw. Only meaningful for server + * entities (@dl_server == 1). Allows toggling the reservation on/off + * without losing the configured @dl_runtime/@dl_period. */ unsigned int dl_throttled : 1; unsigned int dl_yielded : 1; @@ -713,6 +718,7 @@ struct sched_dl_entity { unsigned int dl_defer_armed : 1; unsigned int dl_defer_running : 1; unsigned int dl_defer_idle : 1; + unsigned int dl_bw_attached : 1; /* * Bandwidth enforcement timer. Each -deadline task has its @@ -846,7 +852,11 @@ struct task_struct { struct alloc_tag *alloc_tag; #endif - int on_cpu; + u8 on_cpu; + u8 on_rq; + u8 is_blocked; + u8 __pad; + struct __call_single_node wake_entry; unsigned int wakee_flips; unsigned long wakee_flip_decay_ts; @@ -861,7 +871,6 @@ struct task_struct { */ int recent_used_cpu; int wake_cpu; - int on_rq; int prio; int static_prio; @@ -962,6 +971,8 @@ struct task_struct { struct mm_struct *mm; struct mm_struct *active_mm; + struct task_exec_state __rcu *exec_state; + int exit_state; int exit_code; int exit_signal; @@ -1002,9 +1013,6 @@ struct task_struct { unsigned sched_rt_mutex:1; #endif - /* Save user-dumpable when mm goes away */ - unsigned user_dumpable:1; - /* Bit to tell TOMOYO we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; @@ -1244,6 +1252,13 @@ struct task_struct { struct mutex *blocked_on; /* lock we're blocked on */ raw_spinlock_t blocked_lock; + /* + * The task that is boosting this task; a back link for the current + * donor stack. Set in schedule() -> find_proxy_task() and only stable + * under preempt_disable(). 
+ */ + struct task_struct *blocked_donor; + #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER /* * Encoded lock address causing task block (lower 2 bits = type from @@ -1334,16 +1349,9 @@ struct task_struct { u32 closid; u32 rmid; #endif -#ifdef CONFIG_FUTEX - struct robust_list_head __user *robust_list; -#ifdef CONFIG_COMPAT - struct compat_robust_list_head __user *compat_robust_list; -#endif - struct list_head pi_state_list; - struct futex_pi_state *pi_state_cache; - struct mutex futex_exit_mutex; - unsigned int futex_state; -#endif + + struct futex_sched_data futex; + #ifdef CONFIG_PERF_EVENTS u8 perf_recursion[PERF_NR_CONTEXTS]; struct perf_event_context *perf_event_ctxp; @@ -1411,6 +1419,13 @@ struct task_struct { unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ +#ifdef CONFIG_SCHED_CACHE + struct callback_head cache_work; + int preferred_llc; + /* 1: task was enqueued to its preferred LLC, 0 otherwise */ + int pref_llc_queued; +#endif + struct rseq_data rseq; struct sched_mm_cid mm_cid; @@ -2185,19 +2200,10 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock) __must_hold(lock); #ifndef CONFIG_PREEMPT_RT -/* - * With proxy exec, if a task has been proxy-migrated, it may be a donor - * on a cpu that it can't actually run on. Thus we need a special state - * to denote that the task is being woken, but that it needs to be - * evaluated for return-migration before it is run. So if the task is - * blocked_on PROXY_WAKING, return migrate it before running it. - */ -#define PROXY_WAKING ((struct mutex *)(-1L)) - static inline struct mutex *__get_task_blocked_on(struct task_struct *p) { lockdep_assert_held_once(&p->blocked_lock); - return p->blocked_on == PROXY_WAKING ? 
NULL : p->blocked_on; + return p->blocked_on; } static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m) @@ -2225,7 +2231,7 @@ static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex * * blocked_on relationships, but make sure we are not * clearing the relationship with a different lock. */ - WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m && p->blocked_on != PROXY_WAKING); + WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m); p->blocked_on = NULL; } @@ -2234,35 +2240,6 @@ static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m) guard(raw_spinlock_irqsave)(&p->blocked_lock); __clear_task_blocked_on(p, m); } - -static inline void __set_task_blocked_on_waking(struct task_struct *p, struct mutex *m) -{ - /* Currently we serialize blocked_on under the task::blocked_lock */ - lockdep_assert_held_once(&p->blocked_lock); - - if (!sched_proxy_exec()) { - __clear_task_blocked_on(p, m); - return; - } - - /* Don't set PROXY_WAKING if blocked_on was already cleared */ - if (!p->blocked_on) - return; - /* - * There may be cases where we set PROXY_WAKING on tasks that were - * already set to waking, but make sure we are not changing - * the relationship with a different lock. 
- */ - WARN_ON_ONCE(m && p->blocked_on != m && p->blocked_on != PROXY_WAKING); - p->blocked_on = PROXY_WAKING; -} - -static inline void set_task_blocked_on_waking(struct task_struct *p, struct mutex *m) -{ - guard(raw_spinlock_irqsave)(&p->blocked_lock); - __set_task_blocked_on_waking(p, m); -} - #else static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) { @@ -2271,14 +2248,6 @@ static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mute static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) { } - -static inline void __set_task_blocked_on_waking(struct task_struct *p, struct rt_mutex *m) -{ -} - -static inline void set_task_blocked_on_waking(struct task_struct *p, struct rt_mutex *m) -{ -} #endif /* !CONFIG_PREEMPT_RT */ static __always_inline bool need_resched(void) @@ -2411,6 +2380,29 @@ static __always_inline int task_mm_cid(struct task_struct *t) } #endif +#ifdef CONFIG_SCHED_CACHE + +struct sched_cache_time { + u64 runtime; + unsigned long epoch; +}; + +struct sched_cache_stat { + struct sched_cache_time __percpu *pcpu_sched; + raw_spinlock_t lock; + unsigned long epoch; + u64 nr_running_avg; + unsigned long next_scan; + unsigned long footprint; + int cpu; +} ____cacheline_aligned_in_smp; + +#else + +struct sched_cache_stat { }; + +#endif + #ifndef MODULE #ifndef COMPILE_OFFSETS diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h index 196f0ca351a2..39f0a7f94bfc 100644 --- a/include/linux/sched/clock.h +++ b/include/linux/sched/clock.h @@ -33,6 +33,11 @@ extern u64 sched_clock_cpu(int cpu); extern void sched_clock_init(void); #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline int sched_clock_stable(void) +{ + return 1; +} + static inline void sched_clock_tick(void) { } diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 624fda17a785..20957ccde3b5 100644 --- a/include/linux/sched/coredump.h +++ 
b/include/linux/sched/coredump.h @@ -2,43 +2,18 @@ #ifndef _LINUX_SCHED_COREDUMP_H #define _LINUX_SCHED_COREDUMP_H -#include <linux/mm_types.h> - -#define SUID_DUMP_DISABLE 0 /* No setuid dumping */ -#define SUID_DUMP_USER 1 /* Dump as user of process */ -#define SUID_DUMP_ROOT 2 /* Dump as root */ - -static inline unsigned long __mm_flags_get_dumpable(const struct mm_struct *mm) -{ - /* - * By convention, dumpable bits are contained in first 32 bits of the - * bitmap, so we can simply access this first unsigned long directly. - */ - return __mm_flags_get_word(mm); -} - -static inline void __mm_flags_set_mask_dumpable(struct mm_struct *mm, int value) -{ - __mm_flags_set_mask_bits_word(mm, MMF_DUMPABLE_MASK, value); -} - -extern void set_dumpable(struct mm_struct *mm, int value); /* - * This returns the actual value of the suid_dumpable flag. For things - * that are using this for checking for privilege transitions, it must - * test against SUID_DUMP_USER rather than treating it as a boolean - * value. + * Task dumpability mode. Gates core dump production and ptrace_attach() + * authorization. The numeric values are stable ABI (suid_dumpable + * sysctl, prctl(PR_SET_DUMPABLE)); do not renumber. 
*/ -static inline int __get_dumpable(unsigned long mm_flags) -{ - return mm_flags & MMF_DUMPABLE_MASK; -} - -static inline int get_dumpable(struct mm_struct *mm) -{ - unsigned long flags = __mm_flags_get_dumpable(mm); - - return __get_dumpable(flags); -} +enum task_dumpable { + TASK_DUMPABLE_OFF = 0, /* no dump; ptrace needs CAP_SYS_PTRACE */ + TASK_DUMPABLE_OWNER = 1, /* default; dump and ptrace by uid match */ + TASK_DUMPABLE_ROOT = 2, /* dump as root; ptrace needs CAP_SYS_PTRACE */ +}; + +void task_exec_state_set_dumpable(enum task_dumpable value); +enum task_dumpable task_exec_state_get_dumpable(struct task_struct *task); #endif /* _LINUX_SCHED_COREDUMP_H */ diff --git a/include/linux/sched/exec_state.h b/include/linux/sched/exec_state.h new file mode 100644 index 000000000000..9b61782510b8 --- /dev/null +++ b/include/linux/sched/exec_state.h @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Christian Brauner <brauner@kernel.org> */ +#ifndef _LINUX_SCHED_EXEC_STATE_H +#define _LINUX_SCHED_EXEC_STATE_H + +#include <linux/init.h> +#include <linux/rcupdate.h> +#include <linux/refcount.h> +#include <linux/sched/coredump.h> +#include <linux/user_namespace.h> + +struct task_exec_state { + refcount_t count; + enum task_dumpable dumpable; + struct user_namespace *user_ns; + struct rcu_head rcu; +}; + +extern struct task_exec_state init_task_exec_state; + +struct task_exec_state *alloc_task_exec_state(struct user_namespace *user_ns); +void put_task_exec_state(struct task_exec_state *exec_state); +struct task_exec_state *task_exec_state_rcu(const struct task_struct *tsk); +struct task_exec_state *task_exec_state_replace(struct task_struct *tsk, + struct task_exec_state *exec_state); +int task_exec_state_copy(struct task_struct *tsk); +void __init exec_state_init(void); + +DEFINE_FREE(put_task_exec_state, struct task_exec_state *, put_task_exec_state(_T)) + +#endif /* _LINUX_SCHED_EXEC_STATE_H */ diff --git a/include/linux/sched/smt.h 
b/include/linux/sched/smt.h index 166b19af956f..cde6679c0278 100644 --- a/include/linux/sched/smt.h +++ b/include/linux/sched/smt.h @@ -4,16 +4,12 @@ #include <linux/static_key.h> -#ifdef CONFIG_SCHED_SMT extern struct static_key_false sched_smt_present; static __always_inline bool sched_smt_active(void) { return static_branch_likely(&sched_smt_present); } -#else -static __always_inline bool sched_smt_active(void) { return false; } -#endif void arch_smt_update(void); diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 36553e14866d..b5d9d7c2b8ad 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -67,7 +67,25 @@ struct sched_domain_shared { atomic_t ref; atomic_t nr_busy_cpus; int has_idle_cores; - int nr_idle_scan; + union { + int nr_idle_scan; + /* + * Used during allocation to claim the sched_domain_shared + * object at multiple levels. + * + * Note: between build and the first periodic LB tick, which + * rewrites the union via update_idle_cpu_scan(), readers of + * nr_idle_scan may observe the transient SD_* flag value as + * the scan bound. The flag bits are small positive integers, + * so the effect is just a slightly relaxed scan bound for one + * window and self-heals on the first tick. 
+ */ + int alloc_flags; + }; +#ifdef CONFIG_SCHED_CACHE + unsigned long util_avg; + unsigned long capacity; +#endif }; struct sched_domain { @@ -99,6 +117,12 @@ struct sched_domain { u64 max_newidle_lb_cost; unsigned long last_decay_max_lb_cost; +#ifdef CONFIG_SCHED_CACHE + unsigned int llc_max; + unsigned int *llc_counts __counted_by_ptr(llc_max); + unsigned long llc_bytes; +#endif + #ifdef CONFIG_SCHEDSTATS /* sched_balance_rq() stats */ unsigned int lb_count[CPU_MAX_IDLE_TYPES]; @@ -256,4 +280,10 @@ static inline int task_node(const struct task_struct *p) return cpu_to_node(task_cpu(p)); } +#ifdef CONFIG_SCHED_CACHE +extern void sched_update_llc_bytes(unsigned int cpu); +#else +static inline void sched_update_llc_bytes(unsigned int cpu) { } +#endif + #endif /* _LINUX_SCHED_TOPOLOGY_H */ diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 5a40252b8334..f865491c4f2c 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -1259,14 +1259,15 @@ static __always_inline void __scoped_seqlock_cleanup(struct ss_tmp *sst) extern void __scoped_seqlock_invalid_target(void); -#if (defined(CONFIG_CC_IS_GCC) && CONFIG_GCC_VERSION < 90000) || defined(CONFIG_KASAN) +#if (defined(CONFIG_CC_IS_GCC) && CONFIG_GCC_VERSION < 90000) || \ + defined(CONFIG_KASAN) || defined(CONFIG_UBSAN_ALIGNMENT) /* * For some reason some GCC-8 architectures (nios2, alpha) have trouble * determining that the ss_done state is impossible in __scoped_seqlock_next() * below. * - * Similarly KASAN is known to confuse compilers enough to break this. But we - * don't care about code quality for KASAN builds anyway. + * Similarly KASAN and UBSAN_ALIGNMENT are known to confuse compilers enough + * to break this. But we don't care about code quality for such builds anyway. 
*/ static inline void __scoped_seqlock_bug(void) { } #else diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 93a0ba872ebe..69b0177da156 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -48,7 +48,7 @@ struct shmem_inode_info { }; struct timespec64 i_crtime; /* file creation time */ struct shared_policy policy; /* NUMA memory alloc policy */ - struct simple_xattrs *xattrs; /* list of xattrs */ + struct list_head xattrs; /* list of xattrs */ pgoff_t fallocend; /* highest fallocate endindex */ unsigned int fsflags; /* for FS_IOC_[SG]ETFLAGS */ atomic_t stop_eviction; /* hold when working on inode */ @@ -89,6 +89,7 @@ struct shmem_sb_info { struct list_head shrinklist; /* List of shinkable inodes */ unsigned long shrinklist_len; /* Length of shrinklist */ struct shmem_quota_limits qlimits; /* Default quota limits */ + struct simple_xattr_cache xa_cache; }; static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index 3e6c8e9d67ae..9c2429c1a570 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -87,24 +87,10 @@ static inline int copy_safe_from_sockptr(void *dst, size_t ksize, static inline int copy_struct_from_sockptr(void *dst, size_t ksize, sockptr_t src, size_t usize) { - size_t size = min(ksize, usize); - size_t rest = max(ksize, usize) - size; - if (!sockptr_is_kernel(src)) - return copy_struct_from_user(dst, ksize, src.user, size); - - if (usize < ksize) { - memset(dst + size, 0, rest); - } else if (usize > ksize) { - char *p = src.kernel; + return copy_struct_from_user(dst, ksize, src.user, usize); - while (rest--) { - if (*p++) - return -E2BIG; - } - } - memcpy(dst, src.kernel, size); - return 0; + return copy_struct_from_bounce_buffer(dst, ksize, src.kernel, usize); } static inline int copy_to_sockptr_offset(sockptr_t dst, size_t offset, @@ -121,6 +107,16 @@ static inline int copy_to_sockptr(sockptr_t dst, const void *src, 
size_t size) return copy_to_sockptr_offset(dst, 0, src, size); } +static inline int +copy_struct_to_sockptr(sockptr_t dst, size_t usize, const void *src, + size_t ksize, bool *ignored_trailing) +{ + if (!sockptr_is_kernel(dst)) + return copy_struct_to_user(dst.user, usize, src, ksize, ignored_trailing); + + return copy_struct_to_bounce_buffer(dst.kernel, usize, src, ksize, ignored_trailing); +} + static inline void *memdup_sockptr_noprof(sockptr_t src, size_t len) { void *p = kmalloc_track_caller_noprof(len, GFP_USER | __GFP_NOWARN); diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 81b1938512d5..a54ce9e808b9 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -397,7 +397,7 @@ static inline struct srcu_ctr __percpu *srcu_read_lock_fast_notrace(struct srcu_ * * The same srcu_struct may be used concurrently by srcu_down_read_fast() * and srcu_read_lock_fast(). However, the same definition/initialization - * requirements called out for srcu_read_lock_safe() apply. + * requirements called out for srcu_read_lock_fast_updown() apply. 
*/ static inline struct srcu_ctr __percpu *srcu_down_read_fast(struct srcu_struct *ssp) __acquires_shared(ssp) { diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 0ddc77aeeca2..083b4f533933 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -125,7 +125,6 @@ struct thermal_cooling_device { const char *type; unsigned long max_state; struct device device; - struct device_node *np; void *devdata; void *stats; const struct thermal_cooling_device_ops *ops; @@ -133,6 +132,10 @@ struct thermal_cooling_device { struct mutex lock; /* protect thermal_instances list */ struct list_head thermal_instances; struct list_head node; +#ifdef CONFIG_THERMAL_OF + struct device_node *np; + u32 cdev_id; +#endif #ifdef CONFIG_THERMAL_DEBUGFS struct thermal_debugfs *debugfs; #endif @@ -198,6 +201,21 @@ struct thermal_zone_device *devm_thermal_of_zone_register(struct device *dev, in void devm_thermal_of_zone_unregister(struct device *dev, struct thermal_zone_device *tz); +struct thermal_cooling_device * +thermal_of_cooling_device_register(struct device_node *np, u32 cdev_id, + const char *type, void *data, + const struct thermal_cooling_device_ops *ops); + +struct thermal_cooling_device * +devm_thermal_of_cooling_device_register(struct device *dev, u32 cdev_id, + const char *type, void *devdata, + const struct thermal_cooling_device_ops *ops); + +struct thermal_cooling_device * +devm_thermal_of_child_cooling_device_register(struct device *dev, + struct device_node *np, + const char *type, void *devdata, + const struct thermal_cooling_device_ops *ops); #else static inline @@ -211,6 +229,31 @@ static inline void devm_thermal_of_zone_unregister(struct device *dev, struct thermal_zone_device *tz) { } + +static inline struct thermal_cooling_device * +thermal_of_cooling_device_register(struct device_node *np, u32 cdev_id, + const char *type, void *devdata, + const struct thermal_cooling_device_ops *ops) +{ + return ERR_PTR(-ENODEV); +} + +static inline struct 
thermal_cooling_device * +devm_thermal_of_cooling_device_register(struct device *dev, u32 cdev_id, + const char *type, void *devdata, + const struct thermal_cooling_device_ops *ops) +{ + return ERR_PTR(-ENODEV); +} + +static inline struct thermal_cooling_device * +devm_thermal_of_child_cooling_device_register(struct device *dev, + struct device_node *np, + const char *type, void *devdata, + const struct thermal_cooling_device_ops *ops) +{ + return ERR_PTR(-ENODEV); +} #endif int for_each_thermal_trip(struct thermal_zone_device *tz, @@ -252,14 +295,11 @@ void thermal_zone_device_update(struct thermal_zone_device *, struct thermal_cooling_device *thermal_cooling_device_register(const char *, void *, const struct thermal_cooling_device_ops *); + struct thermal_cooling_device * -thermal_of_cooling_device_register(struct device_node *np, const char *, void *, - const struct thermal_cooling_device_ops *); -struct thermal_cooling_device * -devm_thermal_of_cooling_device_register(struct device *dev, - struct device_node *np, - const char *type, void *devdata, - const struct thermal_cooling_device_ops *ops); +devm_thermal_cooling_device_register(struct device *dev, const char *type, void *devdata, + const struct thermal_cooling_device_ops *ops); + void thermal_cooling_device_update(struct thermal_cooling_device *); void thermal_cooling_device_unregister(struct thermal_cooling_device *); struct thermal_zone_device *thermal_zone_get_zone_by_name(const char *name); @@ -304,19 +344,12 @@ static inline struct thermal_cooling_device * thermal_cooling_device_register(const char *type, void *devdata, const struct thermal_cooling_device_ops *ops) { return ERR_PTR(-ENODEV); } + static inline struct thermal_cooling_device * -thermal_of_cooling_device_register(struct device_node *np, - const char *type, void *devdata, - const struct thermal_cooling_device_ops *ops) +devm_thermal_cooling_device_register(struct device *dev, const char *type, void *devdata, + const struct 
thermal_cooling_device_ops *ops) { return ERR_PTR(-ENODEV); } -static inline struct thermal_cooling_device * -devm_thermal_of_cooling_device_register(struct device *dev, - struct device_node *np, - const char *type, void *devdata, - const struct thermal_cooling_device_ops *ops) -{ - return ERR_PTR(-ENODEV); -} + static inline void thermal_cooling_device_unregister( struct thermal_cooling_device *cdev) { } diff --git a/include/linux/tick.h b/include/linux/tick.h index 738007d6f577..1cf4651f09ad 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -139,8 +139,6 @@ extern bool tick_nohz_idle_got_tick(void); extern ktime_t tick_nohz_get_next_hrtimer(void); extern ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next); extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu); -extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); -extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); #else /* !CONFIG_NO_HZ_COMMON */ #define tick_nohz_enabled (0) static inline bool tick_nohz_is_active(void) { return false; } @@ -162,8 +160,6 @@ static inline ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) *delta_next = TICK_NSEC; return *delta_next; } -static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } -static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } #endif /* !CONFIG_NO_HZ_COMMON */ /* diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index e36d11e33e0c..4486dfd5d0de 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -190,7 +190,7 @@ struct timekeeper { s32 tai_offset; }; -#ifdef CONFIG_GENERIC_TIME_VSYSCALL +#ifdef CONFIG_GENERIC_GETTIMEOFDAY extern void update_vsyscall(struct timekeeper *tk); extern void update_vsyscall_tz(void); diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index aee2c1a46e47..984a866d293b 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h 
@@ -276,37 +276,30 @@ static inline bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *kt) { ret #endif /** - * struct system_time_snapshot - simultaneous raw/real time capture with - * counter value - * @cycles: Clocksource counter value to produce the system times - * @real: Realtime system time - * @boot: Boot time - * @raw: Monotonic raw system time - * @cs_id: Clocksource ID + * struct system_time_snapshot - Simultaneous time capture of CLOCK_MONOTONIC_RAW, + * a selected CLOCK_* and the clocksource counter value + * @cycles: Clocksource counter value to produce the system times + * @hw_cycles: For derived clocksources, the hardware counter value from + * which @cycles was derived + * @systime: The system time of the selected CLOCK ID + * @monoraw: Monotonic raw system time + * @cs_id: Clocksource ID + * @hw_csid: Clocksource ID of the underlying hardware counter for derived + * clocksources which implement the read_snapshot() callback. * @clock_was_set_seq: The sequence number of clock-was-set events * @cs_was_changed_seq: The sequence number of clocksource change events + * @valid: True if the snapshot is valid */ struct system_time_snapshot { u64 cycles; - ktime_t real; - ktime_t boot; - ktime_t raw; + u64 hw_cycles; + ktime_t systime; + ktime_t monoraw; enum clocksource_ids cs_id; + enum clocksource_ids hw_csid; unsigned int clock_was_set_seq; u8 cs_was_changed_seq; -}; - -/** - * struct system_device_crosststamp - system/device cross-timestamp - * (synchronized capture) - * @device: Device time - * @sys_realtime: Realtime simultaneous with device time - * @sys_monoraw: Monotonic raw simultaneous with device time - */ -struct system_device_crosststamp { - ktime_t device; - ktime_t sys_realtime; - ktime_t sys_monoraw; + u8 valid; }; /** @@ -325,6 +318,23 @@ struct system_counterval_t { bool use_nsecs; }; +/** + * struct system_device_crosststamp - system/device cross-timestamp + * (synchronized capture) + * @clock_id: System time Clock ID to capture + 
* @device: Device time + * @sys_counter: Clocksource counter value simultaneous with device time + * @sys_systime: System time for @clock_id + * @sys_monoraw: Monotonic raw simultaneous with device time + */ +struct system_device_crosststamp { + clockid_t clock_id; + ktime_t device; + struct system_counterval_t sys_counter; + ktime_t sys_systime; + ktime_t sys_monoraw; +}; + extern bool ktime_real_to_base_clock(ktime_t treal, enum clocksource_ids base_id, u64 *cycles); extern bool timekeeping_clocksource_has_base(enum clocksource_ids id); @@ -341,9 +351,10 @@ extern int get_device_system_crosststamp( struct system_device_crosststamp *xtstamp); /* - * Simultaneously snapshot realtime and monotonic raw clocks + * Simultaneously snapshot a given clock with MONOTONIC_RAW and the underlying + * clocksource counter value. */ -extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot); +extern void ktime_get_snapshot_id(clockid_t clock_id, struct system_time_snapshot *systime_snapshot); /* * Persistent clock related interfaces diff --git a/include/linux/topology.h b/include/linux/topology.h index 6575af39fd10..709a2dcf4c73 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -230,11 +230,24 @@ static inline int cpu_to_mem(int cpu) #define topology_drawer_cpumask(cpu) cpumask_of(cpu) #endif -#if defined(CONFIG_SCHED_SMT) && !defined(cpu_smt_mask) +/* + * Defining cpu_smt_mask as cpumask_of that CPU helps to get + * rid of lot of ifdeffery all around the codebase in case of + * CONFIG_SCHED_SMT=n. It just means there are no other siblings, which + * is what is expected. 
+ */ +#if defined(CONFIG_SCHED_SMT) +# if !defined(cpu_smt_mask) static inline const struct cpumask *cpu_smt_mask(int cpu) { return topology_sibling_cpumask(cpu); } +# endif +#else /* !CONFIG_SCHED_SMT */ +static inline const struct cpumask *cpu_smt_mask(int cpu) +{ + return cpumask_of(cpu); +} #endif #ifndef topology_is_primary_thread diff --git a/include/linux/torture.h b/include/linux/torture.h index 1b59056c3b18..c9b47d138302 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -129,6 +129,7 @@ void _torture_stop_kthread(char *m, struct task_struct **tp); #else #define torture_preempt_schedule() do { } while (0) #endif +void torture_sched_set_normal(struct task_struct *t, int nice); #if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) || IS_ENABLED(CONFIG_LOCK_TORTURE_TEST) || IS_MODULE(CONFIG_LOCK_TORTURE_TEST) long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask, bool dowarn); diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 56328601218c..8a264662b242 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -510,7 +510,7 @@ copy_struct_to_user(void __user *dst, size_t usize, const void *src, return -EFAULT; } if (ignored_trailing) - *ignored_trailing = ksize < usize && + *ignored_trailing = usize < ksize && memchr_inv(src + size, 0, rest) != NULL; /* Copy the interoperable parts of the struct. */ if (copy_to_user(dst, src, size)) @@ -518,6 +518,69 @@ copy_struct_to_user(void __user *dst, size_t usize, const void *src, return 0; } +static __always_inline void +__copy_struct_generic_bounce_buffer(void *dst, size_t dstsize, + const void *src, size_t srcsize, + bool *ignored_trailing) +{ + size_t size = min(dstsize, srcsize); + size_t rest = max(dstsize, srcsize) - size; + + /* Deal with trailing bytes. 
*/ + if (dstsize > srcsize) + memset(dst + size, 0, rest); + if (ignored_trailing) + *ignored_trailing = dstsize < srcsize && + memchr_inv(src + size, 0, rest) != NULL; + /* Copy the interoperable parts of the struct. */ + memcpy(dst, src, size); +} + +/** + * This is like copy_struct_from_user(), but the + * src buffer was already copied into a kernel + * bounce buffer, so it will never return -EFAULT. + */ +static __always_inline __must_check int +copy_struct_from_bounce_buffer(void *dst, size_t dstsize, + const void *src, size_t srcsize) +{ + bool ignored_trailing; + + /* Double check if ksize is larger than a known object size. */ + if (WARN_ON_ONCE(dstsize > __builtin_object_size(dst, 1))) + return -E2BIG; + + __copy_struct_generic_bounce_buffer(dst, dstsize, + src, srcsize, + &ignored_trailing); + if (unlikely(ignored_trailing)) + return -E2BIG; + + return 0; +} + +/** + * This is like copy_struct_to_user(), but the + * dst buffer is a kernel bounce buffer instead + * of a direct userspace buffer, so it will never return -EFAULT. + */ +static __always_inline __must_check int +copy_struct_to_bounce_buffer(void *dst, size_t dstsize, + const void *src, + size_t srcsize, + bool *ignored_trailing) +{ + /* Double check if srcsize is larger than a known object size. 
*/ + if (WARN_ON_ONCE(srcsize > __builtin_object_size(src, 1))) + return -E2BIG; + + __copy_struct_generic_bounce_buffer(dst, dstsize, + src, srcsize, + ignored_trailing); + return 0; +} + bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size); long copy_from_kernel_nofault(void *dst, const void *src, size_t size); @@ -649,6 +712,17 @@ static inline void user_access_restore(unsigned long flags) { } #define user_read_access_end user_access_end #endif +#ifndef unsafe_atomic_store_release_user +# define unsafe_atomic_store_release_user(val, uptr, elbl) \ + do { \ + if (!IS_ENABLED(CONFIG_ARCH_MEMORY_ORDER_TSO)) \ + smp_mb(); \ + else \ + barrier(); \ + unsafe_put_user(val, uptr, elbl); \ + } while (0) +#endif + /* Define RW variant so the below _mode macro expansion works */ #define masked_user_rw_access_begin(u) masked_user_access_begin(u) #define user_rw_access_begin(u, s) user_access_begin(u, s) diff --git a/include/linux/vdso_datastore.h b/include/linux/vdso_datastore.h index 0b530428db71..3dfba9502d78 100644 --- a/include/linux/vdso_datastore.h +++ b/include/linux/vdso_datastore.h @@ -2,12 +2,12 @@ #ifndef _LINUX_VDSO_DATASTORE_H #define _LINUX_VDSO_DATASTORE_H -#ifdef CONFIG_HAVE_GENERIC_VDSO #include <linux/mm_types.h> extern const struct vm_special_mapping vdso_vvar_mapping; struct vm_area_struct *vdso_install_vvar_mapping(struct mm_struct *mm, unsigned long addr); +#ifdef CONFIG_HAVE_GENERIC_VDSO void __init vdso_setup_data_pages(void); #else /* !CONFIG_HAVE_GENERIC_VDSO */ static inline void vdso_setup_data_pages(void) { } diff --git a/include/linux/vtime.h b/include/linux/vtime.h index 29dd5b91dd7d..9dc25b04a119 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -10,7 +10,6 @@ */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING extern void vtime_account_kernel(struct task_struct *tsk); -extern void vtime_account_idle(struct task_struct *tsk); #endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN @@ -27,16 +26,33 
@@ static inline void vtime_guest_exit(struct task_struct *tsk) { } static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { } #endif +static inline bool vtime_generic_enabled_cpu(int cpu) +{ + return context_tracking_enabled_cpu(cpu); +} + +static inline bool vtime_generic_enabled_this_cpu(void) +{ + return context_tracking_enabled_this_cpu(); +} + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +extern void vtime_account_idle(struct task_struct *tsk); extern void vtime_account_irq(struct task_struct *tsk, unsigned int offset); extern void vtime_account_softirq(struct task_struct *tsk); extern void vtime_account_hardirq(struct task_struct *tsk); extern void vtime_flush(struct task_struct *tsk); +extern void vtime_reset(void); +extern void vtime_dyntick_start(void); +extern void vtime_dyntick_stop(void); #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ static inline void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { } static inline void vtime_account_softirq(struct task_struct *tsk) { } static inline void vtime_account_hardirq(struct task_struct *tsk) { } static inline void vtime_flush(struct task_struct *tsk) { } +static inline void vtime_reset(void) { } +static inline void vtime_dyntick_start(void) { } +static inline void vtime_dyntick_stop(void) { } #endif /* @@ -74,12 +90,12 @@ static inline bool vtime_accounting_enabled(void) static inline bool vtime_accounting_enabled_cpu(int cpu) { - return context_tracking_enabled_cpu(cpu); + return vtime_generic_enabled_cpu(cpu); } static inline bool vtime_accounting_enabled_this_cpu(void) { - return context_tracking_enabled_this_cpu(); + return vtime_generic_enabled_this_cpu(); } extern void vtime_task_switch_generic(struct task_struct *prev); diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 8b6601367eae..54ac3cbc133f 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -106,12 +106,14 @@ static inline const char *xattr_prefix(const struct xattr_handler *handler) 
return handler->prefix ?: handler->name; } -struct simple_xattrs { - struct rhashtable ht; +struct simple_xattr_cache { + struct rhashtable *ht; }; struct simple_xattr { struct rhash_head hash_node; + struct list_head *parent; + struct list_head node; struct rcu_head rcu; char *name; size_t size; @@ -132,40 +134,39 @@ static inline void simple_xattr_limits_init(struct simple_xattr_limits *limits) atomic_set(&limits->xattr_size, 0); } -int simple_xattrs_init(struct simple_xattrs *xattrs); -struct simple_xattrs *simple_xattrs_alloc(void); -struct simple_xattrs *simple_xattrs_lazy_alloc(struct simple_xattrs **xattrsp, - const void *value, int flags); -void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space); +void simple_xattrs_free(struct simple_xattr_cache *cache, struct list_head *xattrs, + size_t *freed_space); size_t simple_xattr_space(const char *name, size_t size); struct simple_xattr *simple_xattr_alloc(const void *value, size_t size); void simple_xattr_free(struct simple_xattr *xattr); void simple_xattr_free_rcu(struct simple_xattr *xattr); -int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, - void *buffer, size_t size); -struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, +int simple_xattr_get(struct simple_xattr_cache *cache, struct list_head *xattrs, + const char *name, void *buffer, size_t size); +struct simple_xattr *simple_xattr_set(struct simple_xattr_cache *cache, + struct list_head *xattrs, const char *name, const void *value, size_t size, int flags); -int simple_xattr_set_limited(struct simple_xattrs *xattrs, +int simple_xattr_set_limited(struct simple_xattr_cache *cache, + struct list_head *xattrs, struct simple_xattr_limits *limits, const char *name, const void *value, size_t size, int flags); -ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, +ssize_t simple_xattr_list(struct inode *inode, struct list_head *xattrs, char *buffer, size_t size); -int 
simple_xattr_add(struct simple_xattrs *xattrs, +int simple_xattr_add(struct simple_xattr_cache *cache, struct list_head *xattrs, struct simple_xattr *new_xattr); +int simple_xattr_add_limited(struct simple_xattr_cache *cache, + struct list_head *xattrs, + struct simple_xattr_limits *limits, + struct simple_xattr *new_xattr); int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name); +void simple_xattr_cache_cleanup(struct simple_xattr_cache *cache); + DEFINE_CLASS(simple_xattr, struct simple_xattr *, if (!IS_ERR_OR_NULL(_T)) simple_xattr_free(_T), simple_xattr_alloc(value, size), const void *value, size_t size) -DEFINE_CLASS(simple_xattrs, - struct simple_xattrs *, - if (!IS_ERR_OR_NULL(_T)) { simple_xattrs_free(_T, NULL); kfree(_T); }, - simple_xattrs_alloc(), - void) - #endif /* _LINUX_XATTR_H */ |
