diff options
Diffstat (limited to 'include')
154 files changed, 3017 insertions, 1212 deletions
diff --git a/include/acpi/processor.h b/include/acpi/processor.h index a17e97e634a6..d0eccbd920e5 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -280,6 +280,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, struct acpi_processor_cx *cx, struct acpi_power_register *reg); void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cstate); +void __noreturn acpi_processor_ffh_play_dead(struct acpi_processor_cx *cx); #else static inline void acpi_processor_power_init_bm_check(struct acpi_processor_flags @@ -300,6 +301,10 @@ static inline void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx { return; } +static inline void __noreturn acpi_processor_ffh_play_dead(struct acpi_processor_cx *cx) +{ + BUG(); +} #endif static inline int call_on_cpu(int cpu, long (*fn)(void *), void *arg, diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h index a5cbbf3e26ec..3c61c29ff6ab 100644 --- a/include/asm-generic/io.h +++ b/include/asm-generic/io.h @@ -1212,7 +1212,7 @@ static inline void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) #ifndef memset_io /** - * memset_io Set a range of I/O memory to a constant value + * memset_io - Set a range of I/O memory to a constant value * @addr: The beginning of the I/O-memory range to set * @val: The value to set the memory to * @count: The number of bytes to set @@ -1224,7 +1224,7 @@ void memset_io(volatile void __iomem *addr, int val, size_t count); #ifndef memcpy_fromio /** - * memcpy_fromio Copy a block of data from I/O memory + * memcpy_fromio - Copy a block of data from I/O memory * @dst: The (RAM) destination for the copy * @src: The (I/O memory) source for the data * @count: The number of bytes to copy @@ -1236,7 +1236,7 @@ void memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count); #ifndef memcpy_toio /** - * memcpy_toio Copy a block of data into I/O memory + * memcpy_toio - Copy a block of data into I/O memory * @dst: The (I/O memory) destination for the copy * @src: The (RAM) source for the data * @count: The number of bytes to copy diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index a7bbe504e4f3..ccccb1cbf7df 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -28,9 +28,15 @@ #define VTPM_BASE_ADDRESS 0xfed40000 +enum hv_partition_type { + HV_PARTITION_TYPE_GUEST, + HV_PARTITION_TYPE_ROOT, +}; + struct ms_hyperv_info { u32 features; u32 priv_high; + u32 ext_features; u32 misc_features; u32 hints; u32 nested_features; @@ -58,15 +64,32 @@ struct ms_hyperv_info { }; extern struct ms_hyperv_info ms_hyperv; extern bool hv_nested; +extern u64 hv_current_partition_id; +extern enum hv_partition_type hv_curr_partition_type; extern void * __percpu *hyperv_pcpu_input_arg; extern void * __percpu *hyperv_pcpu_output_arg; -extern u64 hv_do_hypercall(u64 control, void *inputaddr, void *outputaddr); -extern u64 hv_do_fast_hypercall8(u16 control, u64 input8); +u64 hv_do_hypercall(u64 control, void *inputaddr, void *outputaddr); +u64 hv_do_fast_hypercall8(u16 control, u64 input8); +u64 hv_do_fast_hypercall16(u16 control, u64 input1, u64 input2); + bool hv_isolation_type_snp(void); bool hv_isolation_type_tdx(void); +/* + * On architectures where Hyper-V doesn't support AEOI (e.g., ARM64), + * it doesn't provide a recommendation flag and AEOI must be disabled. + */ +static inline bool hv_recommend_using_aeoi(void) +{ +#ifdef HV_DEPRECATING_AEOI_RECOMMENDED + return !(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED); +#else + return false; +#endif +} + static inline struct hv_proximity_domain_info hv_numa_node_to_pxm_info(int node) { struct hv_proximity_domain_info pxm_info = {}; @@ -185,12 +208,11 @@ void hv_setup_kexec_handler(void (*handler)(void)); void hv_remove_kexec_handler(void); void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs)); void hv_remove_crash_handler(void); +void hv_setup_mshv_handler(void (*handler)(void)); extern int vmbus_interrupt; extern int vmbus_irq; -extern bool hv_root_partition; - #if IS_ENABLED(CONFIG_HYPERV) /* * Hypervisor's notion of virtual processor ID is different from @@ -207,10 +229,12 @@ extern u64 (*hv_read_reference_counter)(void); #define VP_INVAL U32_MAX int __init hv_common_init(void); +void __init hv_get_partition_id(void); void __init hv_common_free(void); void __init ms_hyperv_late_init(void); int hv_common_cpu_init(unsigned int cpu); int hv_common_cpu_die(unsigned int cpu); +void hv_identify_partition_type(void); void *hv_alloc_hyperv_page(void); void *hv_alloc_hyperv_zeroed_page(void); @@ -291,6 +315,20 @@ static inline int cpumask_to_vpset_skip(struct hv_vpset *vpset, return __cpumask_to_vpset(vpset, cpus, func); } +#define _hv_status_fmt(fmt) "%s: Hyper-V status: %#x = %s: " fmt +#define hv_status_printk(level, status, fmt, ...) \ +do { \ + u64 __status = (status); \ + pr_##level(_hv_status_fmt(fmt), __func__, hv_result(__status), \ + hv_result_to_string(__status), ##__VA_ARGS__); \ +} while (0) +#define hv_status_err(status, fmt, ...) \ + hv_status_printk(err, status, fmt, ##__VA_ARGS__) +#define hv_status_debug(status, fmt, ...) \ + hv_status_printk(debug, status, fmt, ##__VA_ARGS__) + +const char *hv_result_to_string(u64 hv_status); +int hv_result_to_errno(u64 status); void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die); bool hv_is_hyperv_initialized(void); bool hv_is_hibernation_supported(void); @@ -303,6 +341,7 @@ void hyperv_cleanup(void); bool hv_query_ext_cap(u64 cap_query); void hv_setup_dma_ops(struct device *dev, bool coherent); #else /* CONFIG_HYPERV */ +static inline void hv_identify_partition_type(void) {} static inline bool hv_is_hyperv_initialized(void) { return false; } static inline bool hv_is_hibernation_supported(void) { return false; } static inline void hyperv_cleanup(void) {} @@ -314,4 +353,29 @@ static inline enum hv_isolation_type hv_get_isolation_type(void) } #endif /* CONFIG_HYPERV */ +#if IS_ENABLED(CONFIG_MSHV_ROOT) +static inline bool hv_root_partition(void) +{ + return hv_curr_partition_type == HV_PARTITION_TYPE_ROOT; +} +int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages); +int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id); +int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags); + +#else /* CONFIG_MSHV_ROOT */ +static inline bool hv_root_partition(void) { return false; } +static inline int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages) +{ + return -EOPNOTSUPP; +} +static inline int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id) +{ + return -EOPNOTSUPP; +} +static inline int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_MSHV_ROOT */ + #endif diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index c768de6f19a9..0755bc39b0d8 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -39,7 +39,7 @@ extern char __init_begin[], __init_end[]; extern char _sinittext[], _einittext[]; extern char __start_ro_after_init[], __end_ro_after_init[]; extern char _end[]; -extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[]; +extern char __per_cpu_start[], __per_cpu_end[]; extern char __kprobes_text_start[], __kprobes_text_end[]; extern char __entry_text_start[], __entry_text_end[]; extern char __start_rodata[], __end_rodata[]; diff --git a/include/asm-generic/vdso/vsyscall.h b/include/asm-generic/vdso/vsyscall.h index 01dafd604188..b550afa15ecd 100644 --- a/include/asm-generic/vdso/vsyscall.h +++ b/include/asm-generic/vdso/vsyscall.h @@ -4,24 +4,35 @@ #ifndef __ASSEMBLY__ -#ifndef __arch_get_k_vdso_data -static __always_inline struct vdso_data *__arch_get_k_vdso_data(void) +#ifdef CONFIG_GENERIC_VDSO_DATA_STORE + +#ifndef __arch_get_vdso_u_time_data +static __always_inline const struct vdso_time_data *__arch_get_vdso_u_time_data(void) { - return NULL; + return &vdso_u_time_data; } -#endif /* __arch_get_k_vdso_data */ +#endif + +#ifndef __arch_get_vdso_u_rng_data +static __always_inline const struct vdso_rng_data *__arch_get_vdso_u_rng_data(void) +{ + return &vdso_u_rng_data; +} +#endif + +#endif /* CONFIG_GENERIC_VDSO_DATA_STORE */ #ifndef __arch_update_vsyscall -static __always_inline void __arch_update_vsyscall(struct vdso_data *vdata) +static __always_inline void __arch_update_vsyscall(struct vdso_time_data *vdata) { } #endif /* __arch_update_vsyscall */ -#ifndef __arch_sync_vdso_data -static __always_inline void __arch_sync_vdso_data(struct vdso_data *vdata) +#ifndef __arch_sync_vdso_time_data +static __always_inline void __arch_sync_vdso_time_data(struct vdso_time_data *vdata) { } -#endif /* __arch_sync_vdso_data */ +#endif /* __arch_sync_vdso_time_data */ #endif /* !__ASSEMBLY__ */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 0d5b186abee8..58a635a6d5bd 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -385,6 +385,11 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) . = ALIGN(PAGE_SIZE); \ __nosave_end = .; +#define CACHE_HOT_DATA(align) \ + . = ALIGN(align); \ + *(SORT_BY_ALIGNMENT(.data..hot.*)) \ + . = ALIGN(align); + #define PAGE_ALIGNED_DATA(page_align) \ . = ALIGN(page_align); \ *(.data..page_aligned) \ @@ -404,7 +409,6 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) __start_init_stack = .; \ init_thread_union = .; \ init_stack = .; \ - KEEP(*(.data..init_task)) \ KEEP(*(.data..init_thread_info)) \ . = __start_init_stack + THREAD_SIZE; \ __end_init_stack = .; @@ -1062,10 +1066,13 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) */ #define PERCPU_INPUT(cacheline) \ __per_cpu_start = .; \ - *(.data..percpu..first) \ . = ALIGN(PAGE_SIZE); \ *(.data..percpu..page_aligned) \ . = ALIGN(cacheline); \ + __per_cpu_hot_start = .; \ + *(SORT_BY_ALIGNMENT(.data..percpu..hot.*)) \ + __per_cpu_hot_end = .; \ + . = ALIGN(cacheline); \ *(.data..percpu..read_mostly) \ . = ALIGN(cacheline); \ *(.data..percpu) \ @@ -1074,52 +1081,17 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) __per_cpu_end = .; /** - * PERCPU_VADDR - define output section for percpu area + * PERCPU_SECTION - define output section for percpu area * @cacheline: cacheline size - * @vaddr: explicit base address (optional) - * @phdr: destination PHDR (optional) * * Macro which expands to output section for percpu area. * * @cacheline is used to align subsections to avoid false cacheline * sharing between subsections for different purposes. - * - * If @vaddr is not blank, it specifies explicit base address and all - * percpu symbols will be offset from the given address. If blank, - * @vaddr always equals @laddr + LOAD_OFFSET. - * - * @phdr defines the output PHDR to use if not blank. Be warned that - * output PHDR is sticky. If @phdr is specified, the next output - * section in the linker script will go there too. @phdr should have - * a leading colon. - * - * Note that this macros defines __per_cpu_load as an absolute symbol. - * If there is no need to put the percpu section at a predetermined - * address, use PERCPU_SECTION. - */ -#define PERCPU_VADDR(cacheline, vaddr, phdr) \ - __per_cpu_load = .; \ - .data..percpu vaddr : AT(__per_cpu_load - LOAD_OFFSET) { \ - PERCPU_INPUT(cacheline) \ - } phdr \ - . = __per_cpu_load + SIZEOF(.data..percpu); - -/** - * PERCPU_SECTION - define output section for percpu area, simple version - * @cacheline: cacheline size - * - * Align to PAGE_SIZE and outputs output section for percpu area. This - * macro doesn't manipulate @vaddr or @phdr and __per_cpu_load and - * __per_cpu_start will be identical. - * - * This macro is equivalent to ALIGN(PAGE_SIZE); PERCPU_VADDR(@cacheline,,) - * except that __per_cpu_load is defined as a relative symbol against - * .data..percpu which is required for relocatable x86_32 configuration. */ #define PERCPU_SECTION(cacheline) \ . = ALIGN(PAGE_SIZE); \ .data..percpu : AT(ADDR(.data..percpu) - LOAD_OFFSET) { \ - __per_cpu_load = .; \ PERCPU_INPUT(cacheline) \ } @@ -1148,6 +1120,7 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) INIT_TASK_DATA(inittask) \ NOSAVE_DATA \ PAGE_ALIGNED_DATA(pagealigned) \ + CACHE_HOT_DATA(cacheline) \ CACHELINE_ALIGNED_DATA(cacheline) \ READ_MOSTLY_DATA(cacheline) \ DATA_DATA \ diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h index 155615175965..abf0bd76e370 100644 --- a/include/hyperv/hvgdk_mini.h +++ b/include/hyperv/hvgdk_mini.h @@ -13,7 +13,7 @@ struct hv_u128 { u64 high_part; } __packed; -/* NOTE: when adding below, update hv_status_to_string() */ +/* NOTE: when adding below, update hv_result_to_string() */ #define HV_STATUS_SUCCESS 0x0 #define HV_STATUS_INVALID_HYPERCALL_CODE 0x2 #define HV_STATUS_INVALID_HYPERCALL_INPUT 0x3 @@ -51,6 +51,7 @@ struct hv_u128 { #define HV_HYP_PAGE_SHIFT 12 #define HV_HYP_PAGE_SIZE BIT(HV_HYP_PAGE_SHIFT) #define HV_HYP_PAGE_MASK (~(HV_HYP_PAGE_SIZE - 1)) +#define HV_HYP_LARGE_PAGE_SHIFT 21 #define HV_PARTITION_ID_INVALID ((u64)0) #define HV_PARTITION_ID_SELF ((u64)-1) @@ -182,7 +183,7 @@ struct hv_tsc_emulation_control { /* HV_TSC_INVARIANT_CONTROL */ #endif /* CONFIG_X86 */ -struct hv_get_partition_id { /* HV_OUTPUT_GET_PARTITION_ID */ +struct hv_output_get_partition_id { u64 partition_id; } __packed; @@ -204,7 +205,14 @@ union hv_reference_tsc_msr { /* The number of vCPUs in one sparse bank */ #define HV_VCPUS_PER_SPARSE_BANK (64) -/* Some of Hyper-V structs do not use hv_vpset where linux uses them */ +/* + * Some of Hyper-V structs do not use hv_vpset where linux uses them. + * + * struct hv_vpset is usually used as part of hypercall input. The portion + * that counts as "fixed size input header" vs. "variable size input header" + * varies per hypercall. See comments at relevant hypercall call sites as to + * how the "valid_bank_mask" field should be accounted. + */ struct hv_vpset { /* HV_VP_SET */ u64 format; u64 valid_bank_mask; @@ -374,6 +382,10 @@ union hv_hypervisor_version_info { #define HV_SHARED_GPA_BOUNDARY_ACTIVE BIT(5) #define HV_SHARED_GPA_BOUNDARY_BITS GENMASK(11, 6) +/* HYPERV_CPUID_FEATURES.ECX bits. */ +#define HV_VP_DISPATCH_INTERRUPT_INJECTION_AVAILABLE BIT(9) +#define HV_VP_GHCB_ROOT_MAPPING_AVAILABLE BIT(10) + enum hv_isolation_type { HV_ISOLATION_TYPE_NONE = 0, /* HV_PARTITION_ISOLATION_TYPE_NONE */ HV_ISOLATION_TYPE_VBS = 1, @@ -436,10 +448,13 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */ #define HVCALL_WITHDRAW_MEMORY 0x0049 #define HVCALL_MAP_GPA_PAGES 0x004b #define HVCALL_UNMAP_GPA_PAGES 0x004c +#define HVCALL_INSTALL_INTERCEPT 0x004d #define HVCALL_CREATE_VP 0x004e #define HVCALL_DELETE_VP 0x004f #define HVCALL_GET_VP_REGISTERS 0x0050 #define HVCALL_SET_VP_REGISTERS 0x0051 +#define HVCALL_TRANSLATE_VIRTUAL_ADDRESS 0x0052 +#define HVCALL_CLEAR_VIRTUAL_INTERRUPT 0x0056 #define HVCALL_DELETE_PORT 0x0058 #define HVCALL_DISCONNECT_PORT 0x005b #define HVCALL_POST_MESSAGE 0x005c @@ -447,12 +462,15 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */ #define HVCALL_POST_DEBUG_DATA 0x0069 #define HVCALL_RETRIEVE_DEBUG_DATA 0x006a #define HVCALL_RESET_DEBUG_SESSION 0x006b +#define HVCALL_MAP_STATS_PAGE 0x006c +#define HVCALL_UNMAP_STATS_PAGE 0x006d #define HVCALL_ADD_LOGICAL_PROCESSOR 0x0076 #define HVCALL_GET_SYSTEM_PROPERTY 0x007b #define HVCALL_MAP_DEVICE_INTERRUPT 0x007c #define HVCALL_UNMAP_DEVICE_INTERRUPT 0x007d #define HVCALL_RETARGET_INTERRUPT 0x007e #define HVCALL_NOTIFY_PORT_RING_EMPTY 0x008b +#define HVCALL_REGISTER_INTERCEPT_RESULT 0x0091 #define HVCALL_ASSERT_VIRTUAL_INTERRUPT 0x0094 #define HVCALL_CREATE_PORT 0x0095 #define HVCALL_CONNECT_PORT 0x0096 @@ -460,12 +478,18 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */ #define HVCALL_GET_VP_ID_FROM_APIC_ID 0x009a #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0 +#define HVCALL_SIGNAL_EVENT_DIRECT 0x00c0 +#define HVCALL_POST_MESSAGE_DIRECT 0x00c1 #define HVCALL_DISPATCH_VP 0x00c2 +#define HVCALL_GET_GPA_PAGES_ACCESS_STATES 0x00c9 +#define HVCALL_ACQUIRE_SPARSE_SPA_PAGE_HOST_ACCESS 0x00d7 +#define HVCALL_RELEASE_SPARSE_SPA_PAGE_HOST_ACCESS 0x00d8 #define HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY 0x00db #define HVCALL_MAP_VP_STATE_PAGE 0x00e1 #define HVCALL_UNMAP_VP_STATE_PAGE 0x00e2 #define HVCALL_GET_VP_STATE 0x00e3 #define HVCALL_SET_VP_STATE 0x00e4 +#define HVCALL_GET_VP_CPUID_VALUES 0x00f4 #define HVCALL_MMIO_READ 0x0106 #define HVCALL_MMIO_WRITE 0x0107 @@ -775,10 +799,10 @@ struct hv_message_page { /* Define timer message payload structure. */ struct hv_timer_message_payload { - __u32 timer_index; - __u32 reserved; - __u64 expiration_time; /* When the timer expired */ - __u64 delivery_time; /* When the message was delivered */ + u32 timer_index; + u32 reserved; + u64 expiration_time; /* When the timer expired */ + u64 delivery_time; /* When the message was delivered */ } __packed; struct hv_x64_segment_register { @@ -807,6 +831,8 @@ struct hv_x64_table_register { u64 base; } __packed; +#define HV_NORMAL_VTL 0 + union hv_input_vtl { u8 as_uint8; struct { @@ -1325,6 +1351,49 @@ struct hv_retarget_device_interrupt { /* HV_INPUT_RETARGET_DEVICE_INTERRUPT */ struct hv_device_interrupt_target int_target; } __packed __aligned(8); +enum hv_intercept_type { +#if defined(CONFIG_X86) + HV_INTERCEPT_TYPE_X64_IO_PORT = 0x00000000, + HV_INTERCEPT_TYPE_X64_MSR = 0x00000001, + HV_INTERCEPT_TYPE_X64_CPUID = 0x00000002, +#endif + HV_INTERCEPT_TYPE_EXCEPTION = 0x00000003, + /* Used to be HV_INTERCEPT_TYPE_REGISTER */ + HV_INTERCEPT_TYPE_RESERVED0 = 0x00000004, + HV_INTERCEPT_TYPE_MMIO = 0x00000005, +#if defined(CONFIG_X86) + HV_INTERCEPT_TYPE_X64_GLOBAL_CPUID = 0x00000006, + HV_INTERCEPT_TYPE_X64_APIC_SMI = 0x00000007, +#endif + HV_INTERCEPT_TYPE_HYPERCALL = 0x00000008, +#if defined(CONFIG_X86) + HV_INTERCEPT_TYPE_X64_APIC_INIT_SIPI = 0x00000009, + HV_INTERCEPT_MC_UPDATE_PATCH_LEVEL_MSR_READ = 0x0000000A, + HV_INTERCEPT_TYPE_X64_APIC_WRITE = 0x0000000B, + HV_INTERCEPT_TYPE_X64_MSR_INDEX = 0x0000000C, +#endif + HV_INTERCEPT_TYPE_MAX, + HV_INTERCEPT_TYPE_INVALID = 0xFFFFFFFF, +}; + +union hv_intercept_parameters { + /* HV_INTERCEPT_PARAMETERS is defined to be an 8-byte field. */ + u64 as_uint64; +#if defined(CONFIG_X86) + /* HV_INTERCEPT_TYPE_X64_IO_PORT */ + u16 io_port; + /* HV_INTERCEPT_TYPE_X64_CPUID */ + u32 cpuid_index; + /* HV_INTERCEPT_TYPE_X64_APIC_WRITE */ + u32 apic_write_mask; + /* HV_INTERCEPT_TYPE_EXCEPTION */ + u16 exception_vector; + /* HV_INTERCEPT_TYPE_X64_MSR_INDEX */ + u32 msr_index; +#endif + /* N.B. Other intercept types do not have any parameters. */ +}; + /* Data structures for HVCALL_MMIO_READ and HVCALL_MMIO_WRITE */ #define HV_HYPERCALL_MMIO_MAX_DATA_LENGTH 64 diff --git a/include/hyperv/hvhdk.h b/include/hyperv/hvhdk.h index 64407c2a3809..b4067ada02cf 100644 --- a/include/hyperv/hvhdk.h +++ b/include/hyperv/hvhdk.h @@ -19,11 +19,24 @@ #define HV_VP_REGISTER_PAGE_VERSION_1 1u +#define HV_VP_REGISTER_PAGE_MAX_VECTOR_COUNT 7 + +union hv_vp_register_page_interrupt_vectors { + u64 as_uint64; + struct { + u8 vector_count; + u8 vector[HV_VP_REGISTER_PAGE_MAX_VECTOR_COUNT]; + } __packed; +}; + struct hv_vp_register_page { u16 version; u8 isvalid; u8 rsvdz; u32 dirty; + +#if IS_ENABLED(CONFIG_X86) + union { struct { /* General purpose registers @@ -95,6 +108,22 @@ struct hv_vp_register_page { union hv_x64_pending_interruption_register pending_interruption; union hv_x64_interrupt_state_register interrupt_state; u64 instruction_emulation_hints; + u64 xfem; + + /* + * Fields from this point are not included in the register page save chunk. + * The reserved field is intended to maintain alignment for unsaved fields. + */ + u8 reserved1[0x100]; + + /* + * Interrupts injected as part of HvCallDispatchVp. + */ + union hv_vp_register_page_interrupt_vectors interrupt_vectors; + +#elif IS_ENABLED(CONFIG_ARM64) + /* Not yet supported in ARM */ +#endif } __packed; #define HV_PARTITION_PROCESSOR_FEATURES_BANKS 2 @@ -299,10 +328,11 @@ union hv_partition_isolation_properties { #define HV_PARTITION_ISOLATION_HOST_TYPE_RESERVED 0x2 /* Note: Exo partition is enabled by default */ -#define HV_PARTITION_CREATION_FLAG_EXO_PARTITION BIT(8) -#define HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED BIT(13) -#define HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED BIT(19) -#define HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE BIT(22) +#define HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED BIT(4) +#define HV_PARTITION_CREATION_FLAG_EXO_PARTITION BIT(8) +#define HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED BIT(13) +#define HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED BIT(19) +#define HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE BIT(22) struct hv_input_create_partition { u64 flags; @@ -349,13 +379,23 @@ struct hv_input_set_partition_property { enum hv_vp_state_page_type { HV_VP_STATE_PAGE_REGISTERS = 0, HV_VP_STATE_PAGE_INTERCEPT_MESSAGE = 1, + HV_VP_STATE_PAGE_GHCB = 2, HV_VP_STATE_PAGE_COUNT }; struct hv_input_map_vp_state_page { u64 partition_id; u32 vp_index; - u32 type; /* enum hv_vp_state_page_type */ + u16 type; /* enum hv_vp_state_page_type */ + union hv_input_vtl input_vtl; + union { + u8 as_uint8; + struct { + u8 map_location_provided : 1; + u8 reserved : 7; + }; + } flags; + u64 requested_map_location; } __packed; struct hv_output_map_vp_state_page { @@ -365,7 +405,14 @@ struct hv_output_map_vp_state_page { struct hv_input_unmap_vp_state_page { u64 partition_id; u32 vp_index; - u32 type; /* enum hv_vp_state_page_type */ + u16 type; /* enum hv_vp_state_page_type */ + union hv_input_vtl input_vtl; + u8 reserved0; +} __packed; + +struct hv_x64_apic_eoi_message { + u32 vp_index; + u32 interrupt_vector; } __packed; struct hv_opaque_intercept_message { @@ -515,6 +562,13 @@ struct hv_synthetic_timers_state { u64 reserved[5]; } __packed; +struct hv_async_completion_message_payload { + u64 partition_id; + u32 status; + u32 completion_count; + u64 sub_status; +} __packed; + union hv_input_delete_vp { u64 as_uint64[2]; struct { @@ -649,6 +703,57 @@ struct hv_input_set_vp_state { union hv_input_set_vp_state_data data[]; } __packed; +union hv_x64_vp_execution_state { + u16 as_uint16; + struct { + u16 cpl:2; + u16 cr0_pe:1; + u16 cr0_am:1; + u16 efer_lma:1; + u16 debug_active:1; + u16 interruption_pending:1; + u16 vtl:4; + u16 enclave_mode:1; + u16 interrupt_shadow:1; + u16 virtualization_fault_active:1; + u16 reserved:2; + } __packed; +}; + +struct hv_x64_intercept_message_header { + u32 vp_index; + u8 instruction_length:4; + u8 cr8:4; /* Only set for exo partitions */ + u8 intercept_access_type; + union hv_x64_vp_execution_state execution_state; + struct hv_x64_segment_register cs_segment; + u64 rip; + u64 rflags; +} __packed; + +union hv_x64_memory_access_info { + u8 as_uint8; + struct { + u8 gva_valid:1; + u8 gva_gpa_valid:1; + u8 hypercall_output_pending:1; + u8 tlb_locked_no_overlay:1; + u8 reserved:4; + } __packed; +}; + +struct hv_x64_memory_intercept_message { + struct hv_x64_intercept_message_header header; + u32 cache_type; /* enum hv_cache_type */ + u8 instruction_byte_count; + union hv_x64_memory_access_info memory_access_info; + u8 tpr_priority; + u8 reserved1; + u64 guest_virtual_address; + u64 guest_physical_address; + u8 instruction_bytes[16]; +} __packed; + /* * Dispatch state for the VP communicated by the hypervisor to the * VP-dispatching thread in the root on return from HVCALL_DISPATCH_VP. @@ -716,6 +821,7 @@ static_assert(sizeof(struct hv_vp_signal_pair_scheduler_message) == #define HV_DISPATCH_VP_FLAG_SKIP_VP_SPEC_FLUSH 0x8 #define HV_DISPATCH_VP_FLAG_SKIP_CALLER_SPEC_FLUSH 0x10 #define HV_DISPATCH_VP_FLAG_SKIP_CALLER_USER_SPEC_FLUSH 0x20 +#define HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION 0x40 struct hv_input_dispatch_vp { u64 partition_id; @@ -730,4 +836,18 @@ struct hv_output_dispatch_vp { u32 dispatch_event; /* enum hv_vp_dispatch_event */ } __packed; +struct hv_input_modify_sparse_spa_page_host_access { + u32 host_access : 2; + u32 reserved : 30; + u32 flags; + u64 partition_id; + u64 spa_page_list[]; +} __packed; + +/* hv_input_modify_sparse_spa_page_host_access flags */ +#define HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE 0x1 +#define HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED 0x2 +#define HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE 0x4 +#define HV_MODIFY_SPA_PAGE_HOST_ACCESS_HUGE_PAGE 0x8 + #endif /* _HV_HVHDK_H */ diff --git a/include/hyperv/hvhdk_mini.h b/include/hyperv/hvhdk_mini.h index f8a39d3e9ce6..42e7876455b5 100644 --- a/include/hyperv/hvhdk_mini.h +++ b/include/hyperv/hvhdk_mini.h @@ -36,6 +36,52 @@ enum hv_scheduler_type { HV_SCHEDULER_TYPE_MAX }; +/* HV_STATS_AREA_TYPE */ +enum hv_stats_area_type { + HV_STATS_AREA_SELF = 0, + HV_STATS_AREA_PARENT = 1, + HV_STATS_AREA_INTERNAL = 2, + HV_STATS_AREA_COUNT +}; + +enum hv_stats_object_type { + HV_STATS_OBJECT_HYPERVISOR = 0x00000001, + HV_STATS_OBJECT_LOGICAL_PROCESSOR = 0x00000002, + HV_STATS_OBJECT_PARTITION = 0x00010001, + HV_STATS_OBJECT_VP = 0x00010002 +}; + +union hv_stats_object_identity { + /* hv_stats_hypervisor */ + struct { + u8 reserved[15]; + u8 stats_area_type; + } __packed hv; + + /* hv_stats_logical_processor */ + struct { + u32 lp_index; + u8 reserved[11]; + u8 stats_area_type; + } __packed lp; + + /* hv_stats_partition */ + struct { + u64 partition_id; + u8 reserved[7]; + u8 stats_area_type; + } __packed partition; + + /* hv_stats_vp */ + struct { + u64 partition_id; + u32 vp_index; + u16 flags; + u8 reserved; + u8 stats_area_type; + } __packed vp; +}; + enum hv_partition_property_code { /* Privilege properties */ HV_PARTITION_PROPERTY_PRIVILEGE_FLAGS = 0x00010000, @@ -47,19 +93,45 @@ enum hv_partition_property_code { /* Compatibility properties */ HV_PARTITION_PROPERTY_PROCESSOR_XSAVE_FEATURES = 0x00060002, + HV_PARTITION_PROPERTY_XSAVE_STATES = 0x00060007, HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE = 0x00060008, HV_PARTITION_PROPERTY_PROCESSOR_CLOCK_FREQUENCY = 0x00060009, }; +enum hv_snp_status { + HV_SNP_STATUS_NONE = 0, + HV_SNP_STATUS_AVAILABLE = 1, + HV_SNP_STATUS_INCOMPATIBLE = 2, + HV_SNP_STATUS_PSP_UNAVAILABLE = 3, + HV_SNP_STATUS_PSP_INIT_FAILED = 4, + HV_SNP_STATUS_PSP_BAD_FW_VERSION = 5, + HV_SNP_STATUS_BAD_CONFIGURATION = 6, + HV_SNP_STATUS_PSP_FW_UPDATE_IN_PROGRESS = 7, + HV_SNP_STATUS_PSP_RB_INIT_FAILED = 8, + HV_SNP_STATUS_PSP_PLATFORM_STATUS_FAILED = 9, + HV_SNP_STATUS_PSP_INIT_LATE_FAILED = 10, +}; + enum hv_system_property { /* Add more values when needed */ HV_SYSTEM_PROPERTY_SCHEDULER_TYPE = 15, + HV_DYNAMIC_PROCESSOR_FEATURE_PROPERTY = 21, +}; + +enum hv_dynamic_processor_feature_property { + /* Add more values when needed */ + HV_X64_DYNAMIC_PROCESSOR_FEATURE_MAX_ENCRYPTED_PARTITIONS = 13, + HV_X64_DYNAMIC_PROCESSOR_FEATURE_SNP_STATUS = 16, }; struct hv_input_get_system_property { u32 property_id; /* enum hv_system_property */ union { u32 as_uint32; +#if IS_ENABLED(CONFIG_X86) + /* enum hv_dynamic_processor_feature_property */ + u32 hv_processor_feature; +#endif /* More fields to be filled in when needed */ }; } __packed; @@ -67,9 +139,28 @@ struct hv_input_get_system_property { struct hv_output_get_system_property { union { u32 scheduler_type; /* enum hv_scheduler_type */ +#if IS_ENABLED(CONFIG_X86) + u64 hv_processor_feature_value; +#endif }; } __packed; +struct hv_input_map_stats_page { + u32 type; /* enum hv_stats_object_type */ + u32 padding; + union hv_stats_object_identity identity; +} __packed; + +struct hv_output_map_stats_page { + u64 map_location; +} __packed; + +struct hv_input_unmap_stats_page { + u32 type; /* enum hv_stats_object_type */ + u32 padding; + union hv_stats_object_identity identity; +} __packed; + struct hv_proximity_domain_flags { u32 proximity_preferred : 1; u32 reserved : 30; diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 147bd3ee4f7b..96754b51b411 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -37,21 +37,15 @@ struct arm_pmu_entry { struct arm_pmu *arm_pmu; }; -DECLARE_STATIC_KEY_FALSE(kvm_arm_pmu_available); - -static __always_inline bool kvm_arm_support_pmu_v3(void) -{ - return static_branch_likely(&kvm_arm_pmu_available); -} - +bool kvm_supports_guest_pmuv3(void); #define kvm_arm_pmu_irq_initialized(v) ((v)->arch.pmu.irq_num >= VGIC_NR_SGIS) u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx); void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val); +void kvm_pmu_set_counter_value_user(struct kvm_vcpu *vcpu, u64 select_idx, u64 val); u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu); u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu); u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1); void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu); -void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu); void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu); void kvm_pmu_reprogram_counter_mask(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu); @@ -86,7 +80,7 @@ void kvm_vcpu_pmu_resync_el0(void); */ #define kvm_pmu_update_vcpu_events(vcpu) \ do { \ - if (!has_vhe() && kvm_arm_support_pmu_v3()) \ + if (!has_vhe() && system_supports_pmuv3()) \ vcpu->arch.pmu.events = *kvm_get_pmu_events(); \ } while (0) @@ -102,7 +96,7 @@ void kvm_pmu_nested_transition(struct kvm_vcpu *vcpu); struct kvm_pmu { }; -static inline bool kvm_arm_support_pmu_v3(void) +static inline bool kvm_supports_guest_pmuv3(void) { return false; } @@ -115,6 +109,8 @@ static inline u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, } static inline void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) {} +static inline void kvm_pmu_set_counter_value_user(struct kvm_vcpu *vcpu, + u64 select_idx, u64 val) {} static inline u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu) { return 0; @@ -124,7 +120,6 @@ static inline u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu) return 0; } static inline void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) {} -static inline void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_reprogram_counter_mask(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu) {} diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 3a8ccfda34d2..714cef854c1c 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -249,6 +249,9 @@ struct vgic_dist { int nr_spis; + /* The GIC maintenance IRQ for nested hypervisors. */ + u32 mi_intid; + /* base addresses in guest physical address space: */ gpa_t vgic_dist_base; /* distributor */ union { @@ -369,6 +372,7 @@ extern struct static_key_false vgic_v3_cpuif_trap; int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr); void kvm_vgic_early_init(struct kvm *kvm); int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu); +int kvm_vgic_vcpu_nv_init(struct kvm_vcpu *vcpu); int kvm_vgic_create(struct kvm *kvm, u32 type); void kvm_vgic_destroy(struct kvm *kvm); void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu); @@ -389,6 +393,10 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu); void kvm_vgic_load(struct kvm_vcpu *vcpu); void kvm_vgic_put(struct kvm_vcpu *vcpu); +u16 vgic_v3_get_eisr(struct kvm_vcpu *vcpu); +u16 vgic_v3_get_elrsr(struct kvm_vcpu *vcpu); +u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu); + #define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel)) #define vgic_initialized(k) ((k)->arch.vgic.initialized) #define vgic_ready(k) ((k)->arch.vgic.ready) @@ -433,6 +441,8 @@ int vgic_v4_load(struct kvm_vcpu *vcpu); void vgic_v4_commit(struct kvm_vcpu *vcpu); int vgic_v4_put(struct kvm_vcpu *vcpu); +bool vgic_state_is_nested(struct kvm_vcpu *vcpu); + /* CPU HP callbacks */ void kvm_vgic_cpu_up(void); void kvm_vgic_cpu_down(void); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 4e495b29c640..a70e62d69dc7 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -330,7 +330,6 @@ static inline bool acpi_sci_irq_valid(void) } extern int sbf_port; -extern unsigned long acpi_realmode_flags; int acpi_register_gsi (struct device *dev, u32 gsi, int triggering, int polarity); int acpi_gsi_to_irq (u32 gsi, unsigned int *irq); diff --git a/include/linux/align.h b/include/linux/align.h index 2b4acec7b95a..55debf105a5d 100644 --- a/include/linux/align.h +++ b/include/linux/align.h @@ -2,14 +2,6 @@ #ifndef _LINUX_ALIGN_H #define _LINUX_ALIGN_H -#include <linux/const.h> - -/* @a is a power of 2 value */ -#define ALIGN(x, a) __ALIGN_KERNEL((x), (a)) -#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a)) -#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask)) -#define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a))) -#define PTR_ALIGN_DOWN(p, a) ((typeof(p))ALIGN_DOWN((unsigned long)(p), (a))) -#define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0) +#include <vdso/align.h> #endif /* _LINUX_ALIGN_H */ diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 67f6fdf2e7cd..f19be5754090 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -179,6 +179,9 @@ #define ARM_SMCCC_KVM_FUNC_PKVM_RESV_62 62 #define ARM_SMCCC_KVM_FUNC_PKVM_RESV_63 63 /* End of pKVM hypercall range */ +#define ARM_SMCCC_KVM_FUNC_DISCOVER_IMPL_VER 64 +#define ARM_SMCCC_KVM_FUNC_DISCOVER_IMPL_CPUS 65 + #define ARM_SMCCC_KVM_FUNC_FEATURES_2 127 #define ARM_SMCCC_KVM_NUM_FUNCS 128 @@ -225,6 +228,18 @@ ARM_SMCCC_OWNER_VENDOR_HYP, \ ARM_SMCCC_KVM_FUNC_MMIO_GUARD) +#define ARM_SMCCC_VENDOR_HYP_KVM_DISCOVER_IMPL_VER_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_DISCOVER_IMPL_VER) + +#define ARM_SMCCC_VENDOR_HYP_KVM_DISCOVER_IMPL_CPUS_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_DISCOVER_IMPL_CPUS) + /* ptp_kvm counter type ID */ #define KVM_PTP_VIRT_COUNTER 0 #define KVM_PTP_PHYS_COUNTER 1 diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 60d674af3080..1625c8529e70 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -64,7 +64,7 @@ struct linux_binprm { const char *fdpath; /* generated filename for execveat */ unsigned interp_flags; int execfd; /* File descriptor of the executable */ - unsigned long loader, exec; + unsigned long exec; struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. */ diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 2026953e2c4e..595217b7a6e7 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -560,9 +560,9 @@ void bitmap_replace(unsigned long *dst, * ...0..11...0..10 * dst: 0000001100000010 * - * A relationship exists between bitmap_scatter() and bitmap_gather(). + * A relationship exists between bitmap_scatter() and bitmap_gather(). See + * bitmap_gather() for the bitmap gather detailed operations. TL;DR: * bitmap_gather() can be seen as the 'reverse' bitmap_scatter() operation. - * See bitmap_scatter() for details related to this relationship. */ static __always_inline void bitmap_scatter(unsigned long *dst, const unsigned long *src, @@ -608,7 +608,9 @@ void bitmap_scatter(unsigned long *dst, const unsigned long *src, * dst: 0000000000011010 * * A relationship exists between bitmap_gather() and bitmap_scatter(). See - * bitmap_scatter() for the bitmap scatter detailed operations. + * bitmap_scatter() for the bitmap scatter detailed operations. TL;DR: + * bitmap_scatter() can be seen as the 'reverse' bitmap_gather() operation. + * * Suppose scattered computed using bitmap_scatter(scattered, src, mask, n). * The operation bitmap_gather(result, scattered, mask, n) leads to a result * equal or equivalent to src. diff --git a/include/linux/bits.h b/include/linux/bits.h index 61a75d3f294b..14fd0ca9a6cd 100644 --- a/include/linux/bits.h +++ b/include/linux/bits.h @@ -40,7 +40,7 @@ * Missing asm support * * __GENMASK_U128() depends on _BIT128() which would not work - * in the asm code, as it shifts an 'unsigned __init128' data + * in the asm code, as it shifts an 'unsigned __int128' data * type instead of direct representation of 128 bit constants * such as long and unsigned long. The fundamental problem is * that a 128 bit constant will get silently truncated by the diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d37751789bf5..1c0cf6af392c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -268,10 +268,16 @@ static inline dev_t disk_devt(struct gendisk *disk) return MKDEV(disk->major, disk->first_minor); } +/* + * We should strive for 1 << (PAGE_SHIFT + MAX_PAGECACHE_ORDER) + * however we constrain this to what we can validate and test. + */ +#define BLK_MAX_BLOCK_SIZE SZ_64K + /* blk_validate_limits() validates bsize, so drivers don't usually need to */ static inline int blk_validate_block_size(unsigned long bsize) { - if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize)) + if (bsize < 512 || bsize > BLK_MAX_BLOCK_SIZE || !is_power_of_2(bsize)) return -EINVAL; return 0; diff --git a/include/linux/cache.h b/include/linux/cache.h index ca2a05682a54..e69768f50d53 100644 --- a/include/linux/cache.h +++ b/include/linux/cache.h @@ -3,16 +3,13 @@ #define __LINUX_CACHE_H #include <uapi/linux/kernel.h> +#include <vdso/cache.h> #include <asm/cache.h> #ifndef L1_CACHE_ALIGN #define L1_CACHE_ALIGN(x) __ALIGN_KERNEL(x, L1_CACHE_BYTES) #endif -#ifndef SMP_CACHE_BYTES -#define SMP_CACHE_BYTES L1_CACHE_BYTES -#endif - /** * SMP_CACHE_ALIGN - align a value to the L2 cacheline size * @x: value to align @@ -63,10 +60,6 @@ #define __ro_after_init __section(".data..ro_after_init") #endif -#ifndef ____cacheline_aligned -#define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) -#endif - #ifndef ____cacheline_aligned_in_smp #ifdef CONFIG_SMP #define ____cacheline_aligned_in_smp ____cacheline_aligned diff --git a/include/linux/cfi.h b/include/linux/cfi.h index f0df518e11dd..1db17ecbb86c 100644 --- a/include/linux/cfi.h +++ b/include/linux/cfi.h @@ -11,6 +11,8 @@ #include <linux/module.h> #include <asm/cfi.h> +extern bool cfi_warn; + #ifndef cfi_get_offset static inline int cfi_get_offset(void) { diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 17960a1e858d..485b651869d9 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -619,9 +619,8 @@ struct cgroup_root { */ struct cftype { /* - * By convention, the name should begin with the name of the - * subsystem, followed by a period. Zero length string indicates - * end of cftype array. + * Name of the subsystem is prepended in cgroup_file_name(). + * Zero length string indicates end of cftype array. */ char name[MAX_CFTYPE_NAME]; unsigned long private; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f8ef47f8a634..28e999f2c642 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -113,6 +113,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); +int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); void cgroup_file_notify(struct cgroup_file *cfile); void cgroup_file_show(struct cgroup_file *cfile, bool show); @@ -689,8 +690,6 @@ static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) */ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu); void cgroup_rstat_flush(struct cgroup *cgrp); -void cgroup_rstat_flush_hold(struct cgroup *cgrp); -void cgroup_rstat_flush_release(struct cgroup *cgrp); /* * Basic resource stats. diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index ee2614adb785..2b32a5759b22 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -216,6 +216,23 @@ const volatile void * __must_check_fn(const volatile void *val) #define return_ptr(p) return no_free_ptr(p) +/* + * Only for situations where an allocation is handed in to another function + * and consumed by that function on success. + * + * struct foo *f __free(kfree) = kzalloc(sizeof(*f), GFP_KERNEL); + * + * setup(f); + * if (some_condition) + * return -EINVAL; + * .... + * ret = bar(f); + * if (!ret) + * retain_ptr(f); + * return ret; + */ +#define retain_ptr(p) \ + __get_and_null(p, NULL) /* * DEFINE_CLASS(name, type, exit, init, init_args...): @@ -291,11 +308,21 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ #define __DEFINE_CLASS_IS_CONDITIONAL(_name, _is_cond) \ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond -#define DEFINE_GUARD(_name, _type, _lock, _unlock) \ +#define __DEFINE_GUARD_LOCK_PTR(_name, _exp) \ + static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \ + { return (void *)(__force unsigned long)*(_exp); } + +#define DEFINE_CLASS_IS_GUARD(_name) \ __DEFINE_CLASS_IS_CONDITIONAL(_name, false); \ + __DEFINE_GUARD_LOCK_PTR(_name, _T) + +#define DEFINE_CLASS_IS_COND_GUARD(_name) \ + __DEFINE_CLASS_IS_CONDITIONAL(_name, true); \ + __DEFINE_GUARD_LOCK_PTR(_name, _T) + +#define DEFINE_GUARD(_name, _type, _lock, _unlock) \ DEFINE_CLASS(_name, _type, if (_T) { _unlock; }, ({ _lock; _T; }), _type _T); \ - static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \ - { return (void *)(__force unsigned long)*_T; } + DEFINE_CLASS_IS_GUARD(_name) #define DEFINE_GUARD_COND(_name, _ext, _condlock) \ __DEFINE_CLASS_IS_CONDITIONAL(_name##_ext, true); \ @@ -375,11 +402,7 @@ static inline void class_##_name##_destructor(class_##_name##_t *_T) \ if (_T->lock) { _unlock; } \ } \ \ -static inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \ -{ \ - return (void *)(__force unsigned long)_T->lock; \ -} - +__DEFINE_GUARD_LOCK_PTR(_name, &_T->lock) #define __DEFINE_LOCK_GUARD_1(_name, _type, _lock) \ static inline class_##_name##_t class_##_name##_constructor(_type *l) \ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 155385754824..9fc30b6b80c9 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -206,12 +206,38 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #define __must_be_byte_array(a) __BUILD_BUG_ON_ZERO_MSG(!__is_byte_array(a), \ "must be byte array") +/* + * If the "nonstring" attribute isn't available, we have to return true + * so the __must_*() checks pass when "nonstring" isn't supported. + */ +#if __has_attribute(__nonstring__) && defined(__annotated) +#define __is_cstr(a) (!__annotated(a, nonstring)) +#define __is_noncstr(a) (__annotated(a, nonstring)) +#else +#define __is_cstr(a) (true) +#define __is_noncstr(a) (true) +#endif + /* Require C Strings (i.e. NUL-terminated) lack the "nonstring" attribute. */ #define __must_be_cstr(p) \ - __BUILD_BUG_ON_ZERO_MSG(__annotated(p, nonstring), "must be cstr (NUL-terminated)") + __BUILD_BUG_ON_ZERO_MSG(!__is_cstr(p), \ + "must be C-string (NUL-terminated)") +#define __must_be_noncstr(p) \ + __BUILD_BUG_ON_ZERO_MSG(!__is_noncstr(p), \ + "must be non-C-string (not NUL-terminated)") #endif /* __KERNEL__ */ +#if defined(CONFIG_CFI_CLANG) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +/* + * Force a reference to the external symbol so the compiler generates + * __kcfi_typid. + */ +#define KCFI_REFERENCE(sym) __ADDRESSABLE(sym) +#else +#define KCFI_REFERENCE(sym) +#endif + /** * offset_to_ptr - convert a relative memory offset to an absolute pointer * @off: the address of the 32-bit offset value diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 981cc3d7e3aa..e09d323be845 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -349,6 +349,18 @@ struct ftrace_likely_data { #endif /* + * Optional: only supported since gcc >= 15 + * Optional: not supported by Clang + * + * gcc: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117178 + */ +#ifdef CONFIG_CC_HAS_MULTIDIMENSIONAL_NONSTRING +# define __nonstring_array __attribute__((__nonstring__)) +#else +# define __nonstring_array +#endif + +/* * Apply __counted_by() when the Endianness matches to increase test coverage. */ #ifdef __LITTLE_ENDIAN @@ -360,7 +372,7 @@ struct ftrace_likely_data { #endif /* Do not trap wrapping arithmetic within an annotated function. */ -#ifdef CONFIG_UBSAN_SIGNED_WRAP +#ifdef CONFIG_UBSAN_INTEGER_WRAP # define __signed_wrap __attribute__((no_sanitize("signed-integer-overflow"))) #else # define __signed_wrap @@ -446,11 +458,14 @@ struct ftrace_likely_data { #define __member_size(p) __builtin_object_size(p, 1) #endif -/* Determine if an attribute has been applied to a variable. */ +/* + * Determine if an attribute has been applied to a variable. + * Using __annotated needs to check for __annotated being available, + * or negative tests may fail when annotation cannot be checked. For + * example, see the definition of __is_cstr(). + */ #if __has_builtin(__builtin_has_attribute) #define __annotated(var, attr) __builtin_has_attribute(var, attr) -#else -#define __annotated(var, attr) (false) #endif /* diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 7fe0981a7e46..400fee6427a5 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -144,6 +144,9 @@ struct cpufreq_policy { /* Per policy boost enabled flag. */ bool boost_enabled; + /* Per policy boost supported flag. */ + bool boost_supported; + /* Cached frequency lookup from cpufreq_driver_resolve_freq. */ unsigned int cached_target_freq; unsigned int cached_resolved_idx; @@ -210,6 +213,9 @@ static inline struct cpufreq_policy *cpufreq_cpu_get(unsigned int cpu) static inline void cpufreq_cpu_put(struct cpufreq_policy *policy) { } #endif +/* Scope based cleanup macro for cpufreq_policy kobject reference counting */ +DEFINE_FREE(put_cpufreq_policy, struct cpufreq_policy *, if (_T) cpufreq_cpu_put(_T)) + static inline bool policy_is_inactive(struct cpufreq_policy *policy) { return cpumask_empty(policy->cpus); @@ -778,10 +784,8 @@ int cpufreq_frequency_table_get_index(struct cpufreq_policy *policy, ssize_t cpufreq_show_cpus(const struct cpumask *mask, char *buf); #ifdef CONFIG_CPU_FREQ -int cpufreq_boost_trigger_state(int state); bool cpufreq_boost_enabled(void); -int cpufreq_enable_boost_support(void); -bool policy_has_boost_freq(struct cpufreq_policy *policy); +int cpufreq_boost_set_sw(struct cpufreq_policy *policy, int state); /* Find lowest freq at or above target in a table in ascending order */ static inline int cpufreq_table_find_index_al(struct cpufreq_policy *policy, @@ -1150,23 +1154,14 @@ static inline int of_perf_domain_get_sharing_cpumask(int pcpu, const char *list_ return 0; } #else -static inline int cpufreq_boost_trigger_state(int state) -{ - return 0; -} static inline bool cpufreq_boost_enabled(void) { return false; } -static inline int cpufreq_enable_boost_support(void) +static inline int cpufreq_boost_set_sw(struct cpufreq_policy *policy, int state) { - return -EINVAL; -} - -static inline bool policy_has_boost_freq(struct cpufreq_policy *policy) -{ - return false; + return -EOPNOTSUPP; } static inline int @@ -1184,7 +1179,7 @@ static inline int of_perf_domain_get_sharing_cpumask(int pcpu, const char *list_ } #endif -extern unsigned int arch_freq_get_on_cpu(int cpu); +extern int arch_freq_get_on_cpu(int cpu); #ifndef arch_set_freq_scale static __always_inline @@ -1198,7 +1193,6 @@ void arch_set_freq_scale(const struct cpumask *cpus, /* the following are really really optional */ extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs; extern struct freq_attr cpufreq_freq_attr_scaling_boost_freqs; -extern struct freq_attr *cpufreq_generic_attr[]; int cpufreq_table_validate_and_sort(struct cpufreq_policy *policy); unsigned int cpufreq_generic_get(unsigned int cpu); diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 36a890d0dd57..f9a868384083 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -81,7 +81,7 @@ static __always_inline void set_nr_cpu_ids(unsigned int nr) * * cpu_possible_mask- has bit 'cpu' set iff cpu is populatable * cpu_present_mask - has bit 'cpu' set iff cpu is populated - * cpu_enabled_mask - has bit 'cpu' set iff cpu can be brought online + * cpu_enabled_mask - has bit 'cpu' set iff cpu can be brought online * cpu_online_mask - has bit 'cpu' set iff cpu available to scheduler * cpu_active_mask - has bit 'cpu' set iff cpu available to migration * @@ -285,35 +285,52 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p, } /** - * for_each_cpu - iterate over every cpu in a mask - * @cpu: the (optionally unsigned) integer iterator - * @mask: the cpumask pointer + * cpumask_next_and_wrap - get the next cpu in *src1p & *src2p, starting from + * @n+1. If nothing found, wrap around and start from + * the beginning + * @n: the cpu prior to the place to search (i.e. search starts from @n+1) + * @src1p: the first cpumask pointer + * @src2p: the second cpumask pointer * - * After the loop, cpu is >= nr_cpu_ids. + * Return: next set bit, wrapped if needed, or >= nr_cpu_ids if @src1p & @src2p is empty. */ -#define for_each_cpu(cpu, mask) \ - for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits) - -#if NR_CPUS == 1 static __always_inline -unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap) +unsigned int cpumask_next_and_wrap(int n, const struct cpumask *src1p, + const struct cpumask *src2p) { - cpumask_check(start); + /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); + return find_next_and_bit_wrap(cpumask_bits(src1p), cpumask_bits(src2p), + small_cpumask_bits, n + 1); +} - /* - * Return the first available CPU when wrapping, or when starting before cpu0, - * since there is only one valid option. - */ - if (wrap && n >= 0) - return nr_cpumask_bits; - - return cpumask_first(mask); +/** + * cpumask_next_wrap - get the next cpu in *src, starting from @n+1. If nothing + * found, wrap around and start from the beginning + * @n: the cpu prior to the place to search (i.e. search starts from @n+1) + * @src: cpumask pointer + * + * Return: next set bit, wrapped if needed, or >= nr_cpu_ids if @src is empty. + */ +static __always_inline +unsigned int cpumask_next_wrap(int n, const struct cpumask *src) +{ + /* -1 is a legal arg here. */ + if (n != -1) + cpumask_check(n); + return find_next_bit_wrap(cpumask_bits(src), small_cpumask_bits, n + 1); } -#else -unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap); -#endif + +/** + * for_each_cpu - iterate over every cpu in a mask + * @cpu: the (optionally unsigned) integer iterator + * @mask: the cpumask pointer + * + * After the loop, cpu is >= nr_cpu_ids. + */ +#define for_each_cpu(cpu, mask) \ + for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits) /** * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location @@ -1033,11 +1050,21 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); #define for_each_possible_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) #define for_each_online_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) #define for_each_present_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) + +#define for_each_possible_cpu_wrap(cpu, start) \ + for ((void)(start), (cpu) = 0; (cpu) < 1; (cpu)++) +#define for_each_online_cpu_wrap(cpu, start) \ + for ((void)(start), (cpu) = 0; (cpu) < 1; (cpu)++) #else #define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask) #define for_each_online_cpu(cpu) for_each_cpu((cpu), cpu_online_mask) #define for_each_enabled_cpu(cpu) for_each_cpu((cpu), cpu_enabled_mask) #define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask) + +#define for_each_possible_cpu_wrap(cpu, start) \ + for_each_cpu_wrap((cpu), cpu_possible_mask, (start)) +#define for_each_online_cpu_wrap(cpu, start) \ + for_each_cpu_wrap((cpu), cpu_online_mask, (start)) #endif /* Wrappers for arch boot code to manipulate normally-constant masks */ diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 835e7b793f6a..5466c96a33db 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -125,9 +125,11 @@ static inline int cpuset_do_page_mem_spread(void) extern bool current_cpuset_is_being_rebound(void); +extern void dl_rebuild_rd_accounting(void); extern void rebuild_sched_domains(void); extern void cpuset_print_current_mems_allowed(void); +extern void cpuset_reset_sched_domains(void); /* * read_mems_allowed_begin is required when making decisions involving @@ -259,11 +261,20 @@ static inline bool current_cpuset_is_being_rebound(void) return false; } +static inline void dl_rebuild_rd_accounting(void) +{ +} + static inline void rebuild_sched_domains(void) { partition_sched_domains(1, NULL, NULL); } +static inline void cpuset_reset_sched_domains(void) +{ + partition_sched_domains(1, NULL, NULL); +} + static inline void cpuset_print_current_mems_allowed(void) { } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 4afb60365675..45bff10d3773 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -203,34 +203,34 @@ struct dentry_operations { #define DCACHE_NFSFS_RENAMED BIT(12) /* this dentry has been "silly renamed" and has to be deleted on the last * dput() */ -#define DCACHE_FSNOTIFY_PARENT_WATCHED BIT(14) +#define DCACHE_FSNOTIFY_PARENT_WATCHED BIT(13) /* Parent inode is watched by some fsnotify listener */ -#define DCACHE_DENTRY_KILLED BIT(15) +#define DCACHE_DENTRY_KILLED BIT(14) -#define DCACHE_MOUNTED BIT(16) /* is a mountpoint */ -#define DCACHE_NEED_AUTOMOUNT BIT(17) /* handle automount on this dir */ -#define DCACHE_MANAGE_TRANSIT BIT(18) /* manage transit from this dirent */ +#define DCACHE_MOUNTED BIT(15) /* is a mountpoint */ +#define DCACHE_NEED_AUTOMOUNT BIT(16) /* handle automount on this dir */ +#define DCACHE_MANAGE_TRANSIT BIT(17) /* manage transit from this dirent */ #define DCACHE_MANAGED_DENTRY \ (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT) -#define DCACHE_LRU_LIST BIT(19) +#define DCACHE_LRU_LIST BIT(18) -#define DCACHE_ENTRY_TYPE (7 << 20) /* bits 20..22 are for storing type: */ -#define DCACHE_MISS_TYPE (0 << 20) /* Negative dentry */ -#define DCACHE_WHITEOUT_TYPE (1 << 20) /* Whiteout dentry (stop pathwalk) */ -#define DCACHE_DIRECTORY_TYPE (2 << 20) /* Normal directory */ -#define DCACHE_AUTODIR_TYPE (3 << 20) /* Lookupless directory (presumed automount) */ -#define DCACHE_REGULAR_TYPE (4 << 20) /* Regular file type */ -#define DCACHE_SPECIAL_TYPE (5 << 20) /* Other file type */ -#define DCACHE_SYMLINK_TYPE (6 << 20) /* Symlink */ +#define DCACHE_ENTRY_TYPE (7 << 19) /* bits 19..21 are for storing type: */ +#define DCACHE_MISS_TYPE (0 << 19) /* Negative dentry */ +#define DCACHE_WHITEOUT_TYPE (1 << 19) /* Whiteout dentry (stop pathwalk) */ +#define DCACHE_DIRECTORY_TYPE (2 << 19) /* Normal directory */ +#define DCACHE_AUTODIR_TYPE (3 << 19) /* Lookupless directory (presumed automount) */ +#define DCACHE_REGULAR_TYPE (4 << 19) /* Regular file type */ +#define DCACHE_SPECIAL_TYPE (5 << 19) /* Other file type */ +#define DCACHE_SYMLINK_TYPE (6 << 19) /* Symlink */ -#define DCACHE_NOKEY_NAME BIT(25) /* Encrypted name encoded without key */ -#define DCACHE_OP_REAL BIT(26) +#define DCACHE_NOKEY_NAME BIT(22) /* Encrypted name encoded without key */ +#define DCACHE_OP_REAL BIT(23) -#define DCACHE_PAR_LOOKUP BIT(28) /* being looked up (with parent locked shared) */ -#define DCACHE_DENTRY_CURSOR BIT(29) -#define DCACHE_NORCU BIT(30) /* No RCU delay for freeing */ +#define DCACHE_PAR_LOOKUP BIT(24) /* being looked up (with parent locked shared) */ +#define DCACHE_DENTRY_CURSOR BIT(25) +#define DCACHE_NORCU BIT(26) /* No RCU delay for freeing */ extern seqlock_t rename_lock; @@ -253,7 +253,6 @@ extern struct dentry * d_splice_alias(struct inode *, struct dentry *); extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *); extern bool d_same_name(const struct dentry *dentry, const struct dentry *parent, const struct qstr *name); -extern struct dentry * d_exact_alias(struct dentry *, struct inode *); extern struct dentry *d_find_any_alias(struct inode *inode); extern struct dentry * d_obtain_alias(struct inode *); extern struct dentry * d_obtain_root(struct inode *); diff --git a/include/linux/device.h b/include/linux/device.h index 80a5b3268986..615282365052 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1025,6 +1025,15 @@ static inline bool dev_pm_test_driver_flags(struct device *dev, u32 flags) return !!(dev->power.driver_flags & flags); } +static inline bool dev_pm_smart_suspend(struct device *dev) +{ +#ifdef CONFIG_PM_SLEEP + return dev->power.smart_suspend; +#else + return false; +#endif +} + static inline void device_lock(struct device *dev) { mutex_lock(&dev->mutex); diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index d7e30d4f7503..f3bc0bcd7098 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -78,14 +78,18 @@ static inline dma_addr_t dma_range_map_max(const struct bus_dma_region *map) #define phys_to_dma_unencrypted phys_to_dma #endif #else -static inline dma_addr_t phys_to_dma_unencrypted(struct device *dev, - phys_addr_t paddr) +static inline dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) { if (dev->dma_range_map) return translate_phys_to_dma(dev, paddr); return paddr; } +static inline dma_addr_t phys_to_dma_unencrypted(struct device *dev, + phys_addr_t paddr) +{ + return dma_addr_unencrypted(__phys_to_dma(dev, paddr)); +} /* * If memory encryption is supported, phys_to_dma will set the memory encryption * bit in the DMA address, and dma_to_phys will clear it. @@ -94,19 +98,20 @@ static inline dma_addr_t phys_to_dma_unencrypted(struct device *dev, */ static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { - return __sme_set(phys_to_dma_unencrypted(dev, paddr)); + return dma_addr_encrypted(__phys_to_dma(dev, paddr)); } static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dma_addr) { phys_addr_t paddr; + dma_addr = dma_addr_canonical(dma_addr); if (dev->dma_range_map) paddr = translate_dma_to_phys(dev, dma_addr); else paddr = dma_addr; - return __sme_clr(paddr); + return paddr; } #endif /* !CONFIG_ARCH_HAS_PHYS_TO_DMA */ diff --git a/include/linux/edac.h b/include/linux/edac.h index b4ee8961e623..451f9c152c99 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -661,4 +661,219 @@ static inline struct dimm_info *edac_get_dimm(struct mem_ctl_info *mci, return mci->dimms[index]; } + +#define EDAC_FEAT_NAME_LEN 128 + +/* RAS feature type */ +enum edac_dev_feat { + RAS_FEAT_SCRUB, + RAS_FEAT_ECS, + RAS_FEAT_MEM_REPAIR, + RAS_FEAT_MAX +}; + +/** + * struct edac_scrub_ops - scrub device operations (all elements optional) + * @read_addr: read base address of scrubbing range. + * @read_size: read offset of scrubbing range. + * @write_addr: set base address of the scrubbing range. + * @write_size: set offset of the scrubbing range. + * @get_enabled_bg: check if currently performing background scrub. + * @set_enabled_bg: start or stop a bg-scrub. + * @get_min_cycle: get minimum supported scrub cycle duration in seconds. + * @get_max_cycle: get maximum supported scrub cycle duration in seconds. + * @get_cycle_duration: get current scrub cycle duration in seconds. + * @set_cycle_duration: set current scrub cycle duration in seconds. + */ +struct edac_scrub_ops { + int (*read_addr)(struct device *dev, void *drv_data, u64 *base); + int (*read_size)(struct device *dev, void *drv_data, u64 *size); + int (*write_addr)(struct device *dev, void *drv_data, u64 base); + int (*write_size)(struct device *dev, void *drv_data, u64 size); + int (*get_enabled_bg)(struct device *dev, void *drv_data, bool *enable); + int (*set_enabled_bg)(struct device *dev, void *drv_data, bool enable); + int (*get_min_cycle)(struct device *dev, void *drv_data, u32 *min); + int (*get_max_cycle)(struct device *dev, void *drv_data, u32 *max); + int (*get_cycle_duration)(struct device *dev, void *drv_data, u32 *cycle); + int (*set_cycle_duration)(struct device *dev, void *drv_data, u32 cycle); +}; + +#if IS_ENABLED(CONFIG_EDAC_SCRUB) +int edac_scrub_get_desc(struct device *scrub_dev, + const struct attribute_group **attr_groups, + u8 instance); +#else +static inline int edac_scrub_get_desc(struct device *scrub_dev, + const struct attribute_group **attr_groups, + u8 instance) +{ return -EOPNOTSUPP; } +#endif /* CONFIG_EDAC_SCRUB */ + +/** + * struct edac_ecs_ops - ECS device operations (all elements optional) + * @get_log_entry_type: read the log entry type value. + * @set_log_entry_type: set the log entry type value. + * @get_mode: read the mode value. + * @set_mode: set the mode value. + * @reset: reset the ECS counter. + * @get_threshold: read the threshold count per gigabits of memory cells. + * @set_threshold: set the threshold count per gigabits of memory cells. + */ +struct edac_ecs_ops { + int (*get_log_entry_type)(struct device *dev, void *drv_data, int fru_id, u32 *val); + int (*set_log_entry_type)(struct device *dev, void *drv_data, int fru_id, u32 val); + int (*get_mode)(struct device *dev, void *drv_data, int fru_id, u32 *val); + int (*set_mode)(struct device *dev, void *drv_data, int fru_id, u32 val); + int (*reset)(struct device *dev, void *drv_data, int fru_id, u32 val); + int (*get_threshold)(struct device *dev, void *drv_data, int fru_id, u32 *threshold); + int (*set_threshold)(struct device *dev, void *drv_data, int fru_id, u32 threshold); +}; + +struct edac_ecs_ex_info { + u16 num_media_frus; +}; + +#if IS_ENABLED(CONFIG_EDAC_ECS) +int edac_ecs_get_desc(struct device *ecs_dev, + const struct attribute_group **attr_groups, + u16 num_media_frus); +#else +static inline int edac_ecs_get_desc(struct device *ecs_dev, + const struct attribute_group **attr_groups, + u16 num_media_frus) +{ return -EOPNOTSUPP; } +#endif /* CONFIG_EDAC_ECS */ + +enum edac_mem_repair_type { + EDAC_REPAIR_MAX +}; + +enum edac_mem_repair_cmd { + EDAC_DO_MEM_REPAIR = 1, +}; + +/** + * struct edac_mem_repair_ops - memory repair operations + * (all elements are optional except do_repair, set_hpa/set_dpa) + * @get_repair_type: get the memory repair type, listed in + * enum edac_mem_repair_function. + * @get_persist_mode: get the current persist mode. + * false - Soft repair type (temporary repair). + * true - Hard memory repair type (permanent repair). + * @set_persist_mode: set the persist mode of the memory repair instance. + * @get_repair_safe_when_in_use: get whether memory media is accessible and + * data is retained during repair operation. + * @get_hpa: get current host physical address (HPA) of memory to repair. + * @set_hpa: set host physical address (HPA) of memory to repair. + * @get_min_hpa: get the minimum supported host physical address (HPA). + * @get_max_hpa: get the maximum supported host physical address (HPA). + * @get_dpa: get current device physical address (DPA) of memory to repair. + * @set_dpa: set device physical address (DPA) of memory to repair. + * In some states of system configuration (e.g. before address decoders + * have been configured), memory devices (e.g. CXL) may not have an active + * mapping in the host physical address map. As such, the memory + * to repair must be identified by a device specific physical addressing + * scheme using a device physical address(DPA). The DPA and other control + * attributes to use for the repair operations will be presented in related + * error records. + * @get_min_dpa: get the minimum supported device physical address (DPA). + * @get_max_dpa: get the maximum supported device physical address (DPA). + * @get_nibble_mask: get current nibble mask of memory to repair. + * @set_nibble_mask: set nibble mask of memory to repair. + * @get_bank_group: get current bank group of memory to repair. + * @set_bank_group: set bank group of memory to repair. + * @get_bank: get current bank of memory to repair. + * @set_bank: set bank of memory to repair. + * @get_rank: get current rank of memory to repair. + * @set_rank: set rank of memory to repair. + * @get_row: get current row of memory to repair. + * @set_row: set row of memory to repair. + * @get_column: get current column of memory to repair. + * @set_column: set column of memory to repair. + * @get_channel: get current channel of memory to repair. + * @set_channel: set channel of memory to repair. + * @get_sub_channel: get current subchannel of memory to repair. + * @set_sub_channel: set subchannel of memory to repair. + * @do_repair: Issue memory repair operation for the HPA/DPA and + * other control attributes set for the memory to repair. + * + * All elements are optional except do_repair and at least one of set_hpa/set_dpa. + */ +struct edac_mem_repair_ops { + int (*get_repair_type)(struct device *dev, void *drv_data, const char **type); + int (*get_persist_mode)(struct device *dev, void *drv_data, bool *persist); + int (*set_persist_mode)(struct device *dev, void *drv_data, bool persist); + int (*get_repair_safe_when_in_use)(struct device *dev, void *drv_data, bool *safe); + int (*get_hpa)(struct device *dev, void *drv_data, u64 *hpa); + int (*set_hpa)(struct device *dev, void *drv_data, u64 hpa); + int (*get_min_hpa)(struct device *dev, void *drv_data, u64 *hpa); + int (*get_max_hpa)(struct device *dev, void *drv_data, u64 *hpa); + int (*get_dpa)(struct device *dev, void *drv_data, u64 *dpa); + int (*set_dpa)(struct device *dev, void *drv_data, u64 dpa); + int (*get_min_dpa)(struct device *dev, void *drv_data, u64 *dpa); + int (*get_max_dpa)(struct device *dev, void *drv_data, u64 *dpa); + int (*get_nibble_mask)(struct device *dev, void *drv_data, u32 *val); + int (*set_nibble_mask)(struct device *dev, void *drv_data, u32 val); + int (*get_bank_group)(struct device *dev, void *drv_data, u32 *val); + int (*set_bank_group)(struct device *dev, void *drv_data, u32 val); + int (*get_bank)(struct device *dev, void *drv_data, u32 *val); + int (*set_bank)(struct device *dev, void *drv_data, u32 val); + int (*get_rank)(struct device *dev, void *drv_data, u32 *val); + int (*set_rank)(struct device *dev, void *drv_data, u32 val); + int (*get_row)(struct device *dev, void *drv_data, u32 *val); + int (*set_row)(struct device *dev, void *drv_data, u32 val); + int (*get_column)(struct device *dev, void *drv_data, u32 *val); + int (*set_column)(struct device *dev, void *drv_data, u32 val); + int (*get_channel)(struct device *dev, void *drv_data, u32 *val); + int (*set_channel)(struct device *dev, void *drv_data, u32 val); + int (*get_sub_channel)(struct device *dev, void *drv_data, u32 *val); + int (*set_sub_channel)(struct device *dev, void *drv_data, u32 val); + int (*do_repair)(struct device *dev, void *drv_data, u32 val); +}; + +#if IS_ENABLED(CONFIG_EDAC_MEM_REPAIR) +int edac_mem_repair_get_desc(struct device *dev, + const struct attribute_group **attr_groups, + u8 instance); +#else +static inline int edac_mem_repair_get_desc(struct device *dev, + const struct attribute_group **attr_groups, + u8 instance) +{ return -EOPNOTSUPP; } +#endif /* CONFIG_EDAC_MEM_REPAIR */ + +/* EDAC device feature information structure */ +struct edac_dev_data { + union { + const struct edac_scrub_ops *scrub_ops; + const struct edac_ecs_ops *ecs_ops; + const struct edac_mem_repair_ops *mem_repair_ops; + }; + u8 instance; + void *private; +}; + +struct edac_dev_feat_ctx { + struct device dev; + void *private; + struct edac_dev_data *scrub; + struct edac_dev_data ecs; + struct edac_dev_data *mem_repair; +}; + +struct edac_dev_feature { + enum edac_dev_feat ft_type; + u8 instance; + union { + const struct edac_scrub_ops *scrub_ops; + const struct edac_ecs_ops *ecs_ops; + const struct edac_mem_repair_ops *mem_repair_ops; + }; + void *ctx; + struct edac_ecs_ex_info ecs_info; +}; + +int edac_dev_register(struct device *parent, char *dev_name, + void *parent_pvt_data, int num_features, + const struct edac_dev_feature *ras_features); #endif /* _LINUX_EDAC_H_ */ diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 78318d49276d..d8eabbf86a5b 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -167,13 +167,13 @@ struct em_data_callback { struct em_perf_domain *em_cpu_get(int cpu); struct em_perf_domain *em_pd_get(struct device *dev); int em_dev_update_perf_domain(struct device *dev, - struct em_perf_table __rcu *new_table); + struct em_perf_table *new_table); int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, - struct em_data_callback *cb, cpumask_t *span, - bool microwatts); + const struct em_data_callback *cb, + const cpumask_t *cpus, bool microwatts); void em_dev_unregister_perf_domain(struct device *dev); -struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd); -void em_table_free(struct em_perf_table __rcu *table); +struct em_perf_table *em_table_alloc(struct em_perf_domain *pd); +void em_table_free(struct em_perf_table *table); int em_dev_compute_costs(struct device *dev, struct em_perf_state *table, int nr_states); int em_dev_update_chip_binning(struct device *dev); @@ -240,9 +240,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, struct em_perf_state *ps; int i; -#ifdef CONFIG_SCHED_DEBUG WARN_ONCE(!rcu_read_lock_held(), "EM: rcu read lock needed\n"); -#endif if (!sum_util) return 0; @@ -346,8 +344,8 @@ struct em_data_callback {}; static inline int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, - struct em_data_callback *cb, cpumask_t *span, - bool microwatts) + const struct em_data_callback *cb, + const cpumask_t *cpus, bool microwatts) { return -EINVAL; } @@ -373,14 +371,14 @@ static inline int em_pd_nr_perf_states(struct em_perf_domain *pd) return 0; } static inline -struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd) +struct em_perf_table *em_table_alloc(struct em_perf_domain *pd) { return NULL; } -static inline void em_table_free(struct em_perf_table __rcu *table) {} +static inline void em_table_free(struct em_perf_table *table) {} static inline int em_dev_update_perf_domain(struct device *dev, - struct em_perf_table __rcu *new_table) + struct em_perf_table *new_table) { return -EINVAL; } diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index 0c0d00fcd131..ccb478eb174b 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -25,6 +25,10 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long t /* Used to release the epoll bits inside the "struct file" */ void eventpoll_release_file(struct file *file); +/* Copy ready events to userspace */ +int epoll_sendevents(struct file *file, struct epoll_event __user *events, + int maxevents); + /* * This is called from inside fs/file_table.c:__fput() to unlink files * from the eventpoll interface. We need to have this facility to cleanup diff --git a/include/linux/execmem.h b/include/linux/execmem.h index 64130ae19690..65655a5d1be2 100644 --- a/include/linux/execmem.h +++ b/include/linux/execmem.h @@ -65,6 +65,37 @@ enum execmem_range_flags { * Architectures that use EXECMEM_ROX_CACHE must implement this. */ void execmem_fill_trapping_insns(void *ptr, size_t size, bool writable); + +/** + * execmem_make_temp_rw - temporarily remap region with read-write + * permissions + * @ptr: address of the region to remap + * @size: size of the region to remap + * + * Remaps a part of the cached large page in the ROX cache in the range + * [@ptr, @ptr + @size) as writable and not executable. The caller must + * have exclusive ownership of this range and ensure nothing will try to + * execute code in this range. + * + * Return: 0 on success or negative error code on failure. + */ +int execmem_make_temp_rw(void *ptr, size_t size); + +/** + * execmem_restore_rox - restore read-only-execute permissions + * @ptr: address of the region to remap + * @size: size of the region to remap + * + * Restores read-only-execute permissions on a range [@ptr, @ptr + @size) + * after it was temporarily remapped as writable. Relies on architecture + * implementation of set_memory_rox() to restore mapping using large pages. + * + * Return: 0 on success or negative error code on failure. + */ +int execmem_restore_rox(void *ptr, size_t size); +#else +static inline int execmem_make_temp_rw(void *ptr, size_t size) { return 0; } +static inline int execmem_restore_rox(void *ptr, size_t size) { return 0; } #endif /** diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index 78f660ebc318..3c817dc6292e 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -25,7 +25,7 @@ #define FANOTIFY_FID_BITS (FAN_REPORT_DFID_NAME_TARGET) -#define FANOTIFY_INFO_MODES (FANOTIFY_FID_BITS | FAN_REPORT_PIDFD) +#define FANOTIFY_INFO_MODES (FANOTIFY_FID_BITS | FAN_REPORT_PIDFD | FAN_REPORT_MNT) /* * fanotify_init() flags that require CAP_SYS_ADMIN. @@ -38,7 +38,8 @@ FAN_REPORT_PIDFD | \ FAN_REPORT_FD_ERROR | \ FAN_UNLIMITED_QUEUE | \ - FAN_UNLIMITED_MARKS) + FAN_UNLIMITED_MARKS | \ + FAN_REPORT_MNT) /* * fanotify_init() flags that are allowed for user without CAP_SYS_ADMIN. @@ -58,7 +59,7 @@ #define FANOTIFY_INTERNAL_GROUP_FLAGS (FANOTIFY_UNPRIV) #define FANOTIFY_MARK_TYPE_BITS (FAN_MARK_INODE | FAN_MARK_MOUNT | \ - FAN_MARK_FILESYSTEM) + FAN_MARK_FILESYSTEM | FAN_MARK_MNTNS) #define FANOTIFY_MARK_CMD_BITS (FAN_MARK_ADD | FAN_MARK_REMOVE | \ FAN_MARK_FLUSH) @@ -109,10 +110,13 @@ /* Events that can only be reported with data type FSNOTIFY_EVENT_ERROR */ #define FANOTIFY_ERROR_EVENTS (FAN_FS_ERROR) +#define FANOTIFY_MOUNT_EVENTS (FAN_MNT_ATTACH | FAN_MNT_DETACH) + /* Events that user can request to be notified on */ #define FANOTIFY_EVENTS (FANOTIFY_PATH_EVENTS | \ FANOTIFY_INODE_EVENTS | \ - FANOTIFY_ERROR_EVENTS) + FANOTIFY_ERROR_EVENTS | \ + FANOTIFY_MOUNT_EVENTS) /* Extra flags that may be reported with event or control handling of events */ #define FANOTIFY_EVENT_FLAGS (FAN_EVENT_ON_CHILD | FAN_ONDIR) diff --git a/include/linux/file_ref.h b/include/linux/file_ref.h index 9b3a8d9b17ab..7db62fbc0500 100644 --- a/include/linux/file_ref.h +++ b/include/linux/file_ref.h @@ -61,6 +61,7 @@ static inline void file_ref_init(file_ref_t *ref, unsigned long cnt) atomic_long_set(&ref->refcnt, cnt - 1); } +bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt); bool __file_ref_put(file_ref_t *ref, unsigned long cnt); /** @@ -161,6 +162,39 @@ static __always_inline __must_check bool file_ref_put(file_ref_t *ref) } /** + * file_ref_put_close - drop a reference expecting it would transition to FILE_REF_NOREF + * @ref: Pointer to the reference count + * + * Semantically it is equivalent to calling file_ref_put(), but it trades lower + * performance in face of other CPUs also modifying the refcount for higher + * performance when this happens to be the last reference. + * + * For the last reference file_ref_put() issues 2 atomics. One to drop the + * reference and another to transition it to FILE_REF_DEAD. This routine does + * the work in one step, but in order to do it has to pre-read the variable which + * decreases scalability. + * + * Use with close() et al, stick to file_ref_put() by default. + */ +static __always_inline __must_check bool file_ref_put_close(file_ref_t *ref) +{ + long old, new; + + old = atomic_long_read(&ref->refcnt); + do { + if (unlikely(old < 0)) + return __file_ref_put_badval(ref, old); + + if (old == FILE_REF_ONEREF) + new = FILE_REF_DEAD; + else + new = old - 1; + } while (!atomic_long_try_cmpxchg(&ref->refcnt, &old, new)); + + return new == FILE_REF_DEAD; +} + +/** * file_ref_read - Read the number of file references * @ref: Pointer to the reference count * @@ -174,4 +208,18 @@ static inline unsigned long file_ref_read(file_ref_t *ref) return c >= FILE_REF_RELEASED ? 0 : c + 1; } +/* + * __file_ref_read_raw - Return the value stored in ref->refcnt + * @ref: Pointer to the reference count + * + * Return: The raw value found in the counter + * + * A hack for file_needs_f_pos_lock(), you probably want to use + * file_ref_read() instead. + */ +static inline unsigned long __file_ref_read_raw(file_ref_t *ref) +{ + return atomic_long_read(&ref->refcnt); +} + #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 2788df98080f..1a0e23a5d02d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2,6 +2,7 @@ #ifndef _LINUX_FS_H #define _LINUX_FS_H +#include <linux/vfsdebug.h> #include <linux/linkage.h> #include <linux/wait_bit.h> #include <linux/kdev_t.h> @@ -790,19 +791,8 @@ struct inode { static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen) { - int testlen; - - /* - * TODO: patch it into a debug-only check if relevant macros show up. - * In the meantime, since we are suffering strlen even on production kernels - * to find the right length, do a fixup if the wrong value got passed. - */ - testlen = strlen(link); - if (testlen != linklen) { - WARN_ONCE(1, "bad length passed for symlink [%s] (got %d, expected %d)", - link, linklen, testlen); - linklen = testlen; - } + VFS_WARN_ON_INODE(strlen(link) != linklen, inode); + VFS_WARN_ON_INODE(inode->i_opflags & IOP_CACHED_LINK, inode); inode->i_link = link; inode->i_linklen = linklen; inode->i_opflags |= IOP_CACHED_LINK; @@ -1067,7 +1057,6 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) /** * struct file - Represents a file - * @f_ref: reference count * @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context. * @f_mode: FMODE_* flags often used in hotpaths * @f_op: file operations @@ -1077,12 +1066,12 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) * @f_flags: file flags * @f_iocb_flags: iocb flags * @f_cred: stashed credentials of creator/opener + * @f_owner: file owner * @f_path: path of the file * @f_pos_lock: lock protecting file position * @f_pipe: specific to pipes * @f_pos: file position * @f_security: LSM security context of this file - * @f_owner: file owner * @f_wb_err: writeback error * @f_sb_err: per sb writeback errors * @f_ep: link of all epoll hooks for this file @@ -1090,9 +1079,9 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) * @f_llist: work queue entrypoint * @f_ra: file's readahead state * @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.) + * @f_ref: reference count */ struct file { - file_ref_t f_ref; spinlock_t f_lock; fmode_t f_mode; const struct file_operations *f_op; @@ -1102,6 +1091,7 @@ struct file { unsigned int f_flags; unsigned int f_iocb_flags; const struct cred *f_cred; + struct fown_struct *f_owner; /* --- cacheline 1 boundary (64 bytes) --- */ struct path f_path; union { @@ -1115,7 +1105,6 @@ struct file { void *f_security; #endif /* --- cacheline 2 boundary (128 bytes) --- */ - struct fown_struct *f_owner; errseq_t f_wb_err; errseq_t f_sb_err; #ifdef CONFIG_EPOLL @@ -1127,6 +1116,7 @@ struct file { struct file_ra_state f_ra; freeptr_t f_freeptr; }; + file_ref_t f_ref; /* --- cacheline 3 boundary (192 bytes) --- */ } __randomize_layout __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ @@ -1981,8 +1971,8 @@ bool inode_owner_or_capable(struct mnt_idmap *idmap, */ int vfs_create(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, bool); -int vfs_mkdir(struct mnt_idmap *, struct inode *, - struct dentry *, umode_t); +struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *, + struct dentry *, umode_t); int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t); int vfs_symlink(struct mnt_idmap *, struct inode *, @@ -2039,7 +2029,7 @@ int vfs_fchown(struct file *file, uid_t user, gid_t group); int vfs_fchmod(struct file *file, umode_t mode); int vfs_utimes(const struct path *path, struct timespec64 *times); -extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #ifdef CONFIG_COMPAT extern long compat_ptr_ioctl(struct file *file, unsigned int cmd, @@ -2211,8 +2201,8 @@ struct inode_operations { int (*unlink) (struct inode *,struct dentry *); int (*symlink) (struct mnt_idmap *, struct inode *,struct dentry *, const char *); - int (*mkdir) (struct mnt_idmap *, struct inode *,struct dentry *, - umode_t); + struct dentry *(*mkdir) (struct mnt_idmap *, struct inode *, + struct dentry *, umode_t); int (*rmdir) (struct inode *,struct dentry *); int (*mknod) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t,dev_t); @@ -2616,6 +2606,7 @@ struct file_system_type { #define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */ #define FS_ALLOW_IDMAP 32 /* FS has been updated to handle vfs idmappings. */ #define FS_MGTIME 64 /* FS uses multigrain timestamps */ +#define FS_LBS 128 /* FS supports LBS */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; @@ -2653,9 +2644,6 @@ static inline bool is_mgtime(const struct inode *inode) extern struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)); -extern struct dentry *mount_single(struct file_system_type *fs_type, - int flags, void *data, - int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_nodev(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)); @@ -2794,13 +2782,13 @@ static inline bool is_idmapped_mnt(const struct vfsmount *mnt) return mnt_idmap(mnt) != &nop_mnt_idmap; } -extern long vfs_truncate(const struct path *, loff_t); +int vfs_truncate(const struct path *, loff_t); int do_truncate(struct mnt_idmap *, struct dentry *, loff_t start, unsigned int time_attrs, struct file *filp); extern int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len); -extern long do_sys_open(int dfd, const char __user *filename, int flags, - umode_t mode); +int do_sys_open(int dfd, const char __user *filename, int flags, + umode_t mode); extern struct file *file_open_name(struct filename *, int, umode_t); extern struct file *filp_open(const char *, int, umode_t); extern struct file *file_open_root(const struct path *, @@ -2851,7 +2839,10 @@ extern int filp_close(struct file *, fl_owner_t id); extern struct filename *getname_flags(const char __user *, int); extern struct filename *getname_uflags(const char __user *, int); -extern struct filename *getname(const char __user *); +static inline struct filename *getname(const char __user *name) +{ + return getname_flags(name, 0); +} extern struct filename *getname_kernel(const char *); extern struct filename *__getname_maybe_null(const char __user *); static inline struct filename *getname_maybe_null(const char __user *name, int flags) @@ -2864,6 +2855,13 @@ static inline struct filename *getname_maybe_null(const char __user *name, int f return __getname_maybe_null(name); } extern void putname(struct filename *name); +DEFINE_FREE(putname, struct filename *, if (!IS_ERR_OR_NULL(_T)) putname(_T)) + +static inline struct filename *refname(struct filename *name) +{ + atomic_inc(&name->refcnt); + return name; +} extern int finish_open(struct file *file, struct dentry *dentry, int (*open)(struct inode *, struct file *)); @@ -3297,7 +3295,11 @@ static inline void __iget(struct inode *inode) extern void iget_failed(struct inode *); extern void clear_inode(struct inode *); extern void __destroy_inode(struct inode *); -extern struct inode *new_inode_pseudo(struct super_block *sb); +struct inode *alloc_inode(struct super_block *sb); +static inline struct inode *new_inode_pseudo(struct super_block *sb) +{ + return alloc_inode(sb); +} extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *); diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index 4b4bfef6f053..a19e4bd32e4d 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -144,8 +144,6 @@ extern void put_fs_context(struct fs_context *fc); extern int vfs_parse_fs_param_source(struct fs_context *fc, struct fs_parameter *param); extern void fc_drop_locked(struct fs_context *fc); -int reconfigure_single(struct super_block *s, - int flags, void *data); extern int get_tree_nodev(struct fs_context *fc, int (*fill_super)(struct super_block *sb, diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 18855cb44b1c..56fad33043d5 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -310,10 +310,8 @@ static inline void fscrypt_prepare_dentry(struct dentry *dentry, /* crypto.c */ void fscrypt_enqueue_decrypt_work(struct work_struct *); -struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, - unsigned int len, - unsigned int offs, - gfp_t gfp_flags); +struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio, + size_t len, size_t offs, gfp_t gfp_flags); int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num, gfp_t gfp_flags); @@ -480,10 +478,8 @@ static inline void fscrypt_enqueue_decrypt_work(struct work_struct *work) { } -static inline struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, - unsigned int len, - unsigned int offs, - gfp_t gfp_flags) +static inline struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio, + size_t len, size_t offs, gfp_t gfp_flags) { return ERR_PTR(-EOPNOTSUPP); } diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 83d3ac97f826..454d8e466958 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -320,6 +320,11 @@ static inline void fsnotify_vfsmount_delete(struct vfsmount *mnt) __fsnotify_vfsmount_delete(mnt); } +static inline void fsnotify_mntns_delete(struct mnt_namespace *mntns) +{ + __fsnotify_mntns_delete(mntns); +} + /* * fsnotify_inoderemove - an inode is going away */ @@ -528,4 +533,19 @@ static inline int fsnotify_sb_error(struct super_block *sb, struct inode *inode, NULL, NULL, NULL, 0); } +static inline void fsnotify_mnt_attach(struct mnt_namespace *ns, struct vfsmount *mnt) +{ + fsnotify_mnt(FS_MNT_ATTACH, ns, mnt); +} + +static inline void fsnotify_mnt_detach(struct mnt_namespace *ns, struct vfsmount *mnt) +{ + fsnotify_mnt(FS_MNT_DETACH, ns, mnt); +} + +static inline void fsnotify_mnt_move(struct mnt_namespace *ns, struct vfsmount *mnt) +{ + fsnotify_mnt(FS_MNT_MOVE, ns, mnt); +} + #endif /* _LINUX_FS_NOTIFY_H */ diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 0d24a21a8e60..6cd8d1d28b8b 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -59,6 +59,10 @@ #define FS_PRE_ACCESS 0x00100000 /* Pre-content access hook */ +#define FS_MNT_ATTACH 0x01000000 /* Mount was attached */ +#define FS_MNT_DETACH 0x02000000 /* Mount was detached */ +#define FS_MNT_MOVE (FS_MNT_ATTACH | FS_MNT_DETACH) + /* * Set on inode mark that cares about things that happen to its children. * Always set for dnotify and inotify. @@ -80,6 +84,9 @@ */ #define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME) +/* Mount namespace events */ +#define FSNOTIFY_MNT_EVENTS (FS_MNT_ATTACH | FS_MNT_DETACH) + /* Content events can be used to inspect file content */ #define FSNOTIFY_CONTENT_PERM_EVENTS (FS_OPEN_PERM | FS_OPEN_EXEC_PERM | \ FS_ACCESS_PERM) @@ -108,6 +115,7 @@ /* Events that can be reported to backends */ #define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \ + FSNOTIFY_MNT_EVENTS | \ FS_EVENTS_POSS_ON_CHILD | \ FS_DELETE_SELF | FS_MOVE_SELF | \ FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \ @@ -298,6 +306,7 @@ enum fsnotify_data_type { FSNOTIFY_EVENT_PATH, FSNOTIFY_EVENT_INODE, FSNOTIFY_EVENT_DENTRY, + FSNOTIFY_EVENT_MNT, FSNOTIFY_EVENT_ERROR, }; @@ -318,6 +327,11 @@ static inline const struct path *file_range_path(const struct file_range *range) return range->path; } +struct fsnotify_mnt { + const struct mnt_namespace *ns; + u64 mnt_id; +}; + static inline struct inode *fsnotify_data_inode(const void *data, int data_type) { switch (data_type) { @@ -383,6 +397,24 @@ static inline struct super_block *fsnotify_data_sb(const void *data, } } +static inline const struct fsnotify_mnt *fsnotify_data_mnt(const void *data, + int data_type) +{ + switch (data_type) { + case FSNOTIFY_EVENT_MNT: + return data; + default: + return NULL; + } +} + +static inline u64 fsnotify_data_mnt_id(const void *data, int data_type) +{ + const struct fsnotify_mnt *mnt_data = fsnotify_data_mnt(data, data_type); + + return mnt_data ? mnt_data->mnt_id : 0; +} + static inline struct fs_error_report *fsnotify_data_error_report( const void *data, int data_type) @@ -420,6 +452,7 @@ enum fsnotify_iter_type { FSNOTIFY_ITER_TYPE_SB, FSNOTIFY_ITER_TYPE_PARENT, FSNOTIFY_ITER_TYPE_INODE2, + FSNOTIFY_ITER_TYPE_MNTNS, FSNOTIFY_ITER_TYPE_COUNT }; @@ -429,6 +462,7 @@ enum fsnotify_obj_type { FSNOTIFY_OBJ_TYPE_INODE, FSNOTIFY_OBJ_TYPE_VFSMOUNT, FSNOTIFY_OBJ_TYPE_SB, + FSNOTIFY_OBJ_TYPE_MNTNS, FSNOTIFY_OBJ_TYPE_COUNT, FSNOTIFY_OBJ_TYPE_DETACHED = FSNOTIFY_OBJ_TYPE_COUNT }; @@ -613,8 +647,10 @@ extern int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data extern void __fsnotify_inode_delete(struct inode *inode); extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt); extern void fsnotify_sb_delete(struct super_block *sb); +extern void __fsnotify_mntns_delete(struct mnt_namespace *mntns); extern void fsnotify_sb_free(struct super_block *sb); extern u32 fsnotify_get_cookie(void); +extern void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt); static inline __u32 fsnotify_parent_needed_mask(__u32 mask) { @@ -928,6 +964,9 @@ static inline void __fsnotify_vfsmount_delete(struct vfsmount *mnt) static inline void fsnotify_sb_delete(struct super_block *sb) {} +static inline void __fsnotify_mntns_delete(struct mnt_namespace *mntns) +{} + static inline void fsnotify_sb_free(struct super_block *sb) {} @@ -942,6 +981,9 @@ static inline u32 fsnotify_get_cookie(void) static inline void fsnotify_unmount_inodes(struct super_block *sb) {} +static inline void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt) +{} + #endif /* CONFIG_FSNOTIFY */ #endif /* __KERNEL __ */ diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index f7bfdcf0dda3..88e078871158 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -223,6 +223,11 @@ static inline void hrtimer_cancel_wait_running(struct hrtimer *timer) } #endif +static inline enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused) +{ + return HRTIMER_NORESTART; +} + /* Exported timer functions: */ /* Initialize timers: */ @@ -333,6 +338,7 @@ static inline int hrtimer_callback_running(struct hrtimer *timer) static inline void hrtimer_update_function(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *)) { +#ifdef CONFIG_PROVE_LOCKING guard(raw_spinlock_irqsave)(&timer->base->cpu_base->lock); if (WARN_ON_ONCE(hrtimer_is_queued(timer))) @@ -340,7 +346,7 @@ static inline void hrtimer_update_function(struct hrtimer *timer, if (WARN_ON_ONCE(!function)) return; - +#endif timer->function = function; } diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 4179add2864b..675959fb97ba 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -371,19 +371,6 @@ struct vmtransfer_page_packet_header { struct vmtransfer_page_range ranges[]; } __packed; -struct vmgpadl_packet_header { - struct vmpacket_descriptor d; - u32 gpadl; - u32 reserved; -} __packed; - -struct vmadd_remove_transfer_page_set { - struct vmpacket_descriptor d; - u32 gpadl; - u16 xfer_pageset_id; - u16 reserved; -} __packed; - /* * This structure defines a range in guest physical space that can be made to * look virtually contiguous. @@ -395,30 +382,6 @@ struct gpa_range { }; /* - * This is the format for an Establish Gpadl packet, which contains a handle by - * which this GPADL will be known and a set of GPA ranges associated with it. - * This can be converted to a MDL by the guest OS. If there are multiple GPA - * ranges, then the resulting MDL will be "chained," representing multiple VA - * ranges. - */ -struct vmestablish_gpadl { - struct vmpacket_descriptor d; - u32 gpadl; - u32 range_cnt; - struct gpa_range range[1]; -} __packed; - -/* - * This is the format for a Teardown Gpadl packet, which indicates that the - * GPADL handle in the Establish Gpadl packet will never be referenced again. - */ -struct vmteardown_gpadl { - struct vmpacket_descriptor d; - u32 gpadl; - u32 reserved; /* for alignment to a 8-byte boundary */ -} __packed; - -/* * This is the format for a GPA-Direct packet, which contains a set of GPA * ranges, in addition to commands and/or data. */ @@ -429,25 +392,6 @@ struct vmdata_gpa_direct { struct gpa_range range[1]; } __packed; -/* This is the format for a Additional Data Packet. */ -struct vmadditional_data { - struct vmpacket_descriptor d; - u64 total_bytes; - u32 offset; - u32 byte_cnt; - unsigned char data[1]; -} __packed; - -union vmpacket_largest_possible_header { - struct vmpacket_descriptor simple_hdr; - struct vmtransfer_page_packet_header xfer_page_hdr; - struct vmgpadl_packet_header gpadl_hdr; - struct vmadd_remove_transfer_page_set add_rm_xfer_page_hdr; - struct vmestablish_gpadl establish_gpadl_hdr; - struct vmteardown_gpadl teardown_gpadl_hdr; - struct vmdata_gpa_direct data_gpa_direct_hdr; -}; - #define VMPACKET_DATA_START_ADDRESS(__packet) \ (void *)(((unsigned char *)__packet) + \ ((struct vmpacket_descriptor)__packet)->offset8 * 8) @@ -1661,6 +1605,7 @@ int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id, const guid_t *shv_host_servie_id); int vmbus_send_modifychannel(struct vmbus_channel *channel, u32 target_vp); void vmbus_set_event(struct vmbus_channel *channel); +int vmbus_channel_set_cpu(struct vmbus_channel *channel, u32 target_cpu); /* Get the start of the ring buffer. */ static inline void * diff --git a/include/linux/idr.h b/include/linux/idr.h index da5f5fa4a3a6..cd729be369b3 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -15,6 +15,7 @@ #include <linux/radix-tree.h> #include <linux/gfp.h> #include <linux/percpu.h> +#include <linux/cleanup.h> struct idr { struct radix_tree_root idr_rt; @@ -124,6 +125,22 @@ void *idr_get_next_ul(struct idr *, unsigned long *nextid); void *idr_replace(struct idr *, void *, unsigned long id); void idr_destroy(struct idr *); +struct __class_idr { + struct idr *idr; + int id; +}; + +#define idr_null ((struct __class_idr){ NULL, -1 }) +#define take_idr_id(id) __get_and_null(id, idr_null) + +DEFINE_CLASS(idr_alloc, struct __class_idr, + if (_T.id >= 0) idr_remove(_T.idr, _T.id), + ((struct __class_idr){ + .idr = idr, + .id = idr_alloc(idr, ptr, start, end, gfp), + }), + struct idr *idr, void *ptr, int start, int end, gfp_t gfp); + /** * idr_init_base() - Initialise an IDR. * @idr: IDR handle. diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 8cd9327e4e78..c782a74d2a30 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -448,7 +448,7 @@ irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec, static inline void disable_irq_nosync_lockdep(unsigned int irq) { disable_irq_nosync(irq); -#ifdef CONFIG_LOCKDEP +#if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT) local_irq_disable(); #endif } @@ -456,22 +456,14 @@ static inline void disable_irq_nosync_lockdep(unsigned int irq) static inline void disable_irq_nosync_lockdep_irqsave(unsigned int irq, unsigned long *flags) { disable_irq_nosync(irq); -#ifdef CONFIG_LOCKDEP +#if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT) local_irq_save(*flags); #endif } -static inline void disable_irq_lockdep(unsigned int irq) -{ - disable_irq(irq); -#ifdef CONFIG_LOCKDEP - local_irq_disable(); -#endif -} - static inline void enable_irq_lockdep(unsigned int irq) { -#ifdef CONFIG_LOCKDEP +#if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT) local_irq_enable(); #endif enable_irq(irq); @@ -479,7 +471,7 @@ static inline void enable_irq_lockdep(unsigned int irq) static inline void enable_irq_lockdep_irqrestore(unsigned int irq, unsigned long *flags) { -#ifdef CONFIG_LOCKDEP +#if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT) local_irq_restore(*flags); #endif enable_irq(irq); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 75bf54e76f3b..02fe001feebb 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -56,6 +56,13 @@ struct vm_fault; * * IOMAP_F_BOUNDARY indicates that I/O and I/O completions for this iomap must * never be merged with the mapping before it. + * + * IOMAP_F_ANON_WRITE indicates that (write) I/O does not have a target block + * assigned to it yet and the file system will do that in the bio submission + * handler, splitting the I/O as needed. + * + * IOMAP_F_ATOMIC_BIO indicates that (write) I/O will be issued as an atomic + * bio, i.e. set REQ_ATOMIC. */ #define IOMAP_F_NEW (1U << 0) #define IOMAP_F_DIRTY (1U << 1) @@ -68,6 +75,8 @@ struct vm_fault; #endif /* CONFIG_BUFFER_HEAD */ #define IOMAP_F_XATTR (1U << 5) #define IOMAP_F_BOUNDARY (1U << 6) +#define IOMAP_F_ANON_WRITE (1U << 7) +#define IOMAP_F_ATOMIC_BIO (1U << 8) /* * Flags set by the core iomap code during operations: @@ -111,6 +120,8 @@ struct iomap { static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos) { + if (iomap->flags & IOMAP_F_ANON_WRITE) + return U64_MAX; /* invalid */ return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT; } @@ -182,7 +193,8 @@ struct iomap_folio_ops { #else #define IOMAP_DAX 0 #endif /* CONFIG_FS_DAX */ -#define IOMAP_ATOMIC (1 << 9) +#define IOMAP_ATOMIC (1 << 9) /* torn-write protection */ +#define IOMAP_DONTCACHE (1 << 10) struct iomap_ops { /* @@ -211,8 +223,10 @@ struct iomap_ops { * calls to iomap_iter(). Treat as read-only in the body. * @len: The remaining length of the file segment we're operating on. * It is updated at the same time as @pos. - * @processed: The number of bytes processed by the body in the most recent - * iteration, or a negative errno. 0 causes the iteration to stop. + * @iter_start_pos: The original start pos for the current iomap. Used for + * incremental iter advance. + * @status: Status of the most recent iteration. Zero on success or a negative + * errno on error. * @flags: Zero or more of the iomap_begin flags above. * @iomap: Map describing the I/O iteration * @srcmap: Source map for COW operations @@ -221,7 +235,8 @@ struct iomap_iter { struct inode *inode; loff_t pos; u64 len; - s64 processed; + loff_t iter_start_pos; + int status; unsigned flags; struct iomap iomap; struct iomap srcmap; @@ -229,20 +244,46 @@ struct iomap_iter { }; int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops); +int iomap_iter_advance(struct iomap_iter *iter, u64 *count); /** - * iomap_length - length of the current iomap iteration + * iomap_length_trim - trimmed length of the current iomap iteration * @iter: iteration structure + * @pos: File position to trim from. + * @len: Length of the mapping to trim to. * - * Returns the length that the operation applies to for the current iteration. + * Returns a trimmed length that the operation applies to for the current + * iteration. */ -static inline u64 iomap_length(const struct iomap_iter *iter) +static inline u64 iomap_length_trim(const struct iomap_iter *iter, loff_t pos, + u64 len) { u64 end = iter->iomap.offset + iter->iomap.length; if (iter->srcmap.type != IOMAP_HOLE) end = min(end, iter->srcmap.offset + iter->srcmap.length); - return min(iter->len, end - iter->pos); + return min(len, end - pos); +} + +/** + * iomap_length - length of the current iomap iteration + * @iter: iteration structure + * + * Returns the length that the operation applies to for the current iteration. + */ +static inline u64 iomap_length(const struct iomap_iter *iter) +{ + return iomap_length_trim(iter, iter->pos, iter->len); +} + +/** + * iomap_iter_advance_full - advance by the full length of current map + */ +static inline int iomap_iter_advance_full(struct iomap_iter *iter) +{ + u64 length = iomap_length(iter); + + return iomap_iter_advance(iter, &length); } /** @@ -306,12 +347,11 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio); int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, - bool *did_zero, const struct iomap_ops *ops); + bool *did_zero, const struct iomap_ops *ops, void *private); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, - const struct iomap_ops *ops); -vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, - const struct iomap_ops *ops); - + const struct iomap_ops *ops, void *private); +vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, + void *private); typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length, struct iomap *iomap); void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, @@ -328,16 +368,42 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno, const struct iomap_ops *ops); /* + * Flags for iomap_ioend->io_flags. + */ +/* shared COW extent */ +#define IOMAP_IOEND_SHARED (1U << 0) +/* unwritten extent */ +#define IOMAP_IOEND_UNWRITTEN (1U << 1) +/* don't merge into previous ioend */ +#define IOMAP_IOEND_BOUNDARY (1U << 2) +/* is direct I/O */ +#define IOMAP_IOEND_DIRECT (1U << 3) + +/* + * Flags that if set on either ioend prevent the merge of two ioends. + * (IOMAP_IOEND_BOUNDARY also prevents merges, but only one-way) + */ +#define IOMAP_IOEND_NOMERGE_FLAGS \ + (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT) + +/* * Structure for writeback I/O completions. + * + * File systems implementing ->submit_ioend (for buffered I/O) or ->submit_io + * for direct I/O) can split a bio generated by iomap. In that case the parent + * ioend it was split from is recorded in ioend->io_parent. */ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ - u16 io_type; - u16 io_flags; /* IOMAP_F_* */ + u16 io_flags; /* IOMAP_IOEND_* */ struct inode *io_inode; /* file being written to */ - size_t io_size; /* size of data within eof */ + size_t io_size; /* size of the extent */ + atomic_t io_remaining; /* completetion defer count */ + int io_error; /* stashed away status */ + struct iomap_ioend *io_parent; /* parent for completions */ loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ + void *io_private; /* file system private data */ struct bio io_bio; /* MUST BE LAST! */ }; @@ -362,12 +428,14 @@ struct iomap_writeback_ops { loff_t offset, unsigned len); /* - * Optional, allows the file systems to perform actions just before - * submitting the bio and/or override the bio end_io handler for complex - * operations like copy on write extent manipulation or unwritten extent - * conversions. + * Optional, allows the file systems to hook into bio submission, + * including overriding the bi_end_io handler. + * + * Returns 0 if the bio was successfully submitted, or a negative + * error code if status was non-zero or another error happened and + * the bio could not be submitted. */ - int (*prepare_ioend)(struct iomap_ioend *ioend, int status); + int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status); /* * Optional, allows the file system to discard state on a page where @@ -383,6 +451,10 @@ struct iomap_writepage_ctx { u32 nr_folios; /* folios added to the ioend */ }; +struct iomap_ioend *iomap_init_ioend(struct inode *inode, struct bio *bio, + loff_t file_offset, u16 ioend_flags); +struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, + unsigned int max_len, bool is_append); void iomap_finish_ioends(struct iomap_ioend *ioend, int error); void iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends); @@ -454,4 +526,6 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, # define iomap_swapfile_activate(sis, swapfile, pagespan, ops) (-EIO) #endif /* CONFIG_SWAP */ +extern struct bio_set iomap_ioend_bioset; + #endif /* LINUX_IOMAP_H */ diff --git a/include/linux/irq.h b/include/linux/irq.h index 8daa17f0107a..dd5df1e2d032 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -486,6 +486,7 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) * @ipi_send_mask: send an IPI to destination cpus in cpumask * @irq_nmi_setup: function called from core code before enabling an NMI * @irq_nmi_teardown: function called from core code after disabling an NMI + * @irq_force_complete_move: optional function to force complete pending irq move * @flags: chip specific flags */ struct irq_chip { @@ -537,6 +538,8 @@ struct irq_chip { int (*irq_nmi_setup)(struct irq_data *data); void (*irq_nmi_teardown)(struct irq_data *data); + void (*irq_force_complete_move)(struct irq_data *data); + unsigned long flags; }; @@ -612,6 +615,7 @@ extern int irq_affinity_online_cpu(unsigned int cpu); #endif #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ) +bool irq_can_move_in_process_context(struct irq_data *data); void __irq_move_irq(struct irq_data *data); static inline void irq_move_irq(struct irq_data *data) { @@ -619,11 +623,10 @@ static inline void irq_move_irq(struct irq_data *data) __irq_move_irq(data); } void irq_move_masked_irq(struct irq_data *data); -void irq_force_complete_move(struct irq_desc *desc); #else +static inline bool irq_can_move_in_process_context(struct irq_data *data) { return true; } static inline void irq_move_irq(struct irq_data *data) { } static inline void irq_move_masked_irq(struct irq_data *data) { } -static inline void irq_force_complete_move(struct irq_desc *desc) { } #endif extern int no_irq_affinity; diff --git a/include/linux/irqchip/irq-davinci-cp-intc.h b/include/linux/irqchip/irq-davinci-cp-intc.h deleted file mode 100644 index 8d71ed5b5a61..000000000000 --- a/include/linux/irqchip/irq-davinci-cp-intc.h +++ /dev/null @@ -1,25 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2019 Texas Instruments - */ - -#ifndef _LINUX_IRQ_DAVINCI_CP_INTC_ -#define _LINUX_IRQ_DAVINCI_CP_INTC_ - -#include <linux/ioport.h> - -/** - * struct davinci_cp_intc_config - configuration data for davinci-cp-intc - * driver. - * - * @reg: register range to map - * @num_irqs: number of HW interrupts supported by the controller - */ -struct davinci_cp_intc_config { - struct resource reg; - unsigned int num_irqs; -}; - -int davinci_cp_intc_init(const struct davinci_cp_intc_config *config); - -#endif /* _LINUX_IRQ_DAVINCI_CP_INTC_ */ diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index e432b6a12a32..5126482515cb 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -281,6 +281,8 @@ static inline struct fwnode_handle *irq_domain_alloc_fwnode(phys_addr_t *pa) void irq_domain_free_fwnode(struct fwnode_handle *fwnode); +DEFINE_FREE(irq_domain_free_fwnode, struct fwnode_handle *, if (_T) irq_domain_free_fwnode(_T)) + struct irq_domain_chip_generic_info; /** @@ -350,13 +352,13 @@ struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode, irq_hw_number_t first_hwirq, const struct irq_domain_ops *ops, void *host_data); -extern struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, - enum irq_domain_bus_token bus_token); -extern void irq_set_default_host(struct irq_domain *host); -extern struct irq_domain *irq_get_default_host(void); -extern int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, - irq_hw_number_t hwirq, int node, - const struct irq_affinity_desc *affinity); +struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, + enum irq_domain_bus_token bus_token); +void irq_set_default_host(struct irq_domain *host); +struct irq_domain *irq_get_default_host(void); +int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, + irq_hw_number_t hwirq, int node, + const struct irq_affinity_desc *affinity); static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node) { @@ -370,8 +372,8 @@ static inline bool is_fwnode_irqchip(const struct fwnode_handle *fwnode) return fwnode && fwnode->ops == &irqchip_fwnode_ops; } -extern void irq_domain_update_bus_token(struct irq_domain *domain, - enum irq_domain_bus_token bus_token); +void irq_domain_update_bus_token(struct irq_domain *domain, + enum irq_domain_bus_token bus_token); static inline struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode, @@ -454,7 +456,7 @@ static inline struct irq_domain *irq_domain_add_nomap(struct device_node *of_nod return IS_ERR(d) ? NULL : d; } -extern unsigned int irq_create_direct_mapping(struct irq_domain *host); +unsigned int irq_create_direct_mapping(struct irq_domain *host); #endif static inline struct irq_domain *irq_domain_add_tree(struct device_node *of_node, @@ -507,19 +509,19 @@ static inline struct irq_domain *irq_domain_create_tree(struct fwnode_handle *fw return IS_ERR(d) ? NULL : d; } -extern void irq_domain_remove(struct irq_domain *host); +void irq_domain_remove(struct irq_domain *host); -extern int irq_domain_associate(struct irq_domain *domain, unsigned int irq, - irq_hw_number_t hwirq); -extern void irq_domain_associate_many(struct irq_domain *domain, - unsigned int irq_base, - irq_hw_number_t hwirq_base, int count); +int irq_domain_associate(struct irq_domain *domain, unsigned int irq, + irq_hw_number_t hwirq); +void irq_domain_associate_many(struct irq_domain *domain, + unsigned int irq_base, + irq_hw_number_t hwirq_base, int count); -extern unsigned int irq_create_mapping_affinity(struct irq_domain *host, - irq_hw_number_t hwirq, - const struct irq_affinity_desc *affinity); -extern unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec); -extern void irq_dispose_mapping(unsigned int virq); +unsigned int irq_create_mapping_affinity(struct irq_domain *host, + irq_hw_number_t hwirq, + const struct irq_affinity_desc *affinity); +unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec); +void irq_dispose_mapping(unsigned int virq); static inline unsigned int irq_create_mapping(struct irq_domain *host, irq_hw_number_t hwirq) @@ -527,9 +529,9 @@ static inline unsigned int irq_create_mapping(struct irq_domain *host, return irq_create_mapping_affinity(host, hwirq, NULL); } -extern struct irq_desc *__irq_resolve_mapping(struct irq_domain *domain, - irq_hw_number_t hwirq, - unsigned int *irq); +struct irq_desc *__irq_resolve_mapping(struct irq_domain *domain, + irq_hw_number_t hwirq, + unsigned int *irq); static inline struct irq_desc *irq_resolve_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) @@ -587,19 +589,21 @@ int irq_reserve_ipi(struct irq_domain *domain, const struct cpumask *dest); int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest); /* V2 interfaces to support hierarchy IRQ domains. */ -extern struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, - unsigned int virq); -extern void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq, - const struct irq_chip *chip, - void *chip_data, irq_flow_handler_t handler, - void *handler_data, const char *handler_name); -extern void irq_domain_reset_irq_data(struct irq_data *irq_data); +struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, + unsigned int virq); +void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq, + const struct irq_chip *chip, + void *chip_data, irq_flow_handler_t handler, + void *handler_data, const char *handler_name); +void irq_domain_reset_irq_data(struct irq_data *irq_data); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY -extern struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, - unsigned int flags, unsigned int size, - struct fwnode_handle *fwnode, - const struct irq_domain_ops *ops, void *host_data); +struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, + unsigned int flags, + unsigned int size, + struct fwnode_handle *fwnode, + const struct irq_domain_ops *ops, + void *host_data); static inline struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *parent, unsigned int flags, @@ -613,13 +617,13 @@ static inline struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *par ops, host_data); } -extern int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, - unsigned int nr_irqs, int node, void *arg, - bool realloc, - const struct irq_affinity_desc *affinity); -extern void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs); -extern int irq_domain_activate_irq(struct irq_data *irq_data, bool early); -extern void irq_domain_deactivate_irq(struct irq_data *irq_data); +int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, + unsigned int nr_irqs, int node, void *arg, + bool realloc, + const struct irq_affinity_desc *affinity); +void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs); +int irq_domain_activate_irq(struct irq_data *irq_data, bool early); +void irq_domain_deactivate_irq(struct irq_data *irq_data); static inline int irq_domain_alloc_irqs(struct irq_domain *domain, unsigned int nr_irqs, int node, void *arg) @@ -628,32 +632,29 @@ static inline int irq_domain_alloc_irqs(struct irq_domain *domain, NULL); } -extern int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, - unsigned int irq_base, - unsigned int nr_irqs, void *arg); -extern int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, - unsigned int virq, - irq_hw_number_t hwirq, - const struct irq_chip *chip, - void *chip_data); -extern void irq_domain_free_irqs_common(struct irq_domain *domain, - unsigned int virq, - unsigned int nr_irqs); -extern void irq_domain_free_irqs_top(struct irq_domain *domain, - unsigned int virq, unsigned int nr_irqs); - -extern int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg); -extern int irq_domain_pop_irq(struct irq_domain *domain, int virq); - -extern int irq_domain_alloc_irqs_parent(struct irq_domain *domain, - unsigned int irq_base, - unsigned int nr_irqs, void *arg); - -extern void irq_domain_free_irqs_parent(struct irq_domain *domain, - unsigned int irq_base, - unsigned int nr_irqs); - -extern int irq_domain_disconnect_hierarchy(struct irq_domain *domain, +int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, + unsigned int virq, + irq_hw_number_t hwirq, + const struct irq_chip *chip, + void *chip_data); +void irq_domain_free_irqs_common(struct irq_domain *domain, + unsigned int virq, + unsigned int nr_irqs); +void irq_domain_free_irqs_top(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs); + +int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg); +int irq_domain_pop_irq(struct irq_domain *domain, int virq); + +int irq_domain_alloc_irqs_parent(struct irq_domain *domain, + unsigned int irq_base, + unsigned int nr_irqs, void *arg); + +void irq_domain_free_irqs_parent(struct irq_domain *domain, + unsigned int irq_base, + unsigned int nr_irqs); + +int irq_domain_disconnect_hierarchy(struct irq_domain *domain, unsigned int virq); static inline bool irq_domain_is_hierarchy(struct irq_domain *domain) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index f0e9f8eda7a3..c840431eadda 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -68,8 +68,6 @@ extern note_buf_t __percpu *crash_notes; #define KEXEC_CRASH_MEM_ALIGN PAGE_SIZE #endif -#define KEXEC_CORE_NOTE_NAME CRASH_CORE_NOTE_NAME - /* * This structure is used to hold the arguments that are used when loading * kernel binaries. diff --git a/include/linux/key.h b/include/linux/key.h index 074dca3222b9..ba05de8579ec 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -236,6 +236,7 @@ struct key { #define KEY_FLAG_ROOT_CAN_INVAL 7 /* set if key can be invalidated by root without permission */ #define KEY_FLAG_KEEP 8 /* set if key should not be removed */ #define KEY_FLAG_UID_KEYRING 9 /* set if key is a user or user session keyring */ +#define KEY_FLAG_FINAL_PUT 10 /* set if final put has happened on key */ /* the key type and key description string * - the desc is used to match a key against search criteria diff --git a/include/linux/kstrtox.h b/include/linux/kstrtox.h index 7fcf29a4e0de..6ea897222af1 100644 --- a/include/linux/kstrtox.h +++ b/include/linux/kstrtox.h @@ -143,6 +143,7 @@ static inline int __must_check kstrtos32_from_user(const char __user *s, size_t */ extern unsigned long simple_strtoul(const char *,char **,unsigned int); +extern unsigned long simple_strntoul(const char *,char **,unsigned int,size_t); extern long simple_strtol(const char *,char **,unsigned int); extern unsigned long long simple_strtoull(const char *,char **,unsigned int); extern long long simple_strtoll(const char *,char **,unsigned int); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f34f4cfaa513..5438a1b446a6 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -267,6 +267,7 @@ struct kvm_gfn_range { union kvm_mmu_notifier_arg arg; enum kvm_gfn_range_filter attr_filter; bool may_block; + bool lockless; }; bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); @@ -1746,7 +1747,6 @@ static inline void kvm_unregister_perf_callbacks(void) {} int kvm_arch_init_vm(struct kvm *kvm, unsigned long type); void kvm_arch_destroy_vm(struct kvm *kvm); -void kvm_arch_sync_events(struct kvm *kvm); int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h index e13d2f947b51..7283bc4cf413 100644 --- a/include/linux/lsm_audit.h +++ b/include/linux/lsm_audit.h @@ -5,7 +5,7 @@ * * Author : Etienne BASSET <etienne.basset@ensta.org> * - * All credits to : Stephen Smalley, <sds@tycho.nsa.gov> + * All credits to : Stephen Smalley * All BUGS to : Etienne BASSET <etienne.basset@ensta.org> */ #ifndef _LSM_COMMON_LOGGING_ diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index e2f1ce37c41e..2bf909fa3394 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -445,7 +445,7 @@ LSM_HOOK(int, 0, bpf_token_capable, const struct bpf_token *token, int cap) LSM_HOOK(int, 0, locked_down, enum lockdown_reason what) #ifdef CONFIG_PERF_EVENTS -LSM_HOOK(int, 0, perf_event_open, struct perf_event_attr *attr, int type) +LSM_HOOK(int, 0, perf_event_open, int type) LSM_HOOK(int, 0, perf_event_alloc, struct perf_event *event) LSM_HOOK(int, 0, perf_event_read, struct perf_event *event) LSM_HOOK(int, 0, perf_event_write, struct perf_event *event) @@ -455,6 +455,7 @@ LSM_HOOK(int, 0, perf_event_write, struct perf_event *event) LSM_HOOK(int, 0, uring_override_creds, const struct cred *new) LSM_HOOK(int, 0, uring_sqpoll, void) LSM_HOOK(int, 0, uring_cmd, struct io_uring_cmd *ioucmd) +LSM_HOOK(int, 0, uring_allowed, void) #endif /* CONFIG_IO_URING */ LSM_HOOK(void, LSM_RET_VOID, initramfs_populated, void) diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h index ae4526389261..07584c5e36fb 100644 --- a/include/linux/mem_encrypt.h +++ b/include/linux/mem_encrypt.h @@ -26,11 +26,34 @@ */ #define __sme_set(x) ((x) | sme_me_mask) #define __sme_clr(x) ((x) & ~sme_me_mask) + +#define dma_addr_encrypted(x) __sme_set(x) +#define dma_addr_canonical(x) __sme_clr(x) + #else #define __sme_set(x) (x) #define __sme_clr(x) (x) #endif +/* + * dma_addr_encrypted() and dma_addr_unencrypted() are for converting a given DMA + * address to the respective type of addressing. + * + * dma_addr_canonical() is used to reverse any conversions for encrypted/decrypted + * back to the canonical address. + */ +#ifndef dma_addr_encrypted +#define dma_addr_encrypted(x) (x) +#endif + +#ifndef dma_addr_unencrypted +#define dma_addr_unencrypted(x) (x) +#endif + +#ifndef dma_addr_canonical +#define dma_addr_canonical(x) (x) +#endif + #endif /* __ASSEMBLY__ */ #endif /* __MEM_ENCRYPT_H__ */ diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h index 49eef10c8e59..4bf261d41a6d 100644 --- a/include/linux/misc_cgroup.h +++ b/include/linux/misc_cgroup.h @@ -60,7 +60,6 @@ struct misc_cg { struct misc_res res[MISC_CG_RES_TYPES]; }; -u64 misc_cg_res_total_usage(enum misc_res_type type); int misc_cg_set_capacity(enum misc_res_type type, u64 capacity); int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount); void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount); @@ -104,11 +103,6 @@ static inline void put_misc_cg(struct misc_cg *cg) #else /* !CONFIG_CGROUP_MISC */ -static inline u64 misc_cg_res_total_usage(enum misc_res_type type) -{ - return 0; -} - static inline int misc_cg_set_capacity(enum misc_res_type type, u64 capacity) { return 0; diff --git a/include/linux/mm.h b/include/linux/mm.h index 1f80baddacc5..2edb8d14d165 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2555,7 +2555,7 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, struct task_struct *task, bool bypass_rlim); struct kvec; -struct page *get_dump_page(unsigned long addr); +struct page *get_dump_page(unsigned long addr, int *locked); bool folio_mark_dirty(struct folio *folio); bool folio_mark_dirty_lock(struct folio *folio); diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h index b1b219bc3422..e71a6070a8f8 100644 --- a/include/linux/mnt_idmapping.h +++ b/include/linux/mnt_idmapping.h @@ -25,6 +25,11 @@ static_assert(sizeof(vfsgid_t) == sizeof(kgid_t)); static_assert(offsetof(vfsuid_t, val) == offsetof(kuid_t, val)); static_assert(offsetof(vfsgid_t, val) == offsetof(kgid_t, val)); +static inline bool is_valid_mnt_idmap(const struct mnt_idmap *idmap) +{ + return idmap != &nop_mnt_idmap && idmap != &invalid_mnt_idmap; +} + #ifdef CONFIG_MULTIUSER static inline uid_t __vfsuid_val(vfsuid_t uid) { diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index d67614f7b7f1..bd7e60c0b72f 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -692,6 +692,7 @@ struct x86_cpu_id { __u16 feature; /* bit index */ /* Solely for kernel-internal use: DO NOT EXPORT to userspace! */ __u16 flags; + __u8 type; kernel_ulong_t driver_data; }; @@ -703,6 +704,7 @@ struct x86_cpu_id { #define X86_STEP_MIN 0 #define X86_STEP_MAX 0xf #define X86_FEATURE_ANY 0 /* Same as FPU, you can't test for that */ +#define X86_CPU_TYPE_ANY 0 /* * Generic table type for matching CPU features. diff --git a/include/linux/module.h b/include/linux/module.h index 30e5b19bafa9..9937e71a3b5b 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -370,7 +370,6 @@ enum mod_mem_type { struct module_memory { void *base; - void *rw_copy; bool is_rox; unsigned int size; @@ -772,16 +771,6 @@ static inline bool is_livepatch_module(struct module *mod) void set_module_sig_enforced(void); -void *__module_writable_address(struct module *mod, void *loc); - -static inline void *module_writable_address(struct module *mod, void *loc) -{ - if (!IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX) || !mod || - mod->state != MODULE_STATE_UNFORMED) - return loc; - return __module_writable_address(mod, loc); -} - #else /* !CONFIG_MODULES... */ static inline struct module *__module_address(unsigned long addr) @@ -889,11 +878,6 @@ static inline bool module_is_coming(struct module *mod) { return false; } - -static inline void *module_writable_address(struct module *mod, void *loc) -{ - return loc; -} #endif /* CONFIG_MODULES */ #ifdef CONFIG_SYSFS diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index 1f5507ba5a12..e395461d59e5 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -108,10 +108,6 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mod); -int module_post_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *mod); - #ifdef CONFIG_MODULES void flush_module_init_free_work(void); #else diff --git a/include/linux/msi.h b/include/linux/msi.h index b10093c4d00e..6d7d1947f928 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -73,7 +73,6 @@ struct msi_msg { }; }; -extern int pci_msi_ignore_mask; /* Helper functions */ struct msi_desc; struct pci_dev; @@ -81,7 +80,6 @@ struct device_attribute; struct irq_domain; struct irq_affinity_desc; -void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg); #ifdef CONFIG_GENERIC_MSI_IRQ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg); #else @@ -225,8 +223,11 @@ struct msi_dev_domain { int msi_setup_device_data(struct device *dev); -void msi_lock_descs(struct device *dev); -void msi_unlock_descs(struct device *dev); +void __msi_lock_descs(struct device *dev); +void __msi_unlock_descs(struct device *dev); + +DEFINE_LOCK_GUARD_1(msi_descs_lock, struct device, __msi_lock_descs(_T->lock), + __msi_unlock_descs(_T->lock)); struct msi_desc *msi_domain_first_desc(struct device *dev, unsigned int domid, enum msi_desc_filter filter); @@ -556,6 +557,16 @@ enum { MSI_FLAG_PCI_MSIX_ALLOC_DYN = (1 << 20), /* PCI MSIs cannot be steered separately to CPU cores */ MSI_FLAG_NO_AFFINITY = (1 << 21), + /* Inhibit usage of entry masking */ + MSI_FLAG_NO_MASK = (1 << 22), +}; + +/* + * Flags for msi_parent_ops::chip_flags + */ +enum { + MSI_CHIP_FLAG_SET_EOI = (1 << 0), + MSI_CHIP_FLAG_SET_ACK = (1 << 1), }; /** @@ -563,6 +574,8 @@ enum { * * @supported_flags: Required: The supported MSI flags of the parent domain * @required_flags: Optional: The required MSI flags of the parent MSI domain + * @chip_flags: Optional: Select MSI chip callbacks to update with defaults + * in msi_lib_init_dev_msi_info(). * @bus_select_token: Optional: The bus token of the real parent domain for * irq_domain::select() * @bus_select_mask: Optional: A mask of supported BUS_DOMAINs for @@ -575,6 +588,7 @@ enum { struct msi_parent_ops { u32 supported_flags; u32 required_flags; + u32 chip_flags; u32 bus_select_token; u32 bus_select_mask; const char *prefix; @@ -603,8 +617,6 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid); bool msi_match_device_irq_domain(struct device *dev, unsigned int domid, enum irq_domain_bus_token bus_token); -int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid, - unsigned int first, unsigned int last); int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid, unsigned int first, unsigned int last); int msi_domain_alloc_irqs_all_locked(struct device *dev, unsigned int domid, int nirqs); @@ -613,8 +625,6 @@ struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, u const struct irq_affinity_desc *affdesc, union msi_instance_cookie *cookie); -void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid, - unsigned int first, unsigned int last); void msi_domain_free_irqs_range(struct device *dev, unsigned int domid, unsigned int first, unsigned int last); void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid); diff --git a/include/linux/namei.h b/include/linux/namei.h index 8ec8fed3bce8..e3042176cdf4 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -18,35 +18,36 @@ enum { MAX_NESTED_LINKS = 8 }; enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT}; /* pathwalk mode */ -#define LOOKUP_FOLLOW 0x0001 /* follow links at the end */ -#define LOOKUP_DIRECTORY 0x0002 /* require a directory */ -#define LOOKUP_AUTOMOUNT 0x0004 /* force terminal automount */ -#define LOOKUP_EMPTY 0x4000 /* accept empty path [user_... only] */ -#define LOOKUP_DOWN 0x8000 /* follow mounts in the starting point */ -#define LOOKUP_MOUNTPOINT 0x0080 /* follow mounts in the end */ - -#define LOOKUP_REVAL 0x0020 /* tell ->d_revalidate() to trust no cache */ -#define LOOKUP_RCU 0x0040 /* RCU pathwalk mode; semi-internal */ +#define LOOKUP_FOLLOW BIT(0) /* follow links at the end */ +#define LOOKUP_DIRECTORY BIT(1) /* require a directory */ +#define LOOKUP_AUTOMOUNT BIT(2) /* force terminal automount */ +#define LOOKUP_EMPTY BIT(3) /* accept empty path [user_... only] */ +#define LOOKUP_LINKAT_EMPTY BIT(4) /* Linkat request with empty path. */ +#define LOOKUP_DOWN BIT(5) /* follow mounts in the starting point */ +#define LOOKUP_MOUNTPOINT BIT(6) /* follow mounts in the end */ +#define LOOKUP_REVAL BIT(7) /* tell ->d_revalidate() to trust no cache */ +#define LOOKUP_RCU BIT(8) /* RCU pathwalk mode; semi-internal */ +#define LOOKUP_CACHED BIT(9) /* Only do cached lookup */ +#define LOOKUP_PARENT BIT(10) /* Looking up final parent in path */ +/* 5 spare bits for pathwalk */ /* These tell filesystem methods that we are dealing with the final component... */ -#define LOOKUP_OPEN 0x0100 /* ... in open */ -#define LOOKUP_CREATE 0x0200 /* ... in object creation */ -#define LOOKUP_EXCL 0x0400 /* ... in exclusive creation */ -#define LOOKUP_RENAME_TARGET 0x0800 /* ... in destination of rename() */ +#define LOOKUP_OPEN BIT(16) /* ... in open */ +#define LOOKUP_CREATE BIT(17) /* ... in object creation */ +#define LOOKUP_EXCL BIT(18) /* ... in target must not exist */ +#define LOOKUP_RENAME_TARGET BIT(19) /* ... in destination of rename() */ -/* internal use only */ -#define LOOKUP_PARENT 0x0010 +/* 4 spare bits for intent */ /* Scoping flags for lookup. */ -#define LOOKUP_NO_SYMLINKS 0x010000 /* No symlink crossing. */ -#define LOOKUP_NO_MAGICLINKS 0x020000 /* No nd_jump_link() crossing. */ -#define LOOKUP_NO_XDEV 0x040000 /* No mountpoint crossing. */ -#define LOOKUP_BENEATH 0x080000 /* No escaping from starting point. */ -#define LOOKUP_IN_ROOT 0x100000 /* Treat dirfd as fs root. */ -#define LOOKUP_CACHED 0x200000 /* Only do cached lookup */ -#define LOOKUP_LINKAT_EMPTY 0x400000 /* Linkat request with empty path. */ +#define LOOKUP_NO_SYMLINKS BIT(24) /* No symlink crossing. */ +#define LOOKUP_NO_MAGICLINKS BIT(25) /* No nd_jump_link() crossing. */ +#define LOOKUP_NO_XDEV BIT(26) /* No mountpoint crossing. */ +#define LOOKUP_BENEATH BIT(27) /* No escaping from starting point. */ +#define LOOKUP_IN_ROOT BIT(28) /* Treat dirfd as fs root. */ /* LOOKUP_* flags which do scope-related checks based on the dirfd. */ #define LOOKUP_IS_SCOPED (LOOKUP_BENEATH | LOOKUP_IN_ROOT) +/* 3 spare bits for scoping */ extern int path_pts(struct path *path); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 9155a6ffc370..d66c61cbbd1d 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1802,7 +1802,7 @@ struct nfs_rpc_ops { int (*link) (struct inode *, struct inode *, const struct qstr *); int (*symlink) (struct inode *, struct dentry *, struct folio *, unsigned int, struct iattr *); - int (*mkdir) (struct inode *, struct dentry *, struct iattr *); + struct dentry *(*mkdir) (struct inode *, struct dentry *, struct iattr *); int (*rmdir) (struct inode *, const struct qstr *); int (*readdir) (struct nfs_readdir_arg *, struct nfs_readdir_res *); int (*mknod) (struct inode *, struct dentry *, struct iattr *, diff --git a/include/linux/nmi.h b/include/linux/nmi.h index a8dfb38c9bb6..e78fa535f61d 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -17,7 +17,6 @@ void lockup_detector_init(void); void lockup_detector_retry_init(void); void lockup_detector_soft_poweroff(void); -void lockup_detector_cleanup(void); extern int watchdog_user_enabled; extern int watchdog_thresh; @@ -37,7 +36,6 @@ extern int sysctl_hardlockup_all_cpu_backtrace; static inline void lockup_detector_init(void) { } static inline void lockup_detector_retry_init(void) { } static inline void lockup_detector_soft_poweroff(void) { } -static inline void lockup_detector_cleanup(void) { } #endif /* !CONFIG_LOCKUP_DETECTOR */ #ifdef CONFIG_SOFTLOCKUP_DETECTOR @@ -104,12 +102,10 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs); #if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF) extern void hardlockup_detector_perf_stop(void); extern void hardlockup_detector_perf_restart(void); -extern void hardlockup_detector_perf_cleanup(void); extern void hardlockup_config_perf_event(const char *str); #else static inline void hardlockup_detector_perf_stop(void) { } static inline void hardlockup_detector_perf_restart(void) { } -static inline void hardlockup_detector_perf_cleanup(void) { } static inline void hardlockup_config_perf_event(const char *str) { } #endif diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 9fd7a0ce9c1a..f0ac0633366b 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -94,7 +94,6 @@ #include <linux/bitmap.h> #include <linux/minmax.h> #include <linux/nodemask_types.h> -#include <linux/numa.h> #include <linux/random.h> extern nodemask_t _unused_nodemask_arg_; @@ -191,6 +190,13 @@ static __always_inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *s bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); } +#define nodes_copy(dst, src) __nodes_copy(&(dst), &(src), MAX_NUMNODES) +static __always_inline void __nodes_copy(nodemask_t *dstp, + const nodemask_t *srcp, unsigned int nbits) +{ + bitmap_copy(dstp->bits, srcp->bits, nbits); +} + #define nodes_complement(dst, src) \ __nodes_complement(&(dst), &(src), MAX_NUMNODES) static __always_inline void __nodes_complement(nodemask_t *dstp, diff --git a/include/linux/nodemask_types.h b/include/linux/nodemask_types.h index 6b28d97ea6ed..f850a48742f1 100644 --- a/include/linux/nodemask_types.h +++ b/include/linux/nodemask_types.h @@ -3,7 +3,16 @@ #define __LINUX_NODEMASK_TYPES_H #include <linux/bitops.h> -#include <linux/numa.h> + +#ifdef CONFIG_NODES_SHIFT +#define NODES_SHIFT CONFIG_NODES_SHIFT +#else +#define NODES_SHIFT 0 +#endif + +#define MAX_NUMNODES (1 << NODES_SHIFT) + +#define NUMA_NO_NODE (-1) typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; diff --git a/include/linux/numa.h b/include/linux/numa.h index 3567e40329eb..e6baaf6051bc 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -3,16 +3,8 @@ #define _LINUX_NUMA_H #include <linux/init.h> #include <linux/types.h> +#include <linux/nodemask.h> -#ifdef CONFIG_NODES_SHIFT -#define NODES_SHIFT CONFIG_NODES_SHIFT -#else -#define NODES_SHIFT 0 -#endif - -#define MAX_NUMNODES (1 << NODES_SHIFT) - -#define NUMA_NO_NODE (-1) #define NUMA_NO_MEMBLK (-1) static inline bool numa_valid_node(int nid) @@ -39,6 +31,8 @@ void __init alloc_offline_node_data(int nid); /* Generic implementation available */ int numa_nearest_node(int node, unsigned int state); +int nearest_node_nodemask(int node, nodemask_t *mask); + #ifndef memory_add_physaddr_to_nid int memory_add_physaddr_to_nid(u64 start); #endif @@ -55,6 +49,11 @@ static inline int numa_nearest_node(int node, unsigned int state) return NUMA_NO_NODE; } +static inline int nearest_node_nodemask(int node, nodemask_t *mask) +{ + return NUMA_NO_NODE; +} + static inline int memory_add_physaddr_to_nid(u64 start) { return 0; diff --git a/include/linux/objpool.h b/include/linux/objpool.h index cb1758eaa2d3..b713a1fe7521 100644 --- a/include/linux/objpool.h +++ b/include/linux/objpool.h @@ -170,17 +170,16 @@ static inline void *objpool_pop(struct objpool_head *pool) { void *obj = NULL; unsigned long flags; - int i, cpu; + int start, cpu; /* disable local irq to avoid preemption & interruption */ raw_local_irq_save(flags); - cpu = raw_smp_processor_id(); - for (i = 0; i < pool->nr_possible_cpus; i++) { + start = raw_smp_processor_id(); + for_each_possible_cpu_wrap(cpu, start) { obj = __objpool_try_get_slot(pool, cpu); if (obj) break; - cpu = cpumask_next_wrap(cpu, cpu_possible_mask, -1, 1); } raw_local_irq_restore(flags); diff --git a/include/linux/objtool.h b/include/linux/objtool.h index c722a921165b..3ca965a2ddc8 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -128,7 +128,7 @@ #define UNWIND_HINT(type, sp_reg, sp_offset, signal) "\n\t" #define STACK_FRAME_NON_STANDARD(func) #define STACK_FRAME_NON_STANDARD_FP(func) -#define __ASM_ANNOTATE(label, type) +#define __ASM_ANNOTATE(label, type) "" #define ASM_ANNOTATE(type) #else .macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 @@ -147,6 +147,8 @@ * these relocations will never be used for indirect calls. */ #define ANNOTATE_NOENDBR ASM_ANNOTATE(ANNOTYPE_NOENDBR) +#define ANNOTATE_NOENDBR_SYM(sym) asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOENDBR)) + /* * This should be used immediately before an indirect jump/call. It tells * objtool the subsequent indirect jump/call is vouched safe for retpoline diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 36d283552f80..df9234e5f478 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -925,14 +925,15 @@ FOLIO_FLAG_FALSE(has_hwpoisoned) enum pagetype { /* 0x00-0x7f are positive numbers, ie mapcount */ /* Reserve 0x80-0xef for mapcount overflow. */ - PGTY_buddy = 0xf0, - PGTY_offline = 0xf1, - PGTY_table = 0xf2, - PGTY_guard = 0xf3, - PGTY_hugetlb = 0xf4, - PGTY_slab = 0xf5, - PGTY_zsmalloc = 0xf6, - PGTY_unaccepted = 0xf7, + PGTY_buddy = 0xf0, + PGTY_offline = 0xf1, + PGTY_table = 0xf2, + PGTY_guard = 0xf3, + PGTY_hugetlb = 0xf4, + PGTY_slab = 0xf5, + PGTY_zsmalloc = 0xf6, + PGTY_unaccepted = 0xf7, + PGTY_large_kmalloc = 0xf8, PGTY_mapcount_underflow = 0xff }; @@ -1075,6 +1076,7 @@ PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc) * Serialized with zone lock. */ PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted) +FOLIO_TYPE_OPS(large_kmalloc, large_kmalloc) /** * PageHuge - Determine if the page belongs to hugetlbfs diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 47bfc6b1b632..d2432cb02fa0 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1044,21 +1044,23 @@ static inline pgoff_t page_pgoff(const struct folio *folio, return folio->index + folio_page_idx(folio, page); } -/* - * Return byte-offset into filesystem object for page. +/** + * folio_pos - Returns the byte position of this folio in its file. + * @folio: The folio. */ -static inline loff_t page_offset(struct page *page) +static inline loff_t folio_pos(const struct folio *folio) { - return ((loff_t)page->index) << PAGE_SHIFT; + return ((loff_t)folio->index) * PAGE_SIZE; } -/** - * folio_pos - Returns the byte position of this folio in its file. - * @folio: The folio. +/* + * Return byte-offset into filesystem object for page. */ -static inline loff_t folio_pos(struct folio *folio) +static inline loff_t page_offset(struct page *page) { - return page_offset(&folio->page); + struct folio *folio = page_folio(page); + + return folio_pos(folio) + folio_page_idx(folio, page) * PAGE_SIZE; } /* @@ -1603,34 +1605,6 @@ static inline ssize_t folio_mkwrite_check_truncate(struct folio *folio, } /** - * page_mkwrite_check_truncate - check if page was truncated - * @page: the page to check - * @inode: the inode to check the page against - * - * Returns the number of bytes in the page up to EOF, - * or -EFAULT if the page was truncated. - */ -static inline int page_mkwrite_check_truncate(struct page *page, - struct inode *inode) -{ - loff_t size = i_size_read(inode); - pgoff_t index = size >> PAGE_SHIFT; - int offset = offset_in_page(size); - - if (page->mapping != inode->i_mapping) - return -EFAULT; - - /* page is wholly inside EOF */ - if (page->index < index) - return PAGE_SIZE; - /* page is wholly past EOF */ - if (page->index > index || !offset) - return -EFAULT; - /* page is partially inside EOF */ - return offset; -} - -/** * i_blocks_per_folio - How many blocks fit in this folio. * @inode: The inode which contains the blocks. * @folio: The folio. diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 5b520fe86b60..0fcacb909778 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -26,13 +26,11 @@ #define PER_CPU_SHARED_ALIGNED_SECTION "..shared_aligned" #define PER_CPU_ALIGNED_SECTION "..shared_aligned" #endif -#define PER_CPU_FIRST_SECTION "..first" #else #define PER_CPU_SHARED_ALIGNED_SECTION "" #define PER_CPU_ALIGNED_SECTION "..shared_aligned" -#define PER_CPU_FIRST_SECTION "" #endif @@ -115,14 +113,17 @@ DEFINE_PER_CPU_SECTION(type, name, "") /* - * Declaration/definition used for per-CPU variables that must come first in - * the set of variables. + * Declaration/definition used for per-CPU variables that are frequently + * accessed and should be in a single cacheline. + * + * For use only by architecture and core code. Only use scalar or pointer + * types to maximize density. */ -#define DECLARE_PER_CPU_FIRST(type, name) \ - DECLARE_PER_CPU_SECTION(type, name, PER_CPU_FIRST_SECTION) +#define DECLARE_PER_CPU_CACHE_HOT(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, "..hot.." #name) -#define DEFINE_PER_CPU_FIRST(type, name) \ - DEFINE_PER_CPU_SECTION(type, name, PER_CPU_FIRST_SECTION) +#define DEFINE_PER_CPU_CACHE_HOT(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, "..hot.." #name) /* * Declaration/definition used for per-CPU variables that must be cacheline diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index c012df33a9f0..af7d75ede619 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -8,6 +8,7 @@ #include <linux/wait.h> #include <linux/rcu_sync.h> #include <linux/lockdep.h> +#include <linux/cleanup.h> struct percpu_rw_semaphore { struct rcu_sync rss; @@ -125,6 +126,13 @@ extern bool percpu_is_read_locked(struct percpu_rw_semaphore *); extern void percpu_down_write(struct percpu_rw_semaphore *); extern void percpu_up_write(struct percpu_rw_semaphore *); +DEFINE_GUARD(percpu_read, struct percpu_rw_semaphore *, + percpu_down_read(_T), percpu_up_read(_T)) +DEFINE_GUARD_COND(percpu_read, _try, percpu_down_read_trylock(_T)) + +DEFINE_GUARD(percpu_write, struct percpu_rw_semaphore *, + percpu_down_write(_T), percpu_up_write(_T)) + static inline bool percpu_is_write_locked(struct percpu_rw_semaphore *sem) { return atomic_read(&sem->block); diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 4b5b83677e3f..6dc5e0cd76ca 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -84,7 +84,6 @@ struct arm_pmu { struct pmu pmu; cpumask_t supported_cpus; char *name; - int pmuver; irqreturn_t (*handle_irq)(struct arm_pmu *pmu); void (*enable)(struct perf_event *event); void (*disable)(struct perf_event *event); @@ -100,20 +99,26 @@ struct arm_pmu { void (*stop)(struct arm_pmu *); void (*reset)(void *); int (*map_event)(struct perf_event *event); + /* + * Called by KVM to map the PMUv3 event space onto non-PMUv3 hardware. + */ + int (*map_pmuv3_event)(unsigned int eventsel); DECLARE_BITMAP(cntr_mask, ARMPMU_MAX_HWEVENTS); bool secure_access; /* 32-bit ARM only */ -#define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40 - DECLARE_BITMAP(pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); -#define ARMV8_PMUV3_EXT_COMMON_EVENT_BASE 0x4000 - DECLARE_BITMAP(pmceid_ext_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); struct platform_device *plat_device; struct pmu_hw_events __percpu *hw_events; struct hlist_node node; struct notifier_block cpu_pm_nb; /* the attr_groups array must be NULL-terminated */ const struct attribute_group *attr_groups[ARMPMU_NR_ATTR_GROUPS + 1]; - /* store the PMMIR_EL1 to expose slots */ + + /* PMUv3 only */ + int pmuver; u64 reg_pmmir; +#define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40 + DECLARE_BITMAP(pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); +#define ARMV8_PMUV3_EXT_COMMON_EVENT_BASE 0x4000 + DECLARE_BITMAP(pmceid_ext_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); /* Only to be used by ACPI probing code */ unsigned long acpi_cpuid; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8333f132f4a9..5a9bf15d4461 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -343,8 +343,7 @@ struct pmu { */ unsigned int scope; - int __percpu *pmu_disable_count; - struct perf_cpu_pmu_context __percpu *cpu_pmu_context; + struct perf_cpu_pmu_context * __percpu *cpu_pmu_context; atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */ int task_ctx_nr; int hrtimer_interval_ms; @@ -495,7 +494,7 @@ struct pmu { * context-switches callback */ void (*sched_task) (struct perf_event_pmu_context *pmu_ctx, - bool sched_in); + struct task_struct *task, bool sched_in); /* * Kmem cache of PMU specific data @@ -503,16 +502,6 @@ struct pmu { struct kmem_cache *task_ctx_cache; /* - * PMU specific parts of task perf event context (i.e. ctx->task_ctx_data) - * can be synchronized using this function. See Intel LBR callstack support - * implementation and Perf core context switch handling callbacks for usage - * examples. - */ - void (*swap_task_ctx) (struct perf_event_pmu_context *prev_epc, - struct perf_event_pmu_context *next_epc); - /* optional */ - - /* * Set up pmu-private data structures for an AUX area */ void *(*setup_aux) (struct perf_event *event, void **pages, @@ -673,13 +662,16 @@ struct swevent_hlist { struct rcu_head rcu_head; }; -#define PERF_ATTACH_CONTEXT 0x01 -#define PERF_ATTACH_GROUP 0x02 -#define PERF_ATTACH_TASK 0x04 -#define PERF_ATTACH_TASK_DATA 0x08 -#define PERF_ATTACH_ITRACE 0x10 -#define PERF_ATTACH_SCHED_CB 0x20 -#define PERF_ATTACH_CHILD 0x40 +#define PERF_ATTACH_CONTEXT 0x0001 +#define PERF_ATTACH_GROUP 0x0002 +#define PERF_ATTACH_TASK 0x0004 +#define PERF_ATTACH_TASK_DATA 0x0008 +#define PERF_ATTACH_GLOBAL_DATA 0x0010 +#define PERF_ATTACH_SCHED_CB 0x0020 +#define PERF_ATTACH_CHILD 0x0040 +#define PERF_ATTACH_EXCLUSIVE 0x0080 +#define PERF_ATTACH_CALLCHAIN 0x0100 +#define PERF_ATTACH_ITRACE 0x0200 struct bpf_prog; struct perf_cgroup; @@ -921,7 +913,7 @@ struct perf_event_pmu_context { struct list_head pinned_active; struct list_head flexible_active; - /* Used to avoid freeing per-cpu perf_event_pmu_context */ + /* Used to identify the per-cpu perf_event_pmu_context */ unsigned int embedded : 1; unsigned int nr_events; @@ -931,7 +923,6 @@ struct perf_event_pmu_context { atomic_t refcount; /* event <-> epc */ struct rcu_head rcu_head; - void *task_ctx_data; /* pmu specific data */ /* * Set when one or more (plausibly active) event can't be scheduled * due to pmu overcommit or pmu constraints, except tolerant to @@ -979,7 +970,6 @@ struct perf_event_context { int nr_user; int is_active; - int nr_task_data; int nr_stat; int nr_freq; int rotate_disable; @@ -1020,6 +1010,41 @@ struct perf_event_context { local_t nr_no_switch_fast; }; +/** + * struct perf_ctx_data - PMU specific data for a task + * @rcu_head: To avoid the race on free PMU specific data + * @refcount: To track users + * @global: To track system-wide users + * @ctx_cache: Kmem cache of PMU specific data + * @data: PMU specific data + * + * Currently, the struct is only used in Intel LBR call stack mode to + * save/restore the call stack of a task on context switches. + * + * The rcu_head is used to prevent the race on free the data. + * The data only be allocated when Intel LBR call stack mode is enabled. + * The data will be freed when the mode is disabled. + * The content of the data will only be accessed in context switch, which + * should be protected by rcu_read_lock(). + * + * Because of the alignment requirement of Intel Arch LBR, the Kmem cache + * is used to allocate the PMU specific data. The ctx_cache is to track + * the Kmem cache. + * + * Careful: Struct perf_ctx_data is added as a pointer in struct task_struct. + * When system-wide Intel LBR call stack mode is enabled, a buffer with + * constant size will be allocated for each task. + * Also, system memory consumption can further grow when the size of + * struct perf_ctx_data enlarges. + */ +struct perf_ctx_data { + struct rcu_head rcu_head; + refcount_t refcount; + int global; + struct kmem_cache *ctx_cache; + void *data; +}; + struct perf_cpu_pmu_context { struct perf_event_pmu_context epc; struct perf_event_pmu_context *task_epc; @@ -1029,6 +1054,7 @@ struct perf_cpu_pmu_context { int active_oncpu; int exclusive; + int pmu_disable_count; raw_spinlock_t hrtimer_lock; struct hrtimer hrtimer; @@ -1062,7 +1088,13 @@ struct perf_output_handle { struct perf_buffer *rb; unsigned long wakeup; unsigned long size; - u64 aux_flags; + union { + u64 flags; /* perf_output*() */ + u64 aux_flags; /* perf_aux_output*() */ + struct { + u64 skip_read : 1; + }; + }; union { void *addr; unsigned long head; @@ -1339,6 +1371,9 @@ static inline void perf_sample_save_brstack(struct perf_sample_data *data, if (branch_sample_hw_index(event)) size += sizeof(u64); + + brs->nr = min_t(u16, event->attr.sample_max_stack, brs->nr); + size += brs->nr * sizeof(struct perf_branch_entry); /* @@ -1646,19 +1681,10 @@ static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 } extern int sysctl_perf_event_paranoid; -extern int sysctl_perf_event_mlock; extern int sysctl_perf_event_sample_rate; -extern int sysctl_perf_cpu_time_max_percent; extern void perf_sample_event_took(u64 sample_len_ns); -int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -int perf_event_max_stack_handler(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); - /* Access to perf_event_open(2) syscall. */ #define PERF_SECURITY_OPEN 0 @@ -1672,22 +1698,22 @@ static inline int perf_is_paranoid(void) return sysctl_perf_event_paranoid > -1; } -int perf_allow_kernel(struct perf_event_attr *attr); +int perf_allow_kernel(void); -static inline int perf_allow_cpu(struct perf_event_attr *attr) +static inline int perf_allow_cpu(void) { if (sysctl_perf_event_paranoid > 0 && !perfmon_capable()) return -EACCES; - return security_perf_event_open(attr, PERF_SECURITY_CPU); + return security_perf_event_open(PERF_SECURITY_CPU); } -static inline int perf_allow_tracepoint(struct perf_event_attr *attr) +static inline int perf_allow_tracepoint(void) { if (sysctl_perf_event_paranoid > -1 && !perfmon_capable()) return -EPERM; - return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT); + return security_perf_event_open(PERF_SECURITY_TRACEPOINT); } extern int perf_exclude_event(struct perf_event *event, struct pt_regs *regs); diff --git a/include/linux/pid.h b/include/linux/pid.h index 98837a1ff0f3..311ecebd7d56 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -101,9 +101,9 @@ extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type); * these helpers must be called with the tasklist_lock write-held. */ extern void attach_pid(struct task_struct *task, enum pid_type); -extern void detach_pid(struct task_struct *task, enum pid_type); -extern void change_pid(struct task_struct *task, enum pid_type, - struct pid *pid); +void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type); +void change_pid(struct pid **pids, struct task_struct *task, enum pid_type, + struct pid *pid); extern void exchange_tids(struct task_struct *task, struct task_struct *old); extern void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type); @@ -129,6 +129,7 @@ extern struct pid *find_ge_pid(int nr, struct pid_namespace *); extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, size_t set_tid_size); extern void free_pid(struct pid *pid); +void free_pids(struct pid **pids); extern void disable_pid_allocation(struct pid_namespace *ns); /* diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index 7c830d0dec9a..05e6f8f4a026 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -6,6 +6,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); void __init pidfs_init(void); void pidfs_add_pid(struct pid *pid); void pidfs_remove_pid(struct pid *pid); +void pidfs_exit(struct task_struct *tsk); extern const struct dentry_operations pidfs_dentry_operations; #endif /* _LINUX_PID_FS_H */ diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index b698758000f8..9d42d473d201 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -108,7 +108,7 @@ struct pipe_inode_info { #ifdef CONFIG_WATCH_QUEUE bool note_loss; #endif - struct page *tmp_page; + struct page *tmp_page[2]; struct fasync_struct *fasync_readers; struct fasync_struct *fasync_writers; struct pipe_buffer *bufs; diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index 8c9df7dadd5d..a299225ab92e 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -50,7 +50,7 @@ struct platform_profile_ops { struct device *platform_profile_register(struct device *dev, const char *name, void *drvdata, const struct platform_profile_ops *ops); -int platform_profile_remove(struct device *dev); +void platform_profile_remove(struct device *dev); struct device *devm_platform_profile_register(struct device *dev, const char *name, void *drvdata, const struct platform_profile_ops *ops); diff --git a/include/linux/pm.h b/include/linux/pm.h index 78855d794342..f0bd8fbae4f2 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -597,6 +597,7 @@ enum rpm_status { RPM_RESUMING, RPM_SUSPENDED, RPM_SUSPENDING, + RPM_BLOCKED, }; /* @@ -678,9 +679,9 @@ struct dev_pm_info { bool wakeup_path:1; bool syscore:1; bool no_pm_callbacks:1; /* Owned by the PM core */ - bool async_in_progress:1; /* Owned by the PM core */ + bool work_in_progress:1; /* Owned by the PM core */ + bool smart_suspend:1; /* Owned by the PM core */ bool must_resume:1; /* Owned by the PM core */ - bool set_active:1; /* Owned by the PM core */ bool may_skip_resume:1; /* Set by subsystems */ #else bool should_wakeup:1; @@ -838,10 +839,8 @@ extern int pm_generic_resume_early(struct device *dev); extern int pm_generic_resume_noirq(struct device *dev); extern int pm_generic_resume(struct device *dev); extern int pm_generic_freeze_noirq(struct device *dev); -extern int pm_generic_freeze_late(struct device *dev); extern int pm_generic_freeze(struct device *dev); extern int pm_generic_thaw_noirq(struct device *dev); -extern int pm_generic_thaw_early(struct device *dev); extern int pm_generic_thaw(struct device *dev); extern int pm_generic_restore_noirq(struct device *dev); extern int pm_generic_restore_early(struct device *dev); @@ -883,10 +882,8 @@ static inline void dpm_for_each_dev(void *data, void (*fn)(struct device *, void #define pm_generic_resume_noirq NULL #define pm_generic_resume NULL #define pm_generic_freeze_noirq NULL -#define pm_generic_freeze_late NULL #define pm_generic_freeze NULL #define pm_generic_thaw_noirq NULL -#define pm_generic_thaw_early NULL #define pm_generic_thaw NULL #define pm_generic_restore_noirq NULL #define pm_generic_restore_early NULL diff --git a/include/linux/pm_clock.h b/include/linux/pm_clock.h index 68669ce18720..c3b46fa358d3 100644 --- a/include/linux/pm_clock.h +++ b/include/linux/pm_clock.h @@ -41,9 +41,7 @@ extern int pm_clk_create(struct device *dev); extern void pm_clk_destroy(struct device *dev); extern int pm_clk_add(struct device *dev, const char *con_id); extern int pm_clk_add_clk(struct device *dev, struct clk *clk); -extern int of_pm_clk_add_clk(struct device *dev, const char *name); extern int of_pm_clk_add_clks(struct device *dev); -extern void pm_clk_remove(struct device *dev, const char *con_id); extern void pm_clk_remove_clk(struct device *dev, struct clk *clk); extern int pm_clk_suspend(struct device *dev); extern int pm_clk_resume(struct device *dev); @@ -76,9 +74,6 @@ static inline int of_pm_clk_add_clks(struct device *dev) { return -EINVAL; } -static inline void pm_clk_remove(struct device *dev, const char *con_id) -{ -} #define pm_clk_suspend NULL #define pm_clk_resume NULL static inline void pm_clk_remove_clk(struct device *dev, struct clk *clk) diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index d39dc863f612..7fb5a459847e 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -66,6 +66,7 @@ static inline bool queue_pm_work(struct work_struct *work) extern int pm_generic_runtime_suspend(struct device *dev); extern int pm_generic_runtime_resume(struct device *dev); +extern bool pm_runtime_need_not_resume(struct device *dev); extern int pm_runtime_force_suspend(struct device *dev); extern int pm_runtime_force_resume(struct device *dev); @@ -77,6 +78,8 @@ extern int pm_runtime_get_if_in_use(struct device *dev); extern int pm_schedule_suspend(struct device *dev, unsigned int delay); extern int __pm_runtime_set_status(struct device *dev, unsigned int status); extern int pm_runtime_barrier(struct device *dev); +extern bool pm_runtime_block_if_disabled(struct device *dev); +extern void pm_runtime_unblock(struct device *dev); extern void pm_runtime_enable(struct device *dev); extern void __pm_runtime_disable(struct device *dev, bool check_resume); extern void pm_runtime_allow(struct device *dev); @@ -197,6 +200,17 @@ static inline bool pm_runtime_enabled(struct device *dev) } /** + * pm_runtime_blocked - Check if runtime PM enabling is blocked. + * @dev: Target device. + * + * Do not call this function outside system suspend/resume code paths. + */ +static inline bool pm_runtime_blocked(struct device *dev) +{ + return dev->power.last_status == RPM_BLOCKED; +} + +/** * pm_runtime_has_no_callbacks - Check if runtime PM callbacks may be present. * @dev: Target device. * @@ -241,6 +255,7 @@ static inline bool queue_pm_work(struct work_struct *work) { return false; } static inline int pm_generic_runtime_suspend(struct device *dev) { return 0; } static inline int pm_generic_runtime_resume(struct device *dev) { return 0; } +static inline bool pm_runtime_need_not_resume(struct device *dev) {return true; } static inline int pm_runtime_force_suspend(struct device *dev) { return 0; } static inline int pm_runtime_force_resume(struct device *dev) { return 0; } @@ -271,8 +286,11 @@ static inline int pm_runtime_get_if_active(struct device *dev) static inline int __pm_runtime_set_status(struct device *dev, unsigned int status) { return 0; } static inline int pm_runtime_barrier(struct device *dev) { return 0; } +static inline bool pm_runtime_block_if_disabled(struct device *dev) { return true; } +static inline void pm_runtime_unblock(struct device *dev) {} static inline void pm_runtime_enable(struct device *dev) {} static inline void __pm_runtime_disable(struct device *dev, bool c) {} +static inline bool pm_runtime_blocked(struct device *dev) { return true; } static inline void pm_runtime_allow(struct device *dev) {} static inline void pm_runtime_forbid(struct device *dev) {} @@ -556,11 +574,18 @@ static inline int pm_runtime_set_suspended(struct device *dev) * pm_runtime_disable - Disable runtime PM for a device. * @dev: Target device. * - * Prevent the runtime PM framework from working with @dev (by incrementing its - * "blocking" counter). + * Prevent the runtime PM framework from working with @dev by incrementing its + * "disable" counter. + * + * If the counter is zero when this function runs and there is a pending runtime + * resume request for @dev, it will be resumed. If the counter is still zero at + * that point, all of the pending runtime PM requests for @dev will be canceled + * and all runtime PM operations in progress involving it will be waited for to + * complete. * - * For each invocation of this function for @dev there must be a matching - * pm_runtime_enable() call in order for runtime PM to be enabled for it. + * For each invocation of this function for @dev, there must be a matching + * pm_runtime_enable() call, so that runtime PM is eventually enabled for it + * again. */ static inline void pm_runtime_disable(struct device *dev) { diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h index d501c09c60cd..51e0e8dd5f9e 100644 --- a/include/linux/pm_wakeup.h +++ b/include/linux/pm_wakeup.h @@ -205,17 +205,17 @@ static inline void device_set_awake_path(struct device *dev) static inline void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec) { - return pm_wakeup_ws_event(ws, msec, false); + pm_wakeup_ws_event(ws, msec, false); } static inline void pm_wakeup_event(struct device *dev, unsigned int msec) { - return pm_wakeup_dev_event(dev, msec, false); + pm_wakeup_dev_event(dev, msec, false); } static inline void pm_wakeup_hard_event(struct device *dev) { - return pm_wakeup_dev_event(dev, 0, true); + pm_wakeup_dev_event(dev, 0, true); } /** diff --git a/include/linux/pnp.h b/include/linux/pnp.h index b7a7158aaf65..23fe3eaf242d 100644 --- a/include/linux/pnp.h +++ b/include/linux/pnp.h @@ -290,7 +290,7 @@ static inline void pnp_set_drvdata(struct pnp_dev *pdev, void *data) } struct pnp_fixup { - char id[7]; + char id[8]; void (*quirk_function) (struct pnp_dev *dev); /* fixup function */ }; diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index f11f10c97bd9..dd48c64b605e 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -114,6 +114,7 @@ bool posixtimer_init_sigqueue(struct sigqueue *q); void posixtimer_send_sigqueue(struct k_itimer *tmr); bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq); void posixtimer_free_timer(struct k_itimer *timer); +long posixtimer_create_prctl(unsigned long ctrl); /* Init task static initializer */ #define INIT_CPU_TIMERBASE(b) { \ @@ -140,6 +141,7 @@ static inline void posixtimer_rearm_itimer(struct task_struct *p) { } static inline bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq) { return false; } static inline void posixtimer_free_timer(struct k_itimer *timer) { } +static inline long posixtimer_create_prctl(unsigned long ctrl) { return -EINVAL; } #endif #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK @@ -177,23 +179,26 @@ static inline void posix_cputimers_init_work(void) { } * @rcu: RCU head for freeing the timer. */ struct k_itimer { - struct hlist_node list; - struct hlist_node ignored_list; + /* 1st cacheline contains read-mostly fields */ struct hlist_node t_hash; - spinlock_t it_lock; - const struct k_clock *kclock; - clockid_t it_clock; + struct hlist_node list; timer_t it_id; + clockid_t it_clock; + int it_sigev_notify; + enum pid_type it_pid_type; + struct signal_struct *it_signal; + const struct k_clock *kclock; + + /* 2nd cacheline and above contain fields which are modified regularly */ + spinlock_t it_lock; int it_status; bool it_sig_periodic; s64 it_overrun; s64 it_overrun_last; unsigned int it_signal_seq; unsigned int it_sigqueue_seq; - int it_sigev_notify; - enum pid_type it_pid_type; ktime_t it_interval; - struct signal_struct *it_signal; + struct hlist_node ignored_list; union { struct pid *it_pid; struct task_struct *it_process; @@ -210,7 +215,7 @@ struct k_itimer { } alarm; } it; struct rcu_head rcu; -}; +} ____cacheline_aligned_in_smp; void run_posix_cpu_timers(void); void posix_cpu_timers_exit(struct task_struct *task); @@ -240,6 +245,13 @@ static inline void posixtimer_sigqueue_putref(struct sigqueue *q) posixtimer_putref(tmr); } + +static inline bool posixtimer_valid(const struct k_itimer *timer) +{ + unsigned long val = (unsigned long)timer->it_signal; + + return !(val & 0x1UL); +} #else /* CONFIG_POSIX_TIMERS */ static inline void posixtimer_sigqueue_getref(struct sigqueue *q) { } static inline void posixtimer_sigqueue_putref(struct sigqueue *q) { } diff --git a/include/linux/preempt.h b/include/linux/preempt.h index ca86235ac15c..b0af8d4ef6e6 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -319,6 +319,7 @@ do { \ #ifdef CONFIG_PREEMPT_NOTIFIERS struct preempt_notifier; +struct task_struct; /** * preempt_ops - notifiers called when a task is preempted and rescheduled @@ -515,6 +516,8 @@ static inline bool preempt_model_rt(void) return IS_ENABLED(CONFIG_PREEMPT_RT); } +extern const char *preempt_model_str(void); + /* * Does the preemption model allow non-cooperative preemption? * diff --git a/include/linux/printk.h b/include/linux/printk.h index 4217a9f412b2..5b462029d03c 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -207,6 +207,7 @@ void printk_legacy_allow_panic_sync(void); extern bool nbcon_device_try_acquire(struct console *con); extern void nbcon_device_release(struct console *con); void nbcon_atomic_flush_unsafe(void); +bool pr_flush(int timeout_ms, bool reset_on_progress); #else static inline __printf(1, 0) int vprintk(const char *s, va_list args) @@ -315,6 +316,11 @@ static inline void nbcon_atomic_flush_unsafe(void) { } +static inline bool pr_flush(int timeout_ms, bool reset_on_progress) +{ + return true; +} + #endif bool this_cpu_in_panic(void); diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 48e5c03df1dd..f8159f8a7d73 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -95,9 +95,9 @@ static inline void __rcu_read_lock(void) static inline void __rcu_read_unlock(void) { - preempt_enable(); if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) rcu_read_unlock_strict(); + preempt_enable(); } static inline int rcu_preempt_depth(void) @@ -121,12 +121,6 @@ void rcu_init(void); extern int rcu_scheduler_active; void rcu_sched_clock_irq(int user); -#ifdef CONFIG_TASKS_RCU_GENERIC -void rcu_init_tasks_generic(void); -#else -static inline void rcu_init_tasks_generic(void) { } -#endif - #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); void rcu_sysrq_end(void); @@ -806,11 +800,9 @@ do { \ * sections, invocation of the corresponding RCU callback is deferred * until after the all the other CPUs exit their critical sections. * - * In v5.0 and later kernels, synchronize_rcu() and call_rcu() also - * wait for regions of code with preemption disabled, including regions of - * code with interrupts or softirqs disabled. In pre-v5.0 kernels, which - * define synchronize_sched(), only code enclosed within rcu_read_lock() - * and rcu_read_unlock() are guaranteed to be waited for. + * Both synchronize_rcu() and call_rcu() also wait for regions of code + * with preemption disabled, including regions of code with interrupts or + * softirqs disabled. * * Note, however, that RCU callbacks are permitted to run concurrently * with new RCU read-side critical sections. One way that this can happen @@ -865,11 +857,10 @@ static __always_inline void rcu_read_lock(void) * rcu_read_unlock() - marks the end of an RCU read-side critical section. * * In almost all situations, rcu_read_unlock() is immune from deadlock. - * In recent kernels that have consolidated synchronize_sched() and - * synchronize_rcu_bh() into synchronize_rcu(), this deadlock immunity - * also extends to the scheduler's runqueue and priority-inheritance - * spinlocks, courtesy of the quiescent-state deferral that is carried - * out when rcu_read_unlock() is invoked with interrupts disabled. + * This deadlock immunity also extends to the scheduler's runqueue + * and priority-inheritance spinlocks, courtesy of the quiescent-state + * deferral that is carried out when rcu_read_unlock() is invoked with + * interrupts disabled. * * See rcu_read_lock() for more information. */ @@ -1025,12 +1016,6 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) #define RCU_POINTER_INITIALIZER(p, v) \ .p = RCU_INITIALIZER(v) -/* - * Does the specified offset indicate that the corresponding rcu_head - * structure can be handled by kvfree_rcu()? - */ -#define __is_kvfree_rcu_offset(offset) ((offset) < 4096) - /** * kfree_rcu() - kfree an object after a grace period. * @ptr: pointer to kfree for double-argument invocations. @@ -1041,11 +1026,11 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) * when they are used in a kernel module, that module must invoke the * high-latency rcu_barrier() function at module-unload time. * - * The kfree_rcu() function handles this issue. Rather than encoding a - * function address in the embedded rcu_head structure, kfree_rcu() instead - * encodes the offset of the rcu_head structure within the base structure. - * Because the functions are not allowed in the low-order 4096 bytes of - * kernel virtual memory, offsets up to 4095 bytes can be accommodated. + * The kfree_rcu() function handles this issue. In order to have a universal + * callback function handling different offsets of rcu_head, the callback needs + * to determine the starting address of the freed object, which can be a large + * kmalloc or vmalloc allocation. To allow simply aligning the pointer down to + * page boundary for those, only offsets up to 4095 bytes can be accommodated. * If the offset is larger than 4095 bytes, a compile-time error will * be generated in kvfree_rcu_arg_2(). If this error is triggered, you can * either fall back to use of call_rcu() or rearrange the structure to @@ -1082,14 +1067,23 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) #define kfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) #define kvfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) +/* + * In mm/slab_common.c, no suitable header to include here. + */ +void kvfree_call_rcu(struct rcu_head *head, void *ptr); + +/* + * The BUILD_BUG_ON() makes sure the rcu_head offset can be handled. See the + * comment of kfree_rcu() for details. + */ #define kvfree_rcu_arg_2(ptr, rhf) \ do { \ typeof (ptr) ___p = (ptr); \ \ - if (___p) { \ - BUILD_BUG_ON(!__is_kvfree_rcu_offset(offsetof(typeof(*(ptr)), rhf))); \ - kvfree_call_rcu(&((___p)->rhf), (void *) (___p)); \ - } \ + if (___p) { \ + BUILD_BUG_ON(offsetof(typeof(*(ptr)), rhf) >= 4096); \ + kvfree_call_rcu(&((___p)->rhf), (void *) (___p)); \ + } \ } while (0) #define kvfree_rcu_arg_1(ptr) \ diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h index f9bed3d3f78d..4c92d4291cce 100644 --- a/include/linux/rcupdate_wait.h +++ b/include/linux/rcupdate_wait.h @@ -16,6 +16,9 @@ struct rcu_synchronize { struct rcu_head head; struct completion completion; + + /* This is for debugging. */ + struct rcu_gp_oldstate oldstate; }; void wakeme_after_rcu(struct rcu_head *head); diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index fe42315f667f..f519cd680228 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -90,41 +90,6 @@ static inline void synchronize_rcu_expedited(void) synchronize_rcu(); } -/* - * Add one more declaration of kvfree() here. It is - * not so straight forward to just include <linux/mm.h> - * where it is defined due to getting many compile - * errors caused by that include. - */ -extern void kvfree(const void *addr); - -static inline void __kvfree_call_rcu(struct rcu_head *head, void *ptr) -{ - if (head) { - call_rcu(head, (rcu_callback_t) ((void *) head - ptr)); - return; - } - - // kvfree_rcu(one_arg) call. - might_sleep(); - synchronize_rcu(); - kvfree(ptr); -} - -static inline void kvfree_rcu_barrier(void) -{ - rcu_barrier(); -} - -#ifdef CONFIG_KASAN_GENERIC -void kvfree_call_rcu(struct rcu_head *head, void *ptr); -#else -static inline void kvfree_call_rcu(struct rcu_head *head, void *ptr) -{ - __kvfree_call_rcu(head, ptr); -} -#endif - void rcu_qs(void); static inline void rcu_softirq_qs(void) @@ -164,7 +129,6 @@ static inline void rcu_end_inkernel_boot(void) { } static inline bool rcu_inkernel_boot_has_ended(void) { return true; } static inline bool rcu_is_watching(void) { return true; } static inline void rcu_momentary_eqs(void) { } -static inline void kfree_rcu_scheduler_running(void) { } /* Avoid RCU read-side critical sections leaking across. */ static inline void rcu_all_qs(void) { barrier(); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 27d86d912781..9d2d7bd251d4 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -34,12 +34,9 @@ static inline void rcu_virt_note_context_switch(void) } void synchronize_rcu_expedited(void); -void kvfree_call_rcu(struct rcu_head *head, void *ptr); -void kvfree_rcu_barrier(void); void rcu_barrier(void); void rcu_momentary_eqs(void); -void kfree_rcu_scheduler_running(void); struct rcu_gp_oldstate { unsigned long rgos_norm; @@ -103,7 +100,7 @@ extern int rcu_scheduler_active; void rcu_end_inkernel_boot(void); bool rcu_inkernel_boot_has_ended(void); bool rcu_is_watching(void); -#ifndef CONFIG_PREEMPTION +#ifndef CONFIG_PREEMPT_RCU void rcu_all_qs(void); #endif diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index d94abba1c716..880351ca3dfc 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -6,6 +6,7 @@ #include <linux/kernel.h> #include <linux/list.h> #include <linux/pid.h> +#include <linux/resctrl_types.h> /* CLOSID, RMID value used by the default control group */ #define RESCTRL_RESERVED_CLOSID 0 @@ -25,6 +26,24 @@ int proc_resctrl_show(struct seq_file *m, /* max value for struct rdt_domain's mbps_val */ #define MBA_MAX_MBPS U32_MAX +/* Walk all possible resources, with variants for only controls or monitors. */ +#define for_each_rdt_resource(_r) \ + for ((_r) = resctrl_arch_get_resource(0); \ + (_r) && (_r)->rid < RDT_NUM_RESOURCES; \ + (_r) = resctrl_arch_get_resource((_r)->rid + 1)) + +#define for_each_capable_rdt_resource(r) \ + for_each_rdt_resource((r)) \ + if ((r)->alloc_capable || (r)->mon_capable) + +#define for_each_alloc_capable_rdt_resource(r) \ + for_each_rdt_resource((r)) \ + if ((r)->alloc_capable) + +#define for_each_mon_capable_rdt_resource(r) \ + for_each_rdt_resource((r)) \ + if ((r)->mon_capable) + /** * enum resctrl_conf_type - The type of configuration. * @CDP_NONE: No prioritisation, both code and data are controlled or monitored. @@ -40,13 +59,42 @@ enum resctrl_conf_type { #define CDP_NUM_TYPES (CDP_DATA + 1) /* - * Event IDs, the values match those used to program IA32_QM_EVTSEL before - * reading IA32_QM_CTR on RDT systems. + * struct pseudo_lock_region - pseudo-lock region information + * @s: Resctrl schema for the resource to which this + * pseudo-locked region belongs + * @closid: The closid that this pseudo-locked region uses + * @d: RDT domain to which this pseudo-locked region + * belongs + * @cbm: bitmask of the pseudo-locked region + * @lock_thread_wq: waitqueue used to wait on the pseudo-locking thread + * completion + * @thread_done: variable used by waitqueue to test if pseudo-locking + * thread completed + * @cpu: core associated with the cache on which the setup code + * will be run + * @line_size: size of the cache lines + * @size: size of pseudo-locked region in bytes + * @kmem: the kernel memory associated with pseudo-locked region + * @minor: minor number of character device associated with this + * region + * @debugfs_dir: pointer to this region's directory in the debugfs + * filesystem + * @pm_reqs: Power management QoS requests related to this region */ -enum resctrl_event_id { - QOS_L3_OCCUP_EVENT_ID = 0x01, - QOS_L3_MBM_TOTAL_EVENT_ID = 0x02, - QOS_L3_MBM_LOCAL_EVENT_ID = 0x03, +struct pseudo_lock_region { + struct resctrl_schema *s; + u32 closid; + struct rdt_ctrl_domain *d; + u32 cbm; + wait_queue_head_t lock_thread_wq; + int thread_done; + int cpu; + unsigned int line_size; + unsigned int size; + void *kmem; + unsigned int minor; + struct dentry *debugfs_dir; + struct list_head pm_reqs; }; /** @@ -155,6 +203,7 @@ enum membw_throttle_mode { /** * struct resctrl_membw - Memory bandwidth allocation related data * @min_bw: Minimum memory bandwidth percentage user can request + * @max_bw: Maximum memory bandwidth value, used as the reset value * @bw_gran: Granularity at which the memory bandwidth is allocated * @delay_linear: True if memory B/W delay is in linear scale * @arch_needs_linear: True if we can't configure non-linear resources @@ -165,6 +214,7 @@ enum membw_throttle_mode { */ struct resctrl_membw { u32 min_bw; + u32 max_bw; u32 bw_gran; u32 delay_linear; bool arch_needs_linear; @@ -173,7 +223,6 @@ struct resctrl_membw { u32 *mb_map; }; -struct rdt_parse_data; struct resctrl_schema; enum resctrl_scope { @@ -183,6 +232,16 @@ enum resctrl_scope { }; /** + * enum resctrl_schema_fmt - The format user-space provides for a schema. + * @RESCTRL_SCHEMA_BITMAP: The schema is a bitmap in hex. + * @RESCTRL_SCHEMA_RANGE: The schema is a decimal number. + */ +enum resctrl_schema_fmt { + RESCTRL_SCHEMA_BITMAP, + RESCTRL_SCHEMA_RANGE, +}; + +/** * struct rdt_resource - attributes of a resctrl resource * @rid: The index of the resource * @alloc_capable: Is allocation available on this machine @@ -195,12 +254,10 @@ enum resctrl_scope { * @ctrl_domains: RCU list of all control domains for this resource * @mon_domains: RCU list of all monitor domains for this resource * @name: Name to use in "schemata" file. - * @data_width: Character width of data when displaying - * @default_ctrl: Specifies default cache cbm or memory B/W percent. - * @format_str: Per resource format string to show domain value - * @parse_ctrlval: Per resource function pointer to parse control values + * @schema_fmt: Which format string and parser is used for this schema. * @evt_list: List of monitoring events - * @fflags: flags to choose base and info files + * @mbm_cfg_mask: Bandwidth sources that can be tracked when bandwidth + * monitoring events can be configured. * @cdp_capable: Is the CDP feature available on this resource */ struct rdt_resource { @@ -215,22 +272,25 @@ struct rdt_resource { struct list_head ctrl_domains; struct list_head mon_domains; char *name; - int data_width; - u32 default_ctrl; - const char *format_str; - int (*parse_ctrlval)(struct rdt_parse_data *data, - struct resctrl_schema *s, - struct rdt_ctrl_domain *d); + enum resctrl_schema_fmt schema_fmt; struct list_head evt_list; - unsigned long fflags; + unsigned int mbm_cfg_mask; bool cdp_capable; }; +/* + * Get the resource that exists at this level. If the level is not supported + * a dummy/not-capable resource can be returned. Levels >= RDT_NUM_RESOURCES + * will return NULL. + */ +struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l); + /** * struct resctrl_schema - configuration abilities of a resource presented to * user-space * @list: Member of resctrl_schema_all. * @name: The name to use in the "schemata" file. + * @fmt_str: Format string to show domain value. * @conf_type: Whether this schema is specific to code/data. * @res: The resource structure exported by the architecture to describe * the hardware that is configured by this schema. @@ -241,16 +301,104 @@ struct rdt_resource { struct resctrl_schema { struct list_head list; char name[8]; + const char *fmt_str; enum resctrl_conf_type conf_type; struct rdt_resource *res; u32 num_closid; }; +struct resctrl_cpu_defaults { + u32 closid; + u32 rmid; +}; + +struct resctrl_mon_config_info { + struct rdt_resource *r; + struct rdt_mon_domain *d; + u32 evtid; + u32 mon_config; +}; + +/** + * resctrl_arch_sync_cpu_closid_rmid() - Refresh this CPU's CLOSID and RMID. + * Call via IPI. + * @info: If non-NULL, a pointer to a struct resctrl_cpu_defaults + * specifying the new CLOSID and RMID for tasks in the default + * resctrl ctrl and mon group when running on this CPU. If NULL, + * this CPU is not re-assigned to a different default group. + * + * Propagates reassignment of CPUs and/or tasks to different resctrl groups + * when requested by the resctrl core code. + * + * This function records the per-cpu defaults specified by @info (if any), + * and then reconfigures the CPU's hardware CLOSID and RMID for subsequent + * execution based on @current, in the same way as during a task switch. + */ +void resctrl_arch_sync_cpu_closid_rmid(void *info); + +/** + * resctrl_get_default_ctrl() - Return the default control value for this + * resource. + * @r: The resource whose default control type is queried. + */ +static inline u32 resctrl_get_default_ctrl(struct rdt_resource *r) +{ + switch (r->schema_fmt) { + case RESCTRL_SCHEMA_BITMAP: + return BIT_MASK(r->cache.cbm_len) - 1; + case RESCTRL_SCHEMA_RANGE: + return r->membw.max_bw; + } + + return WARN_ON_ONCE(1); +} + /* The number of closid supported by this resource regardless of CDP */ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); u32 resctrl_arch_system_num_rmid_idx(void); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); +__init bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt); + +/** + * resctrl_arch_mon_event_config_write() - Write the config for an event. + * @config_info: struct resctrl_mon_config_info describing the resource, domain + * and event. + * + * Reads resource, domain and eventid from @config_info and writes the + * event config_info->mon_config into hardware. + * + * Called via IPI to reach a CPU that is a member of the specified domain. + */ +void resctrl_arch_mon_event_config_write(void *config_info); + +/** + * resctrl_arch_mon_event_config_read() - Read the config for an event. + * @config_info: struct resctrl_mon_config_info describing the resource, domain + * and event. + * + * Reads resource, domain and eventid from @config_info and reads the + * hardware config value into config_info->mon_config. + * + * Called via IPI to reach a CPU that is a member of the specified domain. + */ +void resctrl_arch_mon_event_config_read(void *config_info); + +/* For use by arch code to remap resctrl's smaller CDP CLOSID range */ +static inline u32 resctrl_get_config_index(u32 closid, + enum resctrl_conf_type type) +{ + switch (type) { + default: + case CDP_NONE: + return closid; + case CDP_CODE: + return closid * 2 + 1; + case CDP_DATA: + return closid * 2; + } +} + /* * Update the ctrl_val and apply this config right now. * Must be called on one of the domain's CPUs. @@ -314,6 +462,20 @@ static inline void resctrl_arch_rmid_read_context_check(void) } /** + * resctrl_find_domain() - Search for a domain id in a resource domain list. + * @h: The domain list to search. + * @id: The domain id to search for. + * @pos: A pointer to position in the list id should be inserted. + * + * Search the domain list to find the domain id. If the domain id is + * found, return the domain. NULL otherwise. If the domain id is not + * found (and NULL returned) then the first domain with id bigger than + * the input id can be returned to the caller via @pos. + */ +struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id, + struct list_head **pos); + +/** * resctrl_arch_reset_rmid() - Reset any private state associated with rmid * and eventid. * @r: The domain's resource. @@ -340,7 +502,19 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, */ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d); +/** + * resctrl_arch_reset_all_ctrls() - Reset the control for each CLOSID to its + * default. + * @r: The resctrl resource to reset. + * + * This can be called from any CPU. + */ +void resctrl_arch_reset_all_ctrls(struct rdt_resource *r); + extern unsigned int resctrl_rmid_realloc_threshold; extern unsigned int resctrl_rmid_realloc_limit; +int __init resctrl_init(void); +void __exit resctrl_exit(void); + #endif /* _RESCTRL_H */ diff --git a/include/linux/resctrl_types.h b/include/linux/resctrl_types.h new file mode 100644 index 000000000000..f26450b3326b --- /dev/null +++ b/include/linux/resctrl_types.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2025 Arm Ltd. + * Based on arch/x86/kernel/cpu/resctrl/internal.h + */ + +#ifndef __LINUX_RESCTRL_TYPES_H +#define __LINUX_RESCTRL_TYPES_H + +/* Reads to Local DRAM Memory */ +#define READS_TO_LOCAL_MEM BIT(0) + +/* Reads to Remote DRAM Memory */ +#define READS_TO_REMOTE_MEM BIT(1) + +/* Non-Temporal Writes to Local Memory */ +#define NON_TEMP_WRITE_TO_LOCAL_MEM BIT(2) + +/* Non-Temporal Writes to Remote Memory */ +#define NON_TEMP_WRITE_TO_REMOTE_MEM BIT(3) + +/* Reads to Local Memory the system identifies as "Slow Memory" */ +#define READS_TO_LOCAL_S_MEM BIT(4) + +/* Reads to Remote Memory the system identifies as "Slow Memory" */ +#define READS_TO_REMOTE_S_MEM BIT(5) + +/* Dirty Victims to All Types of Memory */ +#define DIRTY_VICTIMS_TO_ALL_MEM BIT(6) + +/* Max event bits supported */ +#define MAX_EVT_CONFIG_BITS GENMASK(6, 0) + +enum resctrl_res_level { + RDT_RESOURCE_L3, + RDT_RESOURCE_L2, + RDT_RESOURCE_MBA, + RDT_RESOURCE_SMBA, + + /* Must be the last */ + RDT_NUM_RESOURCES, +}; + +/* + * Event IDs, the values match those used to program IA32_QM_EVTSEL before + * reading IA32_QM_CTR on RDT systems. + */ +enum resctrl_event_id { + QOS_L3_OCCUP_EVENT_ID = 0x01, + QOS_L3_MBM_TOTAL_EVENT_ID = 0x02, + QOS_L3_MBM_LOCAL_EVENT_ID = 0x03, +}; + +#endif /* __LINUX_RESCTRL_TYPES_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 9c15365a30c0..6e5c38718ff5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -65,6 +65,7 @@ struct mempolicy; struct nameidata; struct nsproxy; struct perf_event_context; +struct perf_ctx_data; struct pid_namespace; struct pipe_inode_info; struct rcu_node; @@ -382,6 +383,11 @@ enum uclamp_id { #ifdef CONFIG_SMP extern struct root_domain def_root_domain; extern struct mutex sched_domains_mutex; +extern void sched_domains_mutex_lock(void); +extern void sched_domains_mutex_unlock(void); +#else +static inline void sched_domains_mutex_lock(void) { } +static inline void sched_domains_mutex_unlock(void) { } #endif struct sched_param { @@ -1311,6 +1317,7 @@ struct task_struct { struct perf_event_context *perf_event_ctxp; struct mutex perf_event_mutex; struct list_head perf_event_list; + struct perf_ctx_data __rcu *perf_ctx_data; #endif #ifdef CONFIG_DEBUG_PREEMPT unsigned long preempt_disable_ip; diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index 3a912ab42bb5..f9aabbc9d22e 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -34,7 +34,11 @@ static inline bool dl_time_before(u64 a, u64 b) struct root_domain; extern void dl_add_task_root_domain(struct task_struct *p); extern void dl_clear_root_domain(struct root_domain *rd); +extern void dl_clear_root_domain_cpu(int cpu); #endif /* CONFIG_SMP */ +extern u64 dl_cookie; +extern bool dl_bw_visited(int cpu, u64 cookie); + #endif /* _LINUX_SCHED_DEADLINE_H */ diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h index b5035afa2396..35ed4577a6cc 100644 --- a/include/linux/sched/debug.h +++ b/include/linux/sched/debug.h @@ -35,12 +35,10 @@ extern void show_stack(struct task_struct *task, unsigned long *sp, extern void sched_show_task(struct task_struct *p); -#ifdef CONFIG_SCHED_DEBUG struct seq_file; extern void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, struct seq_file *m); extern void proc_sched_set_task(struct task_struct *p); -#endif /* Attach to any functions which should be ignored in wchan output. */ #define __sched __section(".sched.text") diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 1d70a9867fb1..f7545430a548 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -146,6 +146,7 @@ struct sched_ext_entity { u32 weight; s32 sticky_cpu; s32 holding_cpu; + s32 selected_cpu; u32 kf_mask; /* see scx_kf_mask above */ struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */ atomic_long_t ops_state; diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h index e670ac282333..439f6029d3b9 100644 --- a/include/linux/sched/idle.h +++ b/include/linux/sched/idle.h @@ -79,6 +79,21 @@ static __always_inline bool __must_check current_clr_polling_and_test(void) return unlikely(tif_need_resched()); } +static __always_inline void current_clr_polling(void) +{ + __current_clr_polling(); + + /* + * Ensure we check TIF_NEED_RESCHED after we clear the polling bit. + * Once the bit is cleared, we'll get IPIs with every new + * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also + * fold. + */ + smp_mb__after_atomic(); /* paired with resched_curr() */ + + preempt_fold_need_resched(); +} + #else static inline void __current_set_polling(void) { } static inline void __current_clr_polling(void) { } @@ -91,21 +106,15 @@ static inline bool __must_check current_clr_polling_and_test(void) { return unlikely(tif_need_resched()); } -#endif static __always_inline void current_clr_polling(void) { __current_clr_polling(); - /* - * Ensure we check TIF_NEED_RESCHED after we clear the polling bit. - * Once the bit is cleared, we'll get IPIs with every new - * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also - * fold. - */ smp_mb(); /* paired with resched_curr() */ preempt_fold_need_resched(); } +#endif #endif /* _LINUX_SCHED_IDLE_H */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 928a626725e6..b13474825130 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -531,6 +531,13 @@ enum { static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) { + /* + * The atomic_read() below prevents CSE. The following should + * help the compiler generate more efficient code on architectures + * where sync_core_before_usermode() is a no-op. + */ + if (!IS_ENABLED(CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE)) + return; if (current->mm != mm) return; if (likely(!(atomic_read(&mm->membarrier_state) & diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index d5d03d919df8..1ef1edbaaf79 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -136,7 +136,8 @@ struct signal_struct { #ifdef CONFIG_POSIX_TIMERS /* POSIX.1b Interval Timers */ - unsigned int next_posix_timer_id; + unsigned int timer_create_restore_ids:1; + atomic_t next_posix_timer_id; struct hlist_head posix_timers; struct hlist_head ignored_posix_timers; diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 7f3dbafe1817..7b4301b7235f 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -25,16 +25,12 @@ enum { }; #undef SD_FLAG -#ifdef CONFIG_SCHED_DEBUG - struct sd_flag_debug { unsigned int meta_flags; char *name; }; extern const struct sd_flag_debug sd_flag_debug[]; -#endif - #ifdef CONFIG_SCHED_SMT static inline int cpu_smt_flags(void) { @@ -166,10 +162,6 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd) return to_cpumask(sd->span); } -extern void partition_sched_domains_locked(int ndoms_new, - cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new); - extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new); @@ -211,12 +203,6 @@ extern void __init set_sched_topology(struct sched_domain_topology_level *tl); struct sched_domain_attr; static inline void -partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new) -{ -} - -static inline void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index e45531455d3b..9b959972bf4a 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -22,21 +22,17 @@ #include <linux/atomic.h> #include <asm/seccomp.h> +extern int __secure_computing(void); + #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER -extern int __secure_computing(const struct seccomp_data *sd); static inline int secure_computing(void) { if (unlikely(test_syscall_work(SECCOMP))) - return __secure_computing(NULL); + return __secure_computing(); return 0; } #else extern void secure_computing_strict(int this_syscall); -static inline int __secure_computing(const struct seccomp_data *sd) -{ - secure_computing_strict(sd->nr); - return 0; -} #endif extern long prctl_get_seccomp(void); @@ -58,7 +54,7 @@ static inline int secure_computing(void) { return 0; } #else static inline void secure_computing_strict(int this_syscall) { return; } #endif -static inline int __secure_computing(const struct seccomp_data *sd) { return 0; } +static inline int __secure_computing(void) { return 0; } static inline long prctl_get_seccomp(void) { diff --git a/include/linux/security.h b/include/linux/security.h index 980b6c207cad..1545d515a66b 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2324,14 +2324,13 @@ struct perf_event_attr; struct perf_event; #ifdef CONFIG_SECURITY -extern int security_perf_event_open(struct perf_event_attr *attr, int type); +extern int security_perf_event_open(int type); extern int security_perf_event_alloc(struct perf_event *event); extern void security_perf_event_free(struct perf_event *event); extern int security_perf_event_read(struct perf_event *event); extern int security_perf_event_write(struct perf_event *event); #else -static inline int security_perf_event_open(struct perf_event_attr *attr, - int type) +static inline int security_perf_event_open(int type) { return 0; } @@ -2362,6 +2361,7 @@ static inline int security_perf_event_write(struct perf_event *event) extern int security_uring_override_creds(const struct cred *new); extern int security_uring_sqpoll(void); extern int security_uring_cmd(struct io_uring_cmd *ioucmd); +extern int security_uring_allowed(void); #else static inline int security_uring_override_creds(const struct cred *new) { @@ -2375,6 +2375,10 @@ static inline int security_uring_cmd(struct io_uring_cmd *ioucmd) { return 0; } +static inline int security_uring_allowed(void) +{ + return 0; +} #endif /* CONFIG_SECURITY */ #endif /* CONFIG_IO_URING */ diff --git a/include/linux/sizes.h b/include/linux/sizes.h index c3a00b967d18..49039494076f 100644 --- a/include/linux/sizes.h +++ b/include/linux/sizes.h @@ -23,17 +23,25 @@ #define SZ_4K 0x00001000 #define SZ_8K 0x00002000 #define SZ_16K 0x00004000 +#define SZ_24K 0x00006000 #define SZ_32K 0x00008000 #define SZ_64K 0x00010000 #define SZ_128K 0x00020000 +#define SZ_192K 0x00030000 #define SZ_256K 0x00040000 +#define SZ_384K 0x00060000 #define SZ_512K 0x00080000 #define SZ_1M 0x00100000 #define SZ_2M 0x00200000 +#define SZ_3M 0x00300000 #define SZ_4M 0x00400000 +#define SZ_6M 0x00600000 #define SZ_8M 0x00800000 +#define SZ_12M 0x00c00000 #define SZ_16M 0x01000000 +#define SZ_18M 0x01200000 +#define SZ_24M 0x01800000 #define SZ_32M 0x02000000 #define SZ_64M 0x04000000 #define SZ_128M 0x08000000 diff --git a/include/linux/slab.h b/include/linux/slab.h index 09eedaecf120..98e07e9e9e58 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -16,6 +16,7 @@ #include <linux/gfp.h> #include <linux/overflow.h> #include <linux/types.h> +#include <linux/rcupdate.h> #include <linux/workqueue.h> #include <linux/percpu-refcount.h> #include <linux/cleanup.h> @@ -941,8 +942,6 @@ static inline __alloc_size(1, 2) void *kmalloc_array_noprof(size_t n, size_t siz if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; - if (__builtin_constant_p(n) && __builtin_constant_p(size)) - return kmalloc_noprof(bytes, flags); return kmalloc_noprof(bytes, flags); } #define kmalloc_array(...) alloc_hooks(kmalloc_array_noprof(__VA_ARGS__)) @@ -1082,6 +1081,19 @@ extern void kvfree_sensitive(const void *addr, size_t len); unsigned int kmem_cache_size(struct kmem_cache *s); +#ifndef CONFIG_KVFREE_RCU_BATCHED +static inline void kvfree_rcu_barrier(void) +{ + rcu_barrier(); +} + +static inline void kfree_rcu_scheduler_running(void) { } +#else +void kvfree_rcu_barrier(void); + +void kfree_rcu_scheduler_running(void); +#endif + /** * kmalloc_size_roundup - Report allocation bucket size for the given size * diff --git a/include/linux/srcu.h b/include/linux/srcu.h index d7ba46e74f58..900b0d5c05f5 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -47,7 +47,13 @@ int init_srcu_struct(struct srcu_struct *ssp); #define SRCU_READ_FLAVOR_NORMAL 0x1 // srcu_read_lock(). #define SRCU_READ_FLAVOR_NMI 0x2 // srcu_read_lock_nmisafe(). #define SRCU_READ_FLAVOR_LITE 0x4 // srcu_read_lock_lite(). -#define SRCU_READ_FLAVOR_ALL 0x7 // All of the above. +#define SRCU_READ_FLAVOR_FAST 0x8 // srcu_read_lock_fast(). +#define SRCU_READ_FLAVOR_ALL (SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_NMI | \ + SRCU_READ_FLAVOR_LITE | SRCU_READ_FLAVOR_FAST) // All of the above. +#define SRCU_READ_FLAVOR_SLOWGP (SRCU_READ_FLAVOR_LITE | SRCU_READ_FLAVOR_FAST) + // Flavors requiring synchronize_rcu() + // instead of smp_mb(). +void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); #ifdef CONFIG_TINY_SRCU #include <linux/srcutiny.h> @@ -60,15 +66,6 @@ int init_srcu_struct(struct srcu_struct *ssp); void call_srcu(struct srcu_struct *ssp, struct rcu_head *head, void (*func)(struct rcu_head *head)); void cleanup_srcu_struct(struct srcu_struct *ssp); -int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp); -void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); -#ifdef CONFIG_TINY_SRCU -#define __srcu_read_lock_lite __srcu_read_lock -#define __srcu_read_unlock_lite __srcu_read_unlock -#else // #ifdef CONFIG_TINY_SRCU -int __srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp); -void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) __releases(ssp); -#endif // #else // #ifdef CONFIG_TINY_SRCU void synchronize_srcu(struct srcu_struct *ssp); #define SRCU_GET_STATE_COMPLETED 0x1 @@ -258,6 +255,51 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) } /** + * srcu_read_lock_fast - register a new reader for an SRCU-protected structure. + * @ssp: srcu_struct in which to register the new reader. + * + * Enter an SRCU read-side critical section, but for a light-weight + * smp_mb()-free reader. See srcu_read_lock() for more information. + * + * If srcu_read_lock_fast() is ever used on an srcu_struct structure, + * then none of the other flavors may be used, whether before, during, + * or after. Note that grace-period auto-expediting is disabled for _fast + * srcu_struct structures because auto-expedited grace periods invoke + * synchronize_rcu_expedited(), IPIs and all. + * + * Note that srcu_read_lock_fast() can be invoked only from those contexts + * where RCU is watching, that is, from contexts where it would be legal + * to invoke rcu_read_lock(). Otherwise, lockdep will complain. + */ +static inline struct srcu_ctr __percpu *srcu_read_lock_fast(struct srcu_struct *ssp) __acquires(ssp) +{ + struct srcu_ctr __percpu *retval; + + srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST); + retval = __srcu_read_lock_fast(ssp); + rcu_try_lock_acquire(&ssp->dep_map); + return retval; +} + +/** + * srcu_down_read_fast - register a new reader for an SRCU-protected structure. + * @ssp: srcu_struct in which to register the new reader. + * + * Enter a semaphore-like SRCU read-side critical section, but for + * a light-weight smp_mb()-free reader. See srcu_read_lock_fast() and + * srcu_down_read() for more information. + * + * The same srcu_struct may be used concurrently by srcu_down_read_fast() + * and srcu_read_lock_fast(). + */ +static inline struct srcu_ctr __percpu *srcu_down_read_fast(struct srcu_struct *ssp) __acquires(ssp) +{ + WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi()); + srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST); + return __srcu_read_lock_fast(ssp); +} + +/** * srcu_read_lock_lite - register a new reader for an SRCU-protected structure. * @ssp: srcu_struct in which to register the new reader. * @@ -278,7 +320,7 @@ static inline int srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp) { int retval; - srcu_check_read_flavor_lite(ssp); + srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_LITE); retval = __srcu_read_lock_lite(ssp); rcu_try_lock_acquire(&ssp->dep_map); return retval; @@ -335,7 +377,8 @@ srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp) * srcu_down_read() nor srcu_up_read() may be invoked from an NMI handler. * * Calls to srcu_down_read() may be nested, similar to the manner in - * which calls to down_read() may be nested. + * which calls to down_read() may be nested. The same srcu_struct may be + * used concurrently by srcu_down_read() and srcu_read_lock(). */ static inline int srcu_down_read(struct srcu_struct *ssp) __acquires(ssp) { @@ -361,9 +404,40 @@ static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) } /** + * srcu_read_unlock_fast - unregister a old reader from an SRCU-protected structure. + * @ssp: srcu_struct in which to unregister the old reader. + * @scp: return value from corresponding srcu_read_lock_fast(). + * + * Exit a light-weight SRCU read-side critical section. + */ +static inline void srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) + __releases(ssp) +{ + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST); + srcu_lock_release(&ssp->dep_map); + __srcu_read_unlock_fast(ssp, scp); +} + +/** + * srcu_up_read_fast - unregister a old reader from an SRCU-protected structure. + * @ssp: srcu_struct in which to unregister the old reader. + * @scp: return value from corresponding srcu_read_lock_fast(). + * + * Exit an SRCU read-side critical section, but not necessarily from + * the same context as the maching srcu_down_read_fast(). + */ +static inline void srcu_up_read_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) + __releases(ssp) +{ + WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi()); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST); + __srcu_read_unlock_fast(ssp, scp); +} + +/** * srcu_read_unlock_lite - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. - * @idx: return value from corresponding srcu_read_lock(). + * @idx: return value from corresponding srcu_read_lock_lite(). * * Exit a light-weight SRCU read-side critical section. */ @@ -379,7 +453,7 @@ static inline void srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) /** * srcu_read_unlock_nmisafe - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. - * @idx: return value from corresponding srcu_read_lock(). + * @idx: return value from corresponding srcu_read_lock_nmisafe(). * * Exit an SRCU read-side critical section, but in an NMI-safe manner. */ diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 1321da803274..380260317d98 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -64,13 +64,38 @@ static inline int __srcu_read_lock(struct srcu_struct *ssp) { int idx; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY idx = ((READ_ONCE(ssp->srcu_idx) + 1) & 0x2) >> 1; WRITE_ONCE(ssp->srcu_lock_nesting[idx], READ_ONCE(ssp->srcu_lock_nesting[idx]) + 1); preempt_enable(); return idx; } +struct srcu_ctr; + +static inline bool __srcu_ptr_to_ctr(struct srcu_struct *ssp, struct srcu_ctr __percpu *scpp) +{ + return (int)(intptr_t)(struct srcu_ctr __force __kernel *)scpp; +} + +static inline struct srcu_ctr __percpu *__srcu_ctr_to_ptr(struct srcu_struct *ssp, int idx) +{ + return (struct srcu_ctr __percpu *)(intptr_t)idx; +} + +static inline struct srcu_ctr __percpu *__srcu_read_lock_fast(struct srcu_struct *ssp) +{ + return __srcu_ctr_to_ptr(ssp, __srcu_read_lock(ssp)); +} + +static inline void __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) +{ + __srcu_read_unlock(ssp, __srcu_ptr_to_ctr(ssp, scp)); +} + +#define __srcu_read_lock_lite __srcu_read_lock +#define __srcu_read_unlock_lite __srcu_read_unlock + static inline void synchronize_srcu_expedited(struct srcu_struct *ssp) { synchronize_srcu(ssp); @@ -82,7 +107,7 @@ static inline void srcu_barrier(struct srcu_struct *ssp) } #define srcu_check_read_flavor(ssp, read_flavor) do { } while (0) -#define srcu_check_read_flavor_lite(ssp) do { } while (0) +#define srcu_check_read_flavor_force(ssp, read_flavor) do { } while (0) /* Defined here to avoid size increase for non-torture kernels. */ static inline void srcu_torture_stats_print(struct srcu_struct *ssp, diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index b17814c9d1c7..8bed7e6cc4c1 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -17,14 +17,19 @@ struct srcu_node; struct srcu_struct; +/* One element of the srcu_data srcu_ctrs array. */ +struct srcu_ctr { + atomic_long_t srcu_locks; /* Locks per CPU. */ + atomic_long_t srcu_unlocks; /* Unlocks per CPU. */ +}; + /* * Per-CPU structure feeding into leaf srcu_node, similar in function * to rcu_node. */ struct srcu_data { /* Read-side state. */ - atomic_long_t srcu_lock_count[2]; /* Locks per CPU. */ - atomic_long_t srcu_unlock_count[2]; /* Unlocks per CPU. */ + struct srcu_ctr srcu_ctrs[2]; /* Locks and unlocks per CPU. */ int srcu_reader_flavor; /* Reader flavor for srcu_struct structure? */ /* Values: SRCU_READ_FLAVOR_.* */ @@ -95,7 +100,7 @@ struct srcu_usage { * Per-SRCU-domain structure, similar in function to rcu_state. */ struct srcu_struct { - unsigned int srcu_idx; /* Current rdr array element. */ + struct srcu_ctr __percpu *srcu_ctrp; struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ struct lockdep_map dep_map; struct srcu_usage *srcu_sup; /* Update-side data. */ @@ -162,6 +167,7 @@ struct srcu_struct { #define __SRCU_STRUCT_INIT(name, usage_name, pcpu_name) \ { \ .sda = &pcpu_name, \ + .srcu_ctrp = &pcpu_name.srcu_ctrs[0], \ __SRCU_STRUCT_INIT_COMMON(name, usage_name) \ } @@ -201,10 +207,77 @@ struct srcu_struct { #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) +int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp); void synchronize_srcu_expedited(struct srcu_struct *ssp); void srcu_barrier(struct srcu_struct *ssp); void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf); +// Converts a per-CPU pointer to an ->srcu_ctrs[] array element to that +// element's index. +static inline bool __srcu_ptr_to_ctr(struct srcu_struct *ssp, struct srcu_ctr __percpu *scpp) +{ + return scpp - &ssp->sda->srcu_ctrs[0]; +} + +// Converts an integer to a per-CPU pointer to the corresponding +// ->srcu_ctrs[] array element. +static inline struct srcu_ctr __percpu *__srcu_ctr_to_ptr(struct srcu_struct *ssp, int idx) +{ + return &ssp->sda->srcu_ctrs[idx]; +} + +/* + * Counts the new reader in the appropriate per-CPU element of the + * srcu_struct. Returns a pointer that must be passed to the matching + * srcu_read_unlock_fast(). + * + * Note that both this_cpu_inc() and atomic_long_inc() are RCU read-side + * critical sections either because they disables interrupts, because they + * are a single instruction, or because they are a read-modify-write atomic + * operation, depending on the whims of the architecture. + * + * This means that __srcu_read_lock_fast() is not all that fast + * on architectures that support NMIs but do not supply NMI-safe + * implementations of this_cpu_inc(). + */ +static inline struct srcu_ctr __percpu *__srcu_read_lock_fast(struct srcu_struct *ssp) +{ + struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp); + + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_fast()."); + if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE)) + this_cpu_inc(scp->srcu_locks.counter); /* Y */ + else + atomic_long_inc(raw_cpu_ptr(&scp->srcu_locks)); /* Z */ + barrier(); /* Avoid leaking the critical section. */ + return scp; +} + +/* + * Removes the count for the old reader from the appropriate + * per-CPU element of the srcu_struct. Note that this may well be a + * different CPU than that which was incremented by the corresponding + * srcu_read_lock_fast(), but it must be within the same task. + * + * Note that both this_cpu_inc() and atomic_long_inc() are RCU read-side + * critical sections either because they disables interrupts, because they + * are a single instruction, or because they are a read-modify-write atomic + * operation, depending on the whims of the architecture. + * + * This means that __srcu_read_unlock_fast() is not all that fast + * on architectures that support NMIs but do not supply NMI-safe + * implementations of this_cpu_inc(). + */ +static inline void __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) +{ + barrier(); /* Avoid leaking the critical section. */ + if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE)) + this_cpu_inc(scp->srcu_unlocks.counter); /* Z */ + else + atomic_long_inc(raw_cpu_ptr(&scp->srcu_unlocks)); /* Z */ + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_fast()."); +} + /* * Counts the new reader in the appropriate per-CPU element of the * srcu_struct. Returns an index that must be passed to the matching @@ -217,13 +290,12 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf); */ static inline int __srcu_read_lock_lite(struct srcu_struct *ssp) { - int idx; + struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp); RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_lite()."); - idx = READ_ONCE(ssp->srcu_idx) & 0x1; - this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter); /* Y */ + this_cpu_inc(scp->srcu_locks.counter); /* Y */ barrier(); /* Avoid leaking the critical section. */ - return idx; + return __srcu_ptr_to_ctr(ssp, scp); } /* @@ -240,22 +312,24 @@ static inline int __srcu_read_lock_lite(struct srcu_struct *ssp) static inline void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) { barrier(); /* Avoid leaking the critical section. */ - this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter); /* Z */ + this_cpu_inc(__srcu_ctr_to_ptr(ssp, idx)->srcu_unlocks.counter); /* Z */ RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_lite()."); } void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor); -// Record _lite() usage even for CONFIG_PROVE_RCU=n kernels. -static inline void srcu_check_read_flavor_lite(struct srcu_struct *ssp) +// Record reader usage even for CONFIG_PROVE_RCU=n kernels. This is +// needed only for flavors that require grace-period smp_mb() calls to be +// promoted to synchronize_rcu(). +static inline void srcu_check_read_flavor_force(struct srcu_struct *ssp, int read_flavor) { struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); - if (likely(READ_ONCE(sdp->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE)) + if (likely(READ_ONCE(sdp->srcu_reader_flavor) & read_flavor)) return; // Note that the cmpxchg() in __srcu_check_read_flavor() is fully ordered. - __srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE); + __srcu_check_read_flavor(ssp, read_flavor); } // Record non-_lite() usage only for CONFIG_PROVE_RCU=y kernels. diff --git a/include/linux/string.h b/include/linux/string.h index f8e21e80942f..0403a4ca4c11 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -415,8 +415,10 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, */ #define strtomem_pad(dest, src, pad) do { \ const size_t _dest_len = __must_be_byte_array(dest) + \ + __must_be_noncstr(dest) + \ ARRAY_SIZE(dest); \ - const size_t _src_len = __builtin_object_size(src, 1); \ + const size_t _src_len = __must_be_cstr(src) + \ + __builtin_object_size(src, 1); \ \ BUILD_BUG_ON(!__builtin_constant_p(_dest_len) || \ _dest_len == (size_t)-1); \ @@ -439,8 +441,10 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, */ #define strtomem(dest, src) do { \ const size_t _dest_len = __must_be_byte_array(dest) + \ + __must_be_noncstr(dest) + \ ARRAY_SIZE(dest); \ - const size_t _src_len = __builtin_object_size(src, 1); \ + const size_t _src_len = __must_be_cstr(src) + \ + __builtin_object_size(src, 1); \ \ BUILD_BUG_ON(!__builtin_constant_p(_dest_len) || \ _dest_len == (size_t)-1); \ @@ -459,8 +463,10 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, */ #define memtostr(dest, src) do { \ const size_t _dest_len = __must_be_byte_array(dest) + \ + __must_be_cstr(dest) + \ ARRAY_SIZE(dest); \ - const size_t _src_len = __builtin_object_size(src, 1); \ + const size_t _src_len = __must_be_noncstr(src) + \ + __builtin_object_size(src, 1); \ const size_t _src_chars = strnlen(src, _src_len); \ const size_t _copy_len = min(_dest_len - 1, _src_chars); \ \ @@ -485,8 +491,10 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, */ #define memtostr_pad(dest, src) do { \ const size_t _dest_len = __must_be_byte_array(dest) + \ + __must_be_cstr(dest) + \ ARRAY_SIZE(dest); \ - const size_t _src_len = __builtin_object_size(src, 1); \ + const size_t _src_len = __must_be_noncstr(src) + \ + __builtin_object_size(src, 1); \ const size_t _src_chars = strnlen(src, _src_len); \ const size_t _copy_len = min(_dest_len - 1, _src_chars); \ \ diff --git a/include/linux/string_choices.h b/include/linux/string_choices.h index 120ca0f28e95..f3ba4f52ff26 100644 --- a/include/linux/string_choices.h +++ b/include/linux/string_choices.h @@ -41,23 +41,23 @@ static inline const char *str_high_low(bool v) } #define str_low_high(v) str_high_low(!(v)) -static inline const char *str_read_write(bool v) -{ - return v ? "read" : "write"; -} -#define str_write_read(v) str_read_write(!(v)) - static inline const char *str_on_off(bool v) { return v ? "on" : "off"; } #define str_off_on(v) str_on_off(!(v)) -static inline const char *str_yes_no(bool v) +static inline const char *str_read_write(bool v) { - return v ? "yes" : "no"; + return v ? "read" : "write"; } -#define str_no_yes(v) str_yes_no(!(v)) +#define str_write_read(v) str_read_write(!(v)) + +static inline const char *str_true_false(bool v) +{ + return v ? "true" : "false"; +} +#define str_false_true(v) str_true_false(!(v)) static inline const char *str_up_down(bool v) { @@ -65,11 +65,11 @@ static inline const char *str_up_down(bool v) } #define str_down_up(v) str_up_down(!(v)) -static inline const char *str_true_false(bool v) +static inline const char *str_yes_no(bool v) { - return v ? "true" : "false"; + return v ? "yes" : "no"; } -#define str_false_true(v) str_true_false(!(v)) +#define str_no_yes(v) str_yes_no(!(v)) /** * str_plural - Return the simple pluralization based on English counts diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index c6333204d451..e5603cc91963 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -951,6 +951,10 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len, int flags, uint32_t sig); asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags); +asmlinkage long sys_open_tree_attr(int dfd, const char __user *path, + unsigned flags, + struct mount_attr __user *uattr, + size_t usize); asmlinkage long sys_move_mount(int from_dfd, const char __user *from_path, int to_dfd, const char __user *to_path, unsigned int ms_flags); @@ -1266,14 +1270,14 @@ static inline long ksys_lchown(const char __user *filename, uid_t user, AT_SYMLINK_NOFOLLOW); } -extern long do_sys_ftruncate(unsigned int fd, loff_t length, int small); +int do_sys_ftruncate(unsigned int fd, loff_t length, int small); static inline long ksys_ftruncate(unsigned int fd, loff_t length) { return do_sys_ftruncate(fd, length, 1); } -extern long do_sys_truncate(const char __user *pathname, loff_t length); +int do_sys_truncate(const char __user *pathname, loff_t length); static inline long ksys_truncate(const char __user *pathname, loff_t length) { diff --git a/include/linux/sysv_fs.h b/include/linux/sysv_fs.h deleted file mode 100644 index 5cf77dbb8d86..000000000000 --- a/include/linux/sysv_fs.h +++ /dev/null @@ -1,214 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_SYSV_FS_H -#define _LINUX_SYSV_FS_H - -#define __packed2__ __attribute__((packed, aligned(2))) - - -#ifndef __KERNEL__ -typedef u16 __fs16; -typedef u32 __fs16; -#endif - -/* inode numbers are 16 bit */ -typedef __fs16 sysv_ino_t; - -/* Block numbers are 24 bit, sometimes stored in 32 bit. - On Coherent FS, they are always stored in PDP-11 manner: the least - significant 16 bits come last. */ -typedef __fs32 sysv_zone_t; - -/* 0 is non-existent */ -#define SYSV_BADBL_INO 1 /* inode of bad blocks file */ -#define SYSV_ROOT_INO 2 /* inode of root directory */ - - -/* Xenix super-block data on disk */ -#define XENIX_NICINOD 100 /* number of inode cache entries */ -#define XENIX_NICFREE 100 /* number of free block list chunk entries */ -struct xenix_super_block { - __fs16 s_isize; /* index of first data zone */ - __fs32 s_fsize __packed2__; /* total number of zones of this fs */ - /* the start of the free block list: */ - __fs16 s_nfree; /* number of free blocks in s_free, <= XENIX_NICFREE */ - sysv_zone_t s_free[XENIX_NICFREE]; /* first free block list chunk */ - /* the cache of free inodes: */ - __fs16 s_ninode; /* number of free inodes in s_inode, <= XENIX_NICINOD */ - sysv_ino_t s_inode[XENIX_NICINOD]; /* some free inodes */ - /* locks, not used by Linux: */ - char s_flock; /* lock during free block list manipulation */ - char s_ilock; /* lock during inode cache manipulation */ - char s_fmod; /* super-block modified flag */ - char s_ronly; /* flag whether fs is mounted read-only */ - __fs32 s_time __packed2__; /* time of last super block update */ - __fs32 s_tfree __packed2__; /* total number of free zones */ - __fs16 s_tinode; /* total number of free inodes */ - __fs16 s_dinfo[4]; /* device information ?? */ - char s_fname[6]; /* file system volume name */ - char s_fpack[6]; /* file system pack name */ - char s_clean; /* set to 0x46 when filesystem is properly unmounted */ - char s_fill[371]; - s32 s_magic; /* version of file system */ - __fs32 s_type; /* type of file system: 1 for 512 byte blocks - 2 for 1024 byte blocks - 3 for 2048 byte blocks */ - -}; - -/* - * SystemV FS comes in two variants: - * sysv2: System V Release 2 (e.g. Microport), structure elements aligned(2). - * sysv4: System V Release 4 (e.g. Consensys), structure elements aligned(4). - */ -#define SYSV_NICINOD 100 /* number of inode cache entries */ -#define SYSV_NICFREE 50 /* number of free block list chunk entries */ - -/* SystemV4 super-block data on disk */ -struct sysv4_super_block { - __fs16 s_isize; /* index of first data zone */ - u16 s_pad0; - __fs32 s_fsize; /* total number of zones of this fs */ - /* the start of the free block list: */ - __fs16 s_nfree; /* number of free blocks in s_free, <= SYSV_NICFREE */ - u16 s_pad1; - sysv_zone_t s_free[SYSV_NICFREE]; /* first free block list chunk */ - /* the cache of free inodes: */ - __fs16 s_ninode; /* number of free inodes in s_inode, <= SYSV_NICINOD */ - u16 s_pad2; - sysv_ino_t s_inode[SYSV_NICINOD]; /* some free inodes */ - /* locks, not used by Linux: */ - char s_flock; /* lock during free block list manipulation */ - char s_ilock; /* lock during inode cache manipulation */ - char s_fmod; /* super-block modified flag */ - char s_ronly; /* flag whether fs is mounted read-only */ - __fs32 s_time; /* time of last super block update */ - __fs16 s_dinfo[4]; /* device information ?? */ - __fs32 s_tfree; /* total number of free zones */ - __fs16 s_tinode; /* total number of free inodes */ - u16 s_pad3; - char s_fname[6]; /* file system volume name */ - char s_fpack[6]; /* file system pack name */ - s32 s_fill[12]; - __fs32 s_state; /* file system state: 0x7c269d38-s_time means clean */ - s32 s_magic; /* version of file system */ - __fs32 s_type; /* type of file system: 1 for 512 byte blocks - 2 for 1024 byte blocks */ -}; - -/* SystemV2 super-block data on disk */ -struct sysv2_super_block { - __fs16 s_isize; /* index of first data zone */ - __fs32 s_fsize __packed2__; /* total number of zones of this fs */ - /* the start of the free block list: */ - __fs16 s_nfree; /* number of free blocks in s_free, <= SYSV_NICFREE */ - sysv_zone_t s_free[SYSV_NICFREE]; /* first free block list chunk */ - /* the cache of free inodes: */ - __fs16 s_ninode; /* number of free inodes in s_inode, <= SYSV_NICINOD */ - sysv_ino_t s_inode[SYSV_NICINOD]; /* some free inodes */ - /* locks, not used by Linux: */ - char s_flock; /* lock during free block list manipulation */ - char s_ilock; /* lock during inode cache manipulation */ - char s_fmod; /* super-block modified flag */ - char s_ronly; /* flag whether fs is mounted read-only */ - __fs32 s_time __packed2__; /* time of last super block update */ - __fs16 s_dinfo[4]; /* device information ?? */ - __fs32 s_tfree __packed2__; /* total number of free zones */ - __fs16 s_tinode; /* total number of free inodes */ - char s_fname[6]; /* file system volume name */ - char s_fpack[6]; /* file system pack name */ - s32 s_fill[14]; - __fs32 s_state; /* file system state: 0xcb096f43 means clean */ - s32 s_magic; /* version of file system */ - __fs32 s_type; /* type of file system: 1 for 512 byte blocks - 2 for 1024 byte blocks */ -}; - -/* V7 super-block data on disk */ -#define V7_NICINOD 100 /* number of inode cache entries */ -#define V7_NICFREE 50 /* number of free block list chunk entries */ -struct v7_super_block { - __fs16 s_isize; /* index of first data zone */ - __fs32 s_fsize __packed2__; /* total number of zones of this fs */ - /* the start of the free block list: */ - __fs16 s_nfree; /* number of free blocks in s_free, <= V7_NICFREE */ - sysv_zone_t s_free[V7_NICFREE]; /* first free block list chunk */ - /* the cache of free inodes: */ - __fs16 s_ninode; /* number of free inodes in s_inode, <= V7_NICINOD */ - sysv_ino_t s_inode[V7_NICINOD]; /* some free inodes */ - /* locks, not used by Linux or V7: */ - char s_flock; /* lock during free block list manipulation */ - char s_ilock; /* lock during inode cache manipulation */ - char s_fmod; /* super-block modified flag */ - char s_ronly; /* flag whether fs is mounted read-only */ - __fs32 s_time __packed2__; /* time of last super block update */ - /* the following fields are not maintained by V7: */ - __fs32 s_tfree __packed2__; /* total number of free zones */ - __fs16 s_tinode; /* total number of free inodes */ - __fs16 s_m; /* interleave factor */ - __fs16 s_n; /* interleave factor */ - char s_fname[6]; /* file system name */ - char s_fpack[6]; /* file system pack name */ -}; -/* Constants to aid sanity checking */ -/* This is not a hard limit, nor enforced by v7 kernel. It's actually just - * the limit used by Seventh Edition's ls, though is high enough to assume - * that no reasonable file system would have that much entries in root - * directory. Thus, if we see anything higher, we just probably got the - * endiannes wrong. */ -#define V7_NFILES 1024 -/* The disk addresses are three-byte (despite direct block addresses being - * aligned word-wise in inode). If the most significant byte is non-zero, - * something is most likely wrong (not a filesystem, bad bytesex). */ -#define V7_MAXSIZE 0x00ffffff - -/* Coherent super-block data on disk */ -#define COH_NICINOD 100 /* number of inode cache entries */ -#define COH_NICFREE 64 /* number of free block list chunk entries */ -struct coh_super_block { - __fs16 s_isize; /* index of first data zone */ - __fs32 s_fsize __packed2__; /* total number of zones of this fs */ - /* the start of the free block list: */ - __fs16 s_nfree; /* number of free blocks in s_free, <= COH_NICFREE */ - sysv_zone_t s_free[COH_NICFREE] __packed2__; /* first free block list chunk */ - /* the cache of free inodes: */ - __fs16 s_ninode; /* number of free inodes in s_inode, <= COH_NICINOD */ - sysv_ino_t s_inode[COH_NICINOD]; /* some free inodes */ - /* locks, not used by Linux: */ - char s_flock; /* lock during free block list manipulation */ - char s_ilock; /* lock during inode cache manipulation */ - char s_fmod; /* super-block modified flag */ - char s_ronly; /* flag whether fs is mounted read-only */ - __fs32 s_time __packed2__; /* time of last super block update */ - __fs32 s_tfree __packed2__; /* total number of free zones */ - __fs16 s_tinode; /* total number of free inodes */ - __fs16 s_interleave_m; /* interleave factor */ - __fs16 s_interleave_n; - char s_fname[6]; /* file system volume name */ - char s_fpack[6]; /* file system pack name */ - __fs32 s_unique; /* zero, not used */ -}; - -/* SystemV/Coherent inode data on disk */ -struct sysv_inode { - __fs16 i_mode; - __fs16 i_nlink; - __fs16 i_uid; - __fs16 i_gid; - __fs32 i_size; - u8 i_data[3*(10+1+1+1)]; - u8 i_gen; - __fs32 i_atime; /* time of last access */ - __fs32 i_mtime; /* time of last modification */ - __fs32 i_ctime; /* time of creation */ -}; - -/* SystemV/Coherent directory entry on disk */ -#define SYSV_NAMELEN 14 /* max size of name in struct sysv_dir_entry */ -struct sysv_dir_entry { - sysv_ino_t inode; - char name[SYSV_NAMELEN]; /* up to 14 characters, the rest are zeroes */ -}; - -#define SYSV_DIRSIZE sizeof(struct sysv_dir_entry) /* size of every directory entry */ - -#endif /* _LINUX_SYSV_FS_H */ diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index cf2446c9c30d..dd925d84fa46 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -217,54 +217,6 @@ static inline int arch_within_stack_frames(const void * const stack, } #endif -#ifdef CONFIG_HARDENED_USERCOPY -extern void __check_object_size(const void *ptr, unsigned long n, - bool to_user); - -static __always_inline void check_object_size(const void *ptr, unsigned long n, - bool to_user) -{ - if (!__builtin_constant_p(n)) - __check_object_size(ptr, n, to_user); -} -#else -static inline void check_object_size(const void *ptr, unsigned long n, - bool to_user) -{ } -#endif /* CONFIG_HARDENED_USERCOPY */ - -extern void __compiletime_error("copy source size is too small") -__bad_copy_from(void); -extern void __compiletime_error("copy destination size is too small") -__bad_copy_to(void); - -void __copy_overflow(int size, unsigned long count); - -static inline void copy_overflow(int size, unsigned long count) -{ - if (IS_ENABLED(CONFIG_BUG)) - __copy_overflow(size, count); -} - -static __always_inline __must_check bool -check_copy_size(const void *addr, size_t bytes, bool is_source) -{ - int sz = __builtin_object_size(addr, 0); - if (unlikely(sz >= 0 && sz < bytes)) { - if (!__builtin_constant_p(bytes)) - copy_overflow(sz, bytes); - else if (is_source) - __bad_copy_from(); - else - __bad_copy_to(); - return false; - } - if (WARN_ON_ONCE(bytes > INT_MAX)) - return false; - check_object_size(addr, bytes, is_source); - return true; -} - #ifndef arch_setup_new_exec static inline void arch_setup_new_exec(void) { } #endif diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 876e31b4461d..0b8b32bf0655 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -165,6 +165,4 @@ static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim) } #endif -struct vdso_data *arch_get_vdso_data(void *vvar_page); - #endif /* _LINUX_TIMENS_H */ diff --git a/include/linux/topology.h b/include/linux/topology.h index 52f5850730b3..24e715f0f6d2 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -240,6 +240,29 @@ static inline const struct cpumask *cpu_smt_mask(int cpu) } #endif +#ifndef topology_is_primary_thread + +static inline bool topology_is_primary_thread(unsigned int cpu) +{ + /* + * When disabling SMT, the primary thread of the SMT will remain + * enabled/active. Architectures that have a special primary thread + * (e.g. x86) need to override this function. Otherwise the first + * thread in the SMT can be made the primary thread. + * + * The sibling cpumask of an offline CPU always contains the CPU + * itself on architectures using the implementation of + * CONFIG_GENERIC_ARCH_TOPOLOGY for building their topology. + * Other architectures not using CONFIG_GENERIC_ARCH_TOPOLOGY for + * building their topology have to check whether to use this default + * implementation or to override it. + */ + return cpu == cpumask_first(topology_sibling_cpumask(cpu)); +} +#define topology_is_primary_thread topology_is_primary_thread + +#endif + static inline const struct cpumask *cpu_cpu_mask(int cpu) { return cpumask_of_node(cpu_to_node(cpu)); @@ -262,6 +285,36 @@ sched_numa_hop_mask(unsigned int node, unsigned int hops) #endif /* CONFIG_NUMA */ /** + * for_each_node_numadist() - iterate over nodes in increasing distance + * order, starting from a given node + * @node: the iteration variable and the starting node. + * @unvisited: a nodemask to keep track of the unvisited nodes. + * + * This macro iterates over NUMA node IDs in increasing distance from the + * starting @node and yields MAX_NUMNODES when all the nodes have been + * visited. + * + * Note that by the time the loop completes, the @unvisited nodemask will + * be fully cleared, unless the loop exits early. + * + * The difference between for_each_node() and for_each_node_numadist() is + * that the former allows to iterate over nodes in numerical order, whereas + * the latter iterates over nodes in increasing order of distance. + * + * This complexity of this iterator is O(N^2), where N represents the + * number of nodes, as each iteration involves scanning all nodes to + * find the one with the shortest distance. + * + * Requires rcu_lock to be held. + */ +#define for_each_node_numadist(node, unvisited) \ + for (int __start = (node), \ + (node) = nearest_node_nodemask((__start), &(unvisited)); \ + (node) < MAX_NUMNODES; \ + node_clear((node), (unvisited)), \ + (node) = nearest_node_nodemask((__start), &(unvisited))) + +/** * for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance * from a given node. * @mask: the iteration variable. diff --git a/include/linux/torture.h b/include/linux/torture.h index 0134e7221cae..1b59056c3b18 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -104,6 +104,7 @@ int torture_stutter_init(int s, int sgap); /* Initialization and cleanup. */ bool torture_init_begin(char *ttype, int v); void torture_init_end(void); +unsigned long get_torture_init_jiffies(void); bool torture_cleanup_begin(void); void torture_cleanup_end(void); bool torture_must_stop(void); diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index e9c702c1908d..7c06f4795670 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -7,7 +7,7 @@ #include <linux/minmax.h> #include <linux/nospec.h> #include <linux/sched.h> -#include <linux/thread_info.h> +#include <linux/ucopysize.h> #include <asm/uaccess.h> diff --git a/include/linux/ucopysize.h b/include/linux/ucopysize.h new file mode 100644 index 000000000000..41c2d9720466 --- /dev/null +++ b/include/linux/ucopysize.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Perform sanity checking for object sizes for uaccess.h and uio.h. */ +#ifndef __LINUX_UCOPYSIZE_H__ +#define __LINUX_UCOPYSIZE_H__ + +#include <linux/bug.h> + +#ifdef CONFIG_HARDENED_USERCOPY +#include <linux/jump_label.h> +extern void __check_object_size(const void *ptr, unsigned long n, + bool to_user); + +DECLARE_STATIC_KEY_MAYBE(CONFIG_HARDENED_USERCOPY_DEFAULT_ON, + validate_usercopy_range); + +static __always_inline void check_object_size(const void *ptr, unsigned long n, + bool to_user) +{ + if (!__builtin_constant_p(n) && + static_branch_maybe(CONFIG_HARDENED_USERCOPY_DEFAULT_ON, + &validate_usercopy_range)) { + __check_object_size(ptr, n, to_user); + } +} +#else +static inline void check_object_size(const void *ptr, unsigned long n, + bool to_user) +{ } +#endif /* CONFIG_HARDENED_USERCOPY */ + +extern void __compiletime_error("copy source size is too small") +__bad_copy_from(void); +extern void __compiletime_error("copy destination size is too small") +__bad_copy_to(void); + +void __copy_overflow(int size, unsigned long count); + +static inline void copy_overflow(int size, unsigned long count) +{ + if (IS_ENABLED(CONFIG_BUG)) + __copy_overflow(size, count); +} + +static __always_inline __must_check bool +check_copy_size(const void *addr, size_t bytes, bool is_source) +{ + int sz = __builtin_object_size(addr, 0); + if (unlikely(sz >= 0 && sz < bytes)) { + if (!__builtin_constant_p(bytes)) + copy_overflow(sz, bytes); + else if (is_source) + __bad_copy_from(); + else + __bad_copy_to(); + return false; + } + if (WARN_ON_ONCE(bytes > INT_MAX)) + return false; + check_object_size(addr, bytes, is_source); + return true; +} + +#endif /* __LINUX_UCOPYSIZE_H__ */ diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h index f85ec5613721..2dc767e08f54 100644 --- a/include/linux/uidgid.h +++ b/include/linux/uidgid.h @@ -132,6 +132,7 @@ static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) u32 map_id_down(struct uid_gid_map *map, u32 id); u32 map_id_up(struct uid_gid_map *map, u32 id); +u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count); #else @@ -186,6 +187,11 @@ static inline u32 map_id_down(struct uid_gid_map *map, u32 id) return id; } +static inline u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count) +{ + return id; +} + static inline u32 map_id_up(struct uid_gid_map *map, u32 id) { return id; diff --git a/include/linux/uio.h b/include/linux/uio.h index 8ada84e85447..49ece9e1888f 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -6,8 +6,8 @@ #define __LINUX_UIO_H #include <linux/kernel.h> -#include <linux/thread_info.h> #include <linux/mm_types.h> +#include <linux/ucopysize.h> #include <uapi/linux/uio.h> struct page; diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index b1df7d792fa1..2e46b69ff0a6 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -39,6 +39,8 @@ struct page; #define MAX_URETPROBE_DEPTH 64 +#define UPROBE_NO_TRAMPOLINE_VADDR (~0UL) + struct uprobe_consumer { /* * handler() can return UPROBE_HANDLER_REMOVE to signal the need to @@ -143,6 +145,7 @@ struct uprobe_task { struct uprobe *active_uprobe; unsigned long xol_vaddr; + bool signal_denied; struct arch_uprobe *auprobe; }; diff --git a/include/linux/vdso_datastore.h b/include/linux/vdso_datastore.h new file mode 100644 index 000000000000..a91fa24b06e0 --- /dev/null +++ b/include/linux/vdso_datastore.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_VDSO_DATASTORE_H +#define _LINUX_VDSO_DATASTORE_H + +#include <linux/mm_types.h> + +extern const struct vm_special_mapping vdso_vvar_mapping; +struct vm_area_struct *vdso_install_vvar_mapping(struct mm_struct *mm, unsigned long addr); + +#endif /* _LINUX_VDSO_DATASTORE_H */ diff --git a/include/linux/vfsdebug.h b/include/linux/vfsdebug.h new file mode 100644 index 000000000000..9cf22d3eb9dd --- /dev/null +++ b/include/linux/vfsdebug.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_VFS_DEBUG_H +#define LINUX_VFS_DEBUG_H 1 + +#include <linux/bug.h> + +struct inode; + +#ifdef CONFIG_DEBUG_VFS +void dump_inode(struct inode *inode, const char *reason); + +#define VFS_BUG_ON(cond) BUG_ON(cond) +#define VFS_WARN_ON(cond) (void)WARN_ON(cond) +#define VFS_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond) +#define VFS_WARN_ONCE(cond, format...) (void)WARN_ONCE(cond, format) +#define VFS_WARN(cond, format...) (void)WARN(cond, format) + +#define VFS_BUG_ON_INODE(cond, inode) ({ \ + if (unlikely(!!(cond))) { \ + dump_inode(inode, "VFS_BUG_ON_INODE(" #cond")");\ + BUG_ON(1); \ + } \ +}) + +#define VFS_WARN_ON_INODE(cond, inode) ({ \ + int __ret_warn = !!(cond); \ + \ + if (unlikely(__ret_warn)) { \ + dump_inode(inode, "VFS_WARN_ON_INODE(" #cond")");\ + WARN_ON(1); \ + } \ + unlikely(__ret_warn); \ +}) +#else +#define VFS_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond) +#define VFS_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) +#define VFS_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) +#define VFS_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) +#define VFS_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond) + +#define VFS_BUG_ON_INODE(cond, inode) VFS_BUG_ON(cond) +#define VFS_WARN_ON_INODE(cond, inode) BUILD_BUG_ON_INVALID(cond) +#endif /* CONFIG_DEBUG_VFS */ + +#endif diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index f70d0958095c..5a37cb2b6f93 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -151,6 +151,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_X86 DIRECT_MAP_LEVEL2_SPLIT, DIRECT_MAP_LEVEL3_SPLIT, + DIRECT_MAP_LEVEL2_COLLAPSE, + DIRECT_MAP_LEVEL3_COLLAPSE, #endif #ifdef CONFIG_PER_VMA_LOCK_STATS VMA_LOCK_SUCCESS, diff --git a/include/linux/vmcore_info.h b/include/linux/vmcore_info.h index e1dec1a6a749..37e003ae5262 100644 --- a/include/linux/vmcore_info.h +++ b/include/linux/vmcore_info.h @@ -6,9 +6,8 @@ #include <linux/elfcore.h> #include <linux/elf.h> -#define CRASH_CORE_NOTE_NAME "CORE" #define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) -#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4) +#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(NN_PRSTATUS), 4) #define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4) /* diff --git a/include/linux/wait.h b/include/linux/wait.h index 6d90ad974408..3503fe822e38 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -316,6 +316,9 @@ extern void init_wait_entry(struct wait_queue_entry *wq_entry, int flags); } \ \ cmd; \ + \ + if (condition) \ + break; \ } \ finish_wait(&wq_head, &__wq_entry); \ __out: __ret; \ diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 0754c463224a..cf793d18e5df 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -69,6 +69,8 @@ struct rxrpc_peer *rxrpc_kernel_get_peer(struct rxrpc_peer *peer); struct rxrpc_peer *rxrpc_kernel_get_call_peer(struct socket *sock, struct rxrpc_call *call); const struct sockaddr_rxrpc *rxrpc_kernel_remote_srx(const struct rxrpc_peer *peer); const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer); +unsigned long rxrpc_kernel_set_peer_data(struct rxrpc_peer *peer, unsigned long app_data); +unsigned long rxrpc_kernel_get_peer_data(const struct rxrpc_peer *peer); unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *); int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, rxrpc_user_attach_call_t, unsigned long, gfp_t, diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index 958a2460330c..8857f5ea77d4 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -127,27 +127,29 @@ enum yfs_cm_operation { E_(afs_call_trace_work, "QUEUE") #define afs_server_traces \ - EM(afs_server_trace_alloc, "ALLOC ") \ EM(afs_server_trace_callback, "CALLBACK ") \ EM(afs_server_trace_destroy, "DESTROY ") \ EM(afs_server_trace_free, "FREE ") \ EM(afs_server_trace_gc, "GC ") \ - EM(afs_server_trace_get_by_addr, "GET addr ") \ - EM(afs_server_trace_get_by_uuid, "GET uuid ") \ - EM(afs_server_trace_get_caps, "GET caps ") \ - EM(afs_server_trace_get_install, "GET inst ") \ - EM(afs_server_trace_get_new_cbi, "GET cbi ") \ EM(afs_server_trace_get_probe, "GET probe") \ - EM(afs_server_trace_give_up_cb, "giveup-cb") \ EM(afs_server_trace_purging, "PURGE ") \ - EM(afs_server_trace_put_call, "PUT call ") \ EM(afs_server_trace_put_cbi, "PUT cbi ") \ - EM(afs_server_trace_put_find_rsq, "PUT f-rsq") \ EM(afs_server_trace_put_probe, "PUT probe") \ - EM(afs_server_trace_put_slist, "PUT slist") \ - EM(afs_server_trace_put_slist_isort, "PUT isort") \ - EM(afs_server_trace_put_uuid_rsq, "PUT u-req") \ - E_(afs_server_trace_update, "UPDATE") + EM(afs_server_trace_see_destroyer, "SEE destr") \ + EM(afs_server_trace_see_expired, "SEE expd ") \ + EM(afs_server_trace_see_purge, "SEE purge") \ + EM(afs_server_trace_see_timer, "SEE timer") \ + EM(afs_server_trace_unuse_call, "UNU call ") \ + EM(afs_server_trace_unuse_create_fail, "UNU cfail") \ + EM(afs_server_trace_unuse_slist, "UNU slist") \ + EM(afs_server_trace_unuse_slist_isort, "UNU isort") \ + EM(afs_server_trace_update, "UPDATE ") \ + EM(afs_server_trace_use_by_uuid, "USE uuid ") \ + EM(afs_server_trace_use_cm_call, "USE cm-cl") \ + EM(afs_server_trace_use_get_caps, "USE gcaps") \ + EM(afs_server_trace_use_give_up_cb, "USE gvupc") \ + EM(afs_server_trace_use_install, "USE inst ") \ + E_(afs_server_trace_wait_create, "WAIT crt ") #define afs_volume_traces \ EM(afs_volume_trace_alloc, "ALLOC ") \ @@ -169,41 +171,48 @@ enum yfs_cm_operation { #define afs_cell_traces \ EM(afs_cell_trace_alloc, "ALLOC ") \ + EM(afs_cell_trace_destroy, "DESTROY ") \ EM(afs_cell_trace_free, "FREE ") \ EM(afs_cell_trace_get_atcell, "GET atcell") \ - EM(afs_cell_trace_get_queue_dns, "GET q-dns ") \ - EM(afs_cell_trace_get_queue_manage, "GET q-mng ") \ - EM(afs_cell_trace_get_queue_new, "GET q-new ") \ EM(afs_cell_trace_get_server, "GET server") \ EM(afs_cell_trace_get_vol, "GET vol ") \ - EM(afs_cell_trace_insert, "INSERT ") \ - EM(afs_cell_trace_manage, "MANAGE ") \ + EM(afs_cell_trace_purge, "PURGE ") \ EM(afs_cell_trace_put_atcell, "PUT atcell") \ EM(afs_cell_trace_put_candidate, "PUT candid") \ - EM(afs_cell_trace_put_destroy, "PUT destry") \ - EM(afs_cell_trace_put_queue_work, "PUT q-work") \ - EM(afs_cell_trace_put_queue_fail, "PUT q-fail") \ + EM(afs_cell_trace_put_final, "PUT final ") \ EM(afs_cell_trace_put_server, "PUT server") \ EM(afs_cell_trace_put_vol, "PUT vol ") \ + EM(afs_cell_trace_queue_again, "QUE again ") \ + EM(afs_cell_trace_queue_dns, "QUE dns ") \ + EM(afs_cell_trace_queue_new, "QUE new ") \ + EM(afs_cell_trace_queue_purge, "QUE purge ") \ + EM(afs_cell_trace_manage, "MANAGE ") \ + EM(afs_cell_trace_managed, "MANAGED ") \ EM(afs_cell_trace_see_source, "SEE source") \ - EM(afs_cell_trace_see_ws, "SEE ws ") \ + EM(afs_cell_trace_see_mgmt_timer, "SEE mtimer") \ EM(afs_cell_trace_unuse_alias, "UNU alias ") \ EM(afs_cell_trace_unuse_check_alias, "UNU chk-al") \ EM(afs_cell_trace_unuse_delete, "UNU delete") \ + EM(afs_cell_trace_unuse_dynroot_mntpt, "UNU dyn-mp") \ EM(afs_cell_trace_unuse_fc, "UNU fc ") \ - EM(afs_cell_trace_unuse_lookup, "UNU lookup") \ + EM(afs_cell_trace_unuse_lookup_dynroot, "UNU lu-dyn") \ + EM(afs_cell_trace_unuse_lookup_error, "UNU lu-err") \ EM(afs_cell_trace_unuse_mntpt, "UNU mntpt ") \ EM(afs_cell_trace_unuse_no_pin, "UNU no-pin") \ EM(afs_cell_trace_unuse_parse, "UNU parse ") \ EM(afs_cell_trace_unuse_pin, "UNU pin ") \ - EM(afs_cell_trace_unuse_probe, "UNU probe ") \ EM(afs_cell_trace_unuse_sbi, "UNU sbi ") \ EM(afs_cell_trace_unuse_ws, "UNU ws ") \ EM(afs_cell_trace_use_alias, "USE alias ") \ EM(afs_cell_trace_use_check_alias, "USE chk-al") \ EM(afs_cell_trace_use_fc, "USE fc ") \ EM(afs_cell_trace_use_fc_alias, "USE fc-al ") \ - EM(afs_cell_trace_use_lookup, "USE lookup") \ + EM(afs_cell_trace_use_lookup_add, "USE lu-add") \ + EM(afs_cell_trace_use_lookup_canonical, "USE lu-can") \ + EM(afs_cell_trace_use_lookup_dynroot, "USE lu-dyn") \ + EM(afs_cell_trace_use_lookup_mntpt, "USE lu-mpt") \ + EM(afs_cell_trace_use_lookup_mount, "USE lu-mnt") \ + EM(afs_cell_trace_use_lookup_ws, "USE lu-ws ") \ EM(afs_cell_trace_use_mntpt, "USE mntpt ") \ EM(afs_cell_trace_use_pin, "USE pin ") \ EM(afs_cell_trace_use_probe, "USE probe ") \ @@ -220,7 +229,7 @@ enum yfs_cm_operation { EM(afs_alist_trace_put_getaddru, "PUT GtAdrU") \ EM(afs_alist_trace_put_parse_empty, "PUT p-empt") \ EM(afs_alist_trace_put_parse_error, "PUT p-err ") \ - EM(afs_alist_trace_put_server_dup, "PUT sv-dup") \ + EM(afs_alist_trace_put_server_create, "PUT sv-crt") \ EM(afs_alist_trace_put_server_oom, "PUT sv-oom") \ EM(afs_alist_trace_put_server_update, "PUT sv-upd") \ EM(afs_alist_trace_put_vlgetcaps, "PUT vgtcap") \ @@ -1270,7 +1279,7 @@ TRACE_EVENT(afs_bulkstat_error, ); TRACE_EVENT(afs_cm_no_server, - TP_PROTO(struct afs_call *call, struct sockaddr_rxrpc *srx), + TP_PROTO(struct afs_call *call, const struct sockaddr_rxrpc *srx), TP_ARGS(call, srx), @@ -1529,7 +1538,7 @@ TRACE_EVENT(afs_server, __entry->reason = reason; ), - TP_printk("s=%08x %s u=%d a=%d", + TP_printk("s=%08x %s r=%d a=%d", __entry->server, __print_symbolic(__entry->reason, afs_server_traces), __entry->ref, @@ -1537,25 +1546,29 @@ TRACE_EVENT(afs_server, ); TRACE_EVENT(afs_volume, - TP_PROTO(afs_volid_t vid, int ref, enum afs_volume_trace reason), + TP_PROTO(unsigned int debug_id, afs_volid_t vid, int ref, + enum afs_volume_trace reason), - TP_ARGS(vid, ref, reason), + TP_ARGS(debug_id, vid, ref, reason), TP_STRUCT__entry( + __field(unsigned int, debug_id) __field(afs_volid_t, vid) __field(int, ref) __field(enum afs_volume_trace, reason) ), TP_fast_assign( - __entry->vid = vid; - __entry->ref = ref; - __entry->reason = reason; + __entry->debug_id = debug_id; + __entry->vid = vid; + __entry->ref = ref; + __entry->reason = reason; ), - TP_printk("V=%llx %s ur=%d", - __entry->vid, + TP_printk("V=%08x %s vid=%llx r=%d", + __entry->debug_id, __print_symbolic(__entry->reason, afs_volume_traces), + __entry->vid, __entry->ref) ); diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index e81431deaa50..5fbdabe3faea 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -207,7 +207,7 @@ TRACE_EVENT_RCU(rcu_exp_grace_period, __entry->gpevent = gpevent; ), - TP_printk("%s %ld %s", + TP_printk("%s %#lx %s", __entry->rcuname, __entry->gpseq, __entry->gpevent) ); @@ -561,40 +561,6 @@ TRACE_EVENT_RCU(rcu_segcb_stats, ); /* - * Tracepoint for the registration of a single RCU callback of the special - * kvfree() form. The first argument is the RCU type, the second argument - * is a pointer to the RCU callback, the third argument is the offset - * of the callback within the enclosing RCU-protected data structure, - * the fourth argument is the number of lazy callbacks queued, and the - * fifth argument is the total number of callbacks queued. - */ -TRACE_EVENT_RCU(rcu_kvfree_callback, - - TP_PROTO(const char *rcuname, struct rcu_head *rhp, unsigned long offset, - long qlen), - - TP_ARGS(rcuname, rhp, offset, qlen), - - TP_STRUCT__entry( - __field(const char *, rcuname) - __field(void *, rhp) - __field(unsigned long, offset) - __field(long, qlen) - ), - - TP_fast_assign( - __entry->rcuname = rcuname; - __entry->rhp = rhp; - __entry->offset = offset; - __entry->qlen = qlen; - ), - - TP_printk("%s rhp=%p func=%ld %ld", - __entry->rcuname, __entry->rhp, __entry->offset, - __entry->qlen) -); - -/* * Tracepoint for marking the beginning rcu_do_batch, performed to start * RCU callback invocation. The first argument is the RCU flavor, * the second is the number of lazy callbacks queued, the third is diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 9ea4c404bd4e..bfd97cce40a1 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -193,9 +193,7 @@ static inline long __trace_sched_switch_state(bool preempt, { unsigned int state; -#ifdef CONFIG_SCHED_DEBUG BUG_ON(p != current); -#endif /* CONFIG_SCHED_DEBUG */ /* * Preemption ignores task state, therefore preempted tasks are always diff --git a/include/trace/events/sched_ext.h b/include/trace/events/sched_ext.h index fe19da7315a9..50e4b712735a 100644 --- a/include/trace/events/sched_ext.h +++ b/include/trace/events/sched_ext.h @@ -26,6 +26,25 @@ TRACE_EVENT(sched_ext_dump, ) ); +TRACE_EVENT(sched_ext_event, + TP_PROTO(const char *name, __s64 delta), + TP_ARGS(name, delta), + + TP_STRUCT__entry( + __string(name, name) + __field( __s64, delta ) + ), + + TP_fast_assign( + __assign_str(name); + __entry->delta = delta; + ), + + TP_printk("name %s delta %lld", + __get_str(name), __entry->delta + ) +); + #endif /* _TRACE_SCHED_EXT_H */ /* This part must be outside protection */ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 1ea2c4c33b86..ef1c27fa3c57 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -85,6 +85,7 @@ /* compatibility flags */ #define MAP_FILE 0 +#define PKEY_UNRESTRICTED 0x0 #define PKEY_DISABLE_ACCESS 0x1 #define PKEY_DISABLE_WRITE 0x2 #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 88dc393c2bca..2892a45023af 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -849,9 +849,11 @@ __SYSCALL(__NR_getxattrat, sys_getxattrat) __SYSCALL(__NR_listxattrat, sys_listxattrat) #define __NR_removexattrat 466 __SYSCALL(__NR_removexattrat, sys_removexattrat) +#define __NR_open_tree_attr 467 +__SYSCALL(__NR_open_tree_attr, sys_open_tree_attr) #undef __NR_syscalls -#define __NR_syscalls 467 +#define __NR_syscalls 468 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/bits.h b/include/uapi/linux/bits.h index 5ee30f882736..682b406e1067 100644 --- a/include/uapi/linux/bits.h +++ b/include/uapi/linux/bits.h @@ -4,13 +4,9 @@ #ifndef _UAPI_LINUX_BITS_H #define _UAPI_LINUX_BITS_H -#define __GENMASK(h, l) \ - (((~_UL(0)) - (_UL(1) << (l)) + 1) & \ - (~_UL(0) >> (__BITS_PER_LONG - 1 - (h)))) +#define __GENMASK(h, l) (((~_UL(0)) << (l)) & (~_UL(0) >> (BITS_PER_LONG - 1 - (h)))) -#define __GENMASK_ULL(h, l) \ - (((~_ULL(0)) - (_ULL(1) << (l)) + 1) & \ - (~_ULL(0) >> (__BITS_PER_LONG_LONG - 1 - (h)))) +#define __GENMASK_ULL(h, l) (((~_ULL(0)) << (l)) & (~_ULL(0) >> (BITS_PER_LONG_LONG - 1 - (h)))) #define __GENMASK_U128(h, l) \ ((_BIT128((h)) << 1) - (_BIT128(l))) diff --git a/include/uapi/linux/const.h b/include/uapi/linux/const.h index e16be0d37746..b8f629ef135f 100644 --- a/include/uapi/linux/const.h +++ b/include/uapi/linux/const.h @@ -33,7 +33,7 @@ * Missing asm support * * __BIT128() would not work in the asm code, as it shifts an - * 'unsigned __init128' data type as direct representation of + * 'unsigned __int128' data type as direct representation of * 128 bit constants is not supported in the gcc compiler, as * they get silently truncated. * diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index b44069d29cec..819ded2d39de 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -11,6 +11,7 @@ typedef __u16 Elf32_Half; typedef __u32 Elf32_Off; typedef __s32 Elf32_Sword; typedef __u32 Elf32_Word; +typedef __u16 Elf32_Versym; /* 64-bit ELF base types. */ typedef __u64 Elf64_Addr; @@ -21,6 +22,7 @@ typedef __s32 Elf64_Sword; typedef __u32 Elf64_Word; typedef __u64 Elf64_Xword; typedef __s64 Elf64_Sxword; +typedef __u16 Elf64_Versym; /* These constants are for the segment types stored in the image headers */ #define PT_NULL 0 @@ -107,6 +109,7 @@ typedef __s64 Elf64_Sxword; #define DT_VALRNGLO 0x6ffffd00 #define DT_VALRNGHI 0x6ffffdff #define DT_ADDRRNGLO 0x6ffffe00 +#define DT_GNU_HASH 0x6ffffef5 #define DT_ADDRRNGHI 0x6ffffeff #define DT_VERSYM 0x6ffffff0 #define DT_RELACOUNT 0x6ffffff9 @@ -125,6 +128,8 @@ typedef __s64 Elf64_Sxword; #define STB_GLOBAL 1 #define STB_WEAK 2 +#define STN_UNDEF 0 + #define STT_NOTYPE 0 #define STT_OBJECT 1 #define STT_FUNC 2 @@ -133,6 +138,9 @@ typedef __s64 Elf64_Sxword; #define STT_COMMON 5 #define STT_TLS 6 +#define VER_FLG_BASE 0x1 +#define VER_FLG_WEAK 0x2 + #define ELF_ST_BIND(x) ((x) >> 4) #define ELF_ST_TYPE(x) ((x) & 0xf) #define ELF32_ST_BIND(x) ELF_ST_BIND(x) @@ -291,8 +299,18 @@ typedef struct elf64_phdr { #define SHF_WRITE 0x1 #define SHF_ALLOC 0x2 #define SHF_EXECINSTR 0x4 +#define SHF_MERGE 0x10 +#define SHF_STRINGS 0x20 +#define SHF_INFO_LINK 0x40 +#define SHF_LINK_ORDER 0x80 +#define SHF_OS_NONCONFORMING 0x100 +#define SHF_GROUP 0x200 +#define SHF_TLS 0x400 #define SHF_RELA_LIVEPATCH 0x00100000 #define SHF_RO_AFTER_INIT 0x00200000 +#define SHF_ORDERED 0x04000000 +#define SHF_EXCLUDE 0x08000000 +#define SHF_MASKOS 0x0ff00000 #define SHF_MASKPROC 0xf0000000 /* special section indexes */ @@ -368,101 +386,180 @@ typedef struct elf64_shdr { #define ELF_OSABI ELFOSABI_NONE #endif +/* Note definitions: NN_ defines names. NT_ defines types. */ + +#define NN_GNU_PROPERTY_TYPE_0 "GNU" +#define NT_GNU_PROPERTY_TYPE_0 5 + /* * Notes used in ET_CORE. Architectures export some of the arch register sets * using the corresponding note types via the PTRACE_GETREGSET and * PTRACE_SETREGSET requests. - * The note name for these types is "LINUX", except NT_PRFPREG that is named - * "CORE". */ +#define NN_PRSTATUS "CORE" #define NT_PRSTATUS 1 +#define NN_PRFPREG "CORE" #define NT_PRFPREG 2 +#define NN_PRPSINFO "CORE" #define NT_PRPSINFO 3 +#define NN_TASKSTRUCT "CORE" #define NT_TASKSTRUCT 4 +#define NN_AUXV "CORE" #define NT_AUXV 6 /* * Note to userspace developers: size of NT_SIGINFO note may increase * in the future to accomodate more fields, don't assume it is fixed! */ +#define NN_SIGINFO "CORE" #define NT_SIGINFO 0x53494749 +#define NN_FILE "CORE" #define NT_FILE 0x46494c45 +#define NN_PRXFPREG "LINUX" #define NT_PRXFPREG 0x46e62b7f /* copied from gdb5.1/include/elf/common.h */ +#define NN_PPC_VMX "LINUX" #define NT_PPC_VMX 0x100 /* PowerPC Altivec/VMX registers */ +#define NN_PPC_SPE "LINUX" #define NT_PPC_SPE 0x101 /* PowerPC SPE/EVR registers */ +#define NN_PPC_VSX "LINUX" #define NT_PPC_VSX 0x102 /* PowerPC VSX registers */ +#define NN_PPC_TAR "LINUX" #define NT_PPC_TAR 0x103 /* Target Address Register */ +#define NN_PPC_PPR "LINUX" #define NT_PPC_PPR 0x104 /* Program Priority Register */ +#define NN_PPC_DSCR "LINUX" #define NT_PPC_DSCR 0x105 /* Data Stream Control Register */ +#define NN_PPC_EBB "LINUX" #define NT_PPC_EBB 0x106 /* Event Based Branch Registers */ +#define NN_PPC_PMU "LINUX" #define NT_PPC_PMU 0x107 /* Performance Monitor Registers */ +#define NN_PPC_TM_CGPR "LINUX" #define NT_PPC_TM_CGPR 0x108 /* TM checkpointed GPR Registers */ +#define NN_PPC_TM_CFPR "LINUX" #define NT_PPC_TM_CFPR 0x109 /* TM checkpointed FPR Registers */ +#define NN_PPC_TM_CVMX "LINUX" #define NT_PPC_TM_CVMX 0x10a /* TM checkpointed VMX Registers */ +#define NN_PPC_TM_CVSX "LINUX" #define NT_PPC_TM_CVSX 0x10b /* TM checkpointed VSX Registers */ +#define NN_PPC_TM_SPR "LINUX" #define NT_PPC_TM_SPR 0x10c /* TM Special Purpose Registers */ +#define NN_PPC_TM_CTAR "LINUX" #define NT_PPC_TM_CTAR 0x10d /* TM checkpointed Target Address Register */ +#define NN_PPC_TM_CPPR "LINUX" #define NT_PPC_TM_CPPR 0x10e /* TM checkpointed Program Priority Register */ +#define NN_PPC_TM_CDSCR "LINUX" #define NT_PPC_TM_CDSCR 0x10f /* TM checkpointed Data Stream Control Register */ +#define NN_PPC_PKEY "LINUX" #define NT_PPC_PKEY 0x110 /* Memory Protection Keys registers */ +#define NN_PPC_DEXCR "LINUX" #define NT_PPC_DEXCR 0x111 /* PowerPC DEXCR registers */ +#define NN_PPC_HASHKEYR "LINUX" #define NT_PPC_HASHKEYR 0x112 /* PowerPC HASHKEYR register */ +#define NN_386_TLS "LINUX" #define NT_386_TLS 0x200 /* i386 TLS slots (struct user_desc) */ +#define NN_386_IOPERM "LINUX" #define NT_386_IOPERM 0x201 /* x86 io permission bitmap (1=deny) */ +#define NN_X86_XSTATE "LINUX" #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ /* Old binutils treats 0x203 as a CET state */ +#define NN_X86_SHSTK "LINUX" #define NT_X86_SHSTK 0x204 /* x86 SHSTK state */ +#define NN_X86_XSAVE_LAYOUT "LINUX" #define NT_X86_XSAVE_LAYOUT 0x205 /* XSAVE layout description */ +#define NN_S390_HIGH_GPRS "LINUX" #define NT_S390_HIGH_GPRS 0x300 /* s390 upper register halves */ +#define NN_S390_TIMER "LINUX" #define NT_S390_TIMER 0x301 /* s390 timer register */ +#define NN_S390_TODCMP "LINUX" #define NT_S390_TODCMP 0x302 /* s390 TOD clock comparator register */ +#define NN_S390_TODPREG "LINUX" #define NT_S390_TODPREG 0x303 /* s390 TOD programmable register */ +#define NN_S390_CTRS "LINUX" #define NT_S390_CTRS 0x304 /* s390 control registers */ +#define NN_S390_PREFIX "LINUX" #define NT_S390_PREFIX 0x305 /* s390 prefix register */ +#define NN_S390_LAST_BREAK "LINUX" #define NT_S390_LAST_BREAK 0x306 /* s390 breaking event address */ +#define NN_S390_SYSTEM_CALL "LINUX" #define NT_S390_SYSTEM_CALL 0x307 /* s390 system call restart data */ +#define NN_S390_TDB "LINUX" #define NT_S390_TDB 0x308 /* s390 transaction diagnostic block */ +#define NN_S390_VXRS_LOW "LINUX" #define NT_S390_VXRS_LOW 0x309 /* s390 vector registers 0-15 upper half */ +#define NN_S390_VXRS_HIGH "LINUX" #define NT_S390_VXRS_HIGH 0x30a /* s390 vector registers 16-31 */ +#define NN_S390_GS_CB "LINUX" #define NT_S390_GS_CB 0x30b /* s390 guarded storage registers */ +#define NN_S390_GS_BC "LINUX" #define NT_S390_GS_BC 0x30c /* s390 guarded storage broadcast control block */ +#define NN_S390_RI_CB "LINUX" #define NT_S390_RI_CB 0x30d /* s390 runtime instrumentation */ +#define NN_S390_PV_CPU_DATA "LINUX" #define NT_S390_PV_CPU_DATA 0x30e /* s390 protvirt cpu dump data */ +#define NN_ARM_VFP "LINUX" #define NT_ARM_VFP 0x400 /* ARM VFP/NEON registers */ +#define NN_ARM_TLS "LINUX" #define NT_ARM_TLS 0x401 /* ARM TLS register */ +#define NN_ARM_HW_BREAK "LINUX" #define NT_ARM_HW_BREAK 0x402 /* ARM hardware breakpoint registers */ +#define NN_ARM_HW_WATCH "LINUX" #define NT_ARM_HW_WATCH 0x403 /* ARM hardware watchpoint registers */ +#define NN_ARM_SYSTEM_CALL "LINUX" #define NT_ARM_SYSTEM_CALL 0x404 /* ARM system call number */ +#define NN_ARM_SVE "LINUX" #define NT_ARM_SVE 0x405 /* ARM Scalable Vector Extension registers */ +#define NN_ARM_PAC_MASK "LINUX" #define NT_ARM_PAC_MASK 0x406 /* ARM pointer authentication code masks */ +#define NN_ARM_PACA_KEYS "LINUX" #define NT_ARM_PACA_KEYS 0x407 /* ARM pointer authentication address keys */ +#define NN_ARM_PACG_KEYS "LINUX" #define NT_ARM_PACG_KEYS 0x408 /* ARM pointer authentication generic key */ +#define NN_ARM_TAGGED_ADDR_CTRL "LINUX" #define NT_ARM_TAGGED_ADDR_CTRL 0x409 /* arm64 tagged address control (prctl()) */ +#define NN_ARM_PAC_ENABLED_KEYS "LINUX" #define NT_ARM_PAC_ENABLED_KEYS 0x40a /* arm64 ptr auth enabled keys (prctl()) */ +#define NN_ARM_SSVE "LINUX" #define NT_ARM_SSVE 0x40b /* ARM Streaming SVE registers */ +#define NN_ARM_ZA "LINUX" #define NT_ARM_ZA 0x40c /* ARM SME ZA registers */ +#define NN_ARM_ZT "LINUX" #define NT_ARM_ZT 0x40d /* ARM SME ZT registers */ +#define NN_ARM_FPMR "LINUX" #define NT_ARM_FPMR 0x40e /* ARM floating point mode register */ +#define NN_ARM_POE "LINUX" #define NT_ARM_POE 0x40f /* ARM POE registers */ +#define NN_ARM_GCS "LINUX" #define NT_ARM_GCS 0x410 /* ARM GCS state */ +#define NN_ARC_V2 "LINUX" #define NT_ARC_V2 0x600 /* ARCv2 accumulator/extra registers */ +#define NN_VMCOREDD "LINUX" #define NT_VMCOREDD 0x700 /* Vmcore Device Dump Note */ +#define NN_MIPS_DSP "LINUX" #define NT_MIPS_DSP 0x800 /* MIPS DSP ASE registers */ +#define NN_MIPS_FP_MODE "LINUX" #define NT_MIPS_FP_MODE 0x801 /* MIPS floating-point mode */ +#define NN_MIPS_MSA "LINUX" #define NT_MIPS_MSA 0x802 /* MIPS SIMD registers */ +#define NN_RISCV_CSR "LINUX" #define NT_RISCV_CSR 0x900 /* RISC-V Control and Status Registers */ +#define NN_RISCV_VECTOR "LINUX" #define NT_RISCV_VECTOR 0x901 /* RISC-V vector registers */ +#define NN_RISCV_TAGGED_ADDR_CTRL "LINUX" #define NT_RISCV_TAGGED_ADDR_CTRL 0x902 /* RISC-V tagged address control (prctl()) */ +#define NN_LOONGARCH_CPUCFG "LINUX" #define NT_LOONGARCH_CPUCFG 0xa00 /* LoongArch CPU config registers */ +#define NN_LOONGARCH_CSR "LINUX" #define NT_LOONGARCH_CSR 0xa01 /* LoongArch control and status registers */ +#define NN_LOONGARCH_LSX "LINUX" #define NT_LOONGARCH_LSX 0xa02 /* LoongArch Loongson SIMD Extension registers */ +#define NN_LOONGARCH_LASX "LINUX" #define NT_LOONGARCH_LASX 0xa03 /* LoongArch Loongson Advanced SIMD Extension registers */ +#define NN_LOONGARCH_LBT "LINUX" #define NT_LOONGARCH_LBT 0xa04 /* LoongArch Loongson Binary Translation registers */ +#define NN_LOONGARCH_HW_BREAK "LINUX" #define NT_LOONGARCH_HW_BREAK 0xa05 /* LoongArch hardware breakpoint registers */ +#define NN_LOONGARCH_HW_WATCH "LINUX" #define NT_LOONGARCH_HW_WATCH 0xa06 /* LoongArch hardware watchpoint registers */ -/* Note types with note name "GNU" */ -#define NT_GNU_PROPERTY_TYPE_0 5 - /* Note header in a PT_NOTE section */ typedef struct elf32_note { Elf32_Word n_namesz; /* Name size */ @@ -483,4 +580,34 @@ typedef struct elf64_note { /* Bits for GNU_PROPERTY_AARCH64_FEATURE_1_BTI */ #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0) +typedef struct { + Elf32_Half vd_version; + Elf32_Half vd_flags; + Elf32_Half vd_ndx; + Elf32_Half vd_cnt; + Elf32_Word vd_hash; + Elf32_Word vd_aux; + Elf32_Word vd_next; +} Elf32_Verdef; + +typedef struct { + Elf64_Half vd_version; + Elf64_Half vd_flags; + Elf64_Half vd_ndx; + Elf64_Half vd_cnt; + Elf64_Word vd_hash; + Elf64_Word vd_aux; + Elf64_Word vd_next; +} Elf64_Verdef; + +typedef struct { + Elf32_Word vda_name; + Elf32_Word vda_next; +} Elf32_Verdaux; + +typedef struct { + Elf64_Word vda_name; + Elf64_Word vda_next; +} Elf64_Verdaux; + #endif /* _UAPI_LINUX_ELF_H */ diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index bd8167979707..e710967c7c26 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -28,6 +28,8 @@ /* #define FAN_DIR_MODIFY 0x00080000 */ /* Deprecated (reserved) */ #define FAN_PRE_ACCESS 0x00100000 /* Pre-content access hook */ +#define FAN_MNT_ATTACH 0x01000000 /* Mount was attached */ +#define FAN_MNT_DETACH 0x02000000 /* Mount was detached */ #define FAN_EVENT_ON_CHILD 0x08000000 /* Interested in child events */ @@ -64,6 +66,7 @@ #define FAN_REPORT_NAME 0x00000800 /* Report events with name */ #define FAN_REPORT_TARGET_FID 0x00001000 /* Report dirent target id */ #define FAN_REPORT_FD_ERROR 0x00002000 /* event->fd can report error */ +#define FAN_REPORT_MNT 0x00004000 /* Report mount events */ /* Convenience macro - FAN_REPORT_NAME requires FAN_REPORT_DIR_FID */ #define FAN_REPORT_DFID_NAME (FAN_REPORT_DIR_FID | FAN_REPORT_NAME) @@ -94,6 +97,7 @@ #define FAN_MARK_INODE 0x00000000 #define FAN_MARK_MOUNT 0x00000010 #define FAN_MARK_FILESYSTEM 0x00000100 +#define FAN_MARK_MNTNS 0x00000110 /* * Convenience macro - FAN_MARK_IGNORE requires FAN_MARK_IGNORED_SURV_MODIFY @@ -147,6 +151,7 @@ struct fanotify_event_metadata { #define FAN_EVENT_INFO_TYPE_PIDFD 4 #define FAN_EVENT_INFO_TYPE_ERROR 5 #define FAN_EVENT_INFO_TYPE_RANGE 6 +#define FAN_EVENT_INFO_TYPE_MNT 7 /* Special info types for FAN_RENAME */ #define FAN_EVENT_INFO_TYPE_OLD_DFID_NAME 10 @@ -200,6 +205,11 @@ struct fanotify_event_info_range { __u64 count; }; +struct fanotify_event_info_mnt { + struct fanotify_event_info_header hdr; + __u64 mnt_id; +}; + /* * User space may need to record additional information about its decision. * The extra information type records what kind of information is included. diff --git a/include/uapi/linux/firewire-cdev.h b/include/uapi/linux/firewire-cdev.h index 1f2c9469f921..05e3aa8fa8bc 100644 --- a/include/uapi/linux/firewire-cdev.h +++ b/include/uapi/linux/firewire-cdev.h @@ -449,7 +449,8 @@ struct fw_cdev_event_phy_packet { * which the packet arrived. For %FW_CDEV_EVENT_PHY_PACKET_SENT2 and non-ping packet, * the time stamp of isochronous cycle at which the packet was sent. For ping packet, * the tick count for round-trip time measured by 1394 OHCI controller. - * The time stamp of isochronous cycle at which either the response was sent for + * + * The time stamp of isochronous cycle at which either the response was sent for * %FW_CDEV_EVENT_PHY_PACKET_SENT2 or the request arrived for * %FW_CDEV_EVENT_PHY_PACKET_RECEIVED2. * @data: Incoming data diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 45e6d8fca9b9..b6ae8ad8934b 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -929,6 +929,7 @@ struct kvm_enable_cap { #define KVM_CAP_PRE_FAULT_MEMORY 236 #define KVM_CAP_X86_APIC_BUS_CYCLES_NS 237 #define KVM_CAP_X86_GUEST_MODE 238 +#define KVM_CAP_ARM_WRITABLE_IMP_ID_REGS 239 struct kvm_irq_routing_irqchip { __u32 irqchip; diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index c07008816aca..7fa67c2031a5 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -179,7 +179,12 @@ struct statmount { __u32 opt_array; /* [str] Array of nul terminated fs options */ __u32 opt_sec_num; /* Number of security options */ __u32 opt_sec_array; /* [str] Array of nul terminated security options */ - __u64 __spare2[46]; + __u64 supported_mask; /* Mask flags that this kernel supports */ + __u32 mnt_uidmap_num; /* Number of uid mappings */ + __u32 mnt_uidmap; /* [str] Array of uid mappings (as seen from callers namespace) */ + __u32 mnt_gidmap_num; /* Number of gid mappings */ + __u32 mnt_gidmap; /* [str] Array of gid mappings (as seen from callers namespace) */ + __u64 __spare2[43]; char str[]; /* Variable size part containing strings */ }; @@ -217,6 +222,9 @@ struct mnt_id_req { #define STATMOUNT_SB_SOURCE 0x00000200U /* Want/got sb_source */ #define STATMOUNT_OPT_ARRAY 0x00000400U /* Want/got opt_... */ #define STATMOUNT_OPT_SEC_ARRAY 0x00000800U /* Want/got opt_sec... */ +#define STATMOUNT_SUPPORTED_MASK 0x00001000U /* Want/got supported mask flags */ +#define STATMOUNT_MNT_UIDMAP 0x00002000U /* Want/got uidmap... */ +#define STATMOUNT_MNT_GIDMAP 0x00004000U /* Want/got gidmap... */ /* * Special @mnt_id values that can be passed to listmount diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h new file mode 100644 index 000000000000..876bfe4e4227 --- /dev/null +++ b/include/uapi/linux/mshv.h @@ -0,0 +1,291 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Userspace interfaces for /dev/mshv* devices and derived fds + * + * This file is divided into sections containing data structures and IOCTLs for + * a particular set of related devices or derived file descriptors. + * + * The IOCTL definitions are at the end of each section. They are grouped by + * device/fd, so that new IOCTLs can easily be added with a monotonically + * increasing number. + */ +#ifndef _UAPI_LINUX_MSHV_H +#define _UAPI_LINUX_MSHV_H + +#include <linux/types.h> + +#define MSHV_IOCTL 0xB8 + +/* + ******************************************* + * Entry point to main VMM APIs: /dev/mshv * + ******************************************* + */ + +enum { + MSHV_PT_BIT_LAPIC, + MSHV_PT_BIT_X2APIC, + MSHV_PT_BIT_GPA_SUPER_PAGES, + MSHV_PT_BIT_COUNT, +}; + +#define MSHV_PT_FLAGS_MASK ((1 << MSHV_PT_BIT_COUNT) - 1) + +enum { + MSHV_PT_ISOLATION_NONE, + MSHV_PT_ISOLATION_COUNT, +}; + +/** + * struct mshv_create_partition - arguments for MSHV_CREATE_PARTITION + * @pt_flags: Bitmask of 1 << MSHV_PT_BIT_* + * @pt_isolation: MSHV_PT_ISOLATION_* + * + * Returns a file descriptor to act as a handle to a guest partition. + * At this point the partition is not yet initialized in the hypervisor. + * Some operations must be done with the partition in this state, e.g. setting + * so-called "early" partition properties. The partition can then be + * initialized with MSHV_INITIALIZE_PARTITION. + */ +struct mshv_create_partition { + __u64 pt_flags; + __u64 pt_isolation; +}; + +/* /dev/mshv */ +#define MSHV_CREATE_PARTITION _IOW(MSHV_IOCTL, 0x00, struct mshv_create_partition) + +/* + ************************ + * Child partition APIs * + ************************ + */ + +struct mshv_create_vp { + __u32 vp_index; +}; + +enum { + MSHV_SET_MEM_BIT_WRITABLE, + MSHV_SET_MEM_BIT_EXECUTABLE, + MSHV_SET_MEM_BIT_UNMAP, + MSHV_SET_MEM_BIT_COUNT +}; + +#define MSHV_SET_MEM_FLAGS_MASK ((1 << MSHV_SET_MEM_BIT_COUNT) - 1) + +/* The hypervisor's "native" page size */ +#define MSHV_HV_PAGE_SIZE 0x1000 + +/** + * struct mshv_user_mem_region - arguments for MSHV_SET_GUEST_MEMORY + * @size: Size of the memory region (bytes). Must be aligned to + * MSHV_HV_PAGE_SIZE + * @guest_pfn: Base guest page number to map + * @userspace_addr: Base address of userspace memory. Must be aligned to + * MSHV_HV_PAGE_SIZE + * @flags: Bitmask of 1 << MSHV_SET_MEM_BIT_*. If (1 << MSHV_SET_MEM_BIT_UNMAP) + * is set, ignore other bits. + * @rsvd: MBZ + * + * Map or unmap a region of userspace memory to Guest Physical Addresses (GPA). + * Mappings can't overlap in GPA space or userspace. + * To unmap, these fields must match an existing mapping. + */ +struct mshv_user_mem_region { + __u64 size; + __u64 guest_pfn; + __u64 userspace_addr; + __u8 flags; + __u8 rsvd[7]; +}; + +enum { + MSHV_IRQFD_BIT_DEASSIGN, + MSHV_IRQFD_BIT_RESAMPLE, + MSHV_IRQFD_BIT_COUNT, +}; + +#define MSHV_IRQFD_FLAGS_MASK ((1 << MSHV_IRQFD_BIT_COUNT) - 1) + +struct mshv_user_irqfd { + __s32 fd; + __s32 resamplefd; + __u32 gsi; + __u32 flags; +}; + +enum { + MSHV_IOEVENTFD_BIT_DATAMATCH, + MSHV_IOEVENTFD_BIT_PIO, + MSHV_IOEVENTFD_BIT_DEASSIGN, + MSHV_IOEVENTFD_BIT_COUNT, +}; + +#define MSHV_IOEVENTFD_FLAGS_MASK ((1 << MSHV_IOEVENTFD_BIT_COUNT) - 1) + +struct mshv_user_ioeventfd { + __u64 datamatch; + __u64 addr; /* legal pio/mmio address */ + __u32 len; /* 1, 2, 4, or 8 bytes */ + __s32 fd; + __u32 flags; + __u8 rsvd[4]; +}; + +struct mshv_user_irq_entry { + __u32 gsi; + __u32 address_lo; + __u32 address_hi; + __u32 data; +}; + +struct mshv_user_irq_table { + __u32 nr; + __u32 rsvd; /* MBZ */ + struct mshv_user_irq_entry entries[]; +}; + +enum { + MSHV_GPAP_ACCESS_TYPE_ACCESSED, + MSHV_GPAP_ACCESS_TYPE_DIRTY, + MSHV_GPAP_ACCESS_TYPE_COUNT /* Count of enum members */ +}; + +enum { + MSHV_GPAP_ACCESS_OP_NOOP, + MSHV_GPAP_ACCESS_OP_CLEAR, + MSHV_GPAP_ACCESS_OP_SET, + MSHV_GPAP_ACCESS_OP_COUNT /* Count of enum members */ +}; + +/** + * struct mshv_gpap_access_bitmap - arguments for MSHV_GET_GPAP_ACCESS_BITMAP + * @access_type: MSHV_GPAP_ACCESS_TYPE_* - The type of access to record in the + * bitmap + * @access_op: MSHV_GPAP_ACCESS_OP_* - Allows an optional clear or set of all + * the access states in the range, after retrieving the current + * states. + * @rsvd: MBZ + * @page_count: Number of pages + * @gpap_base: Base gpa page number + * @bitmap_ptr: Output buffer for bitmap, at least (page_count + 7) / 8 bytes + * + * Retrieve a bitmap of either ACCESSED or DIRTY bits for a given range of guest + * memory, and optionally clear or set the bits. + */ +struct mshv_gpap_access_bitmap { + __u8 access_type; + __u8 access_op; + __u8 rsvd[6]; + __u64 page_count; + __u64 gpap_base; + __u64 bitmap_ptr; +}; + +/** + * struct mshv_root_hvcall - arguments for MSHV_ROOT_HVCALL + * @code: Hypercall code (HVCALL_*) + * @reps: in: Rep count ('repcount') + * out: Reps completed ('repcomp'). MBZ unless rep hvcall + * @in_sz: Size of input incl rep data. <= MSHV_HV_PAGE_SIZE + * @out_sz: Size of output buffer. <= MSHV_HV_PAGE_SIZE. MBZ if out_ptr is 0 + * @status: in: MBZ + * out: HV_STATUS_* from hypercall + * @rsvd: MBZ + * @in_ptr: Input data buffer (struct hv_input_*). If used with partition or + * vp fd, partition id field is populated by kernel. + * @out_ptr: Output data buffer (optional) + */ +struct mshv_root_hvcall { + __u16 code; + __u16 reps; + __u16 in_sz; + __u16 out_sz; + __u16 status; + __u8 rsvd[6]; + __u64 in_ptr; + __u64 out_ptr; +}; + +/* Partition fds created with MSHV_CREATE_PARTITION */ +#define MSHV_INITIALIZE_PARTITION _IO(MSHV_IOCTL, 0x00) +#define MSHV_CREATE_VP _IOW(MSHV_IOCTL, 0x01, struct mshv_create_vp) +#define MSHV_SET_GUEST_MEMORY _IOW(MSHV_IOCTL, 0x02, struct mshv_user_mem_region) +#define MSHV_IRQFD _IOW(MSHV_IOCTL, 0x03, struct mshv_user_irqfd) +#define MSHV_IOEVENTFD _IOW(MSHV_IOCTL, 0x04, struct mshv_user_ioeventfd) +#define MSHV_SET_MSI_ROUTING _IOW(MSHV_IOCTL, 0x05, struct mshv_user_irq_table) +#define MSHV_GET_GPAP_ACCESS_BITMAP _IOWR(MSHV_IOCTL, 0x06, struct mshv_gpap_access_bitmap) +/* Generic hypercall */ +#define MSHV_ROOT_HVCALL _IOWR(MSHV_IOCTL, 0x07, struct mshv_root_hvcall) + +/* + ******************************** + * VP APIs for child partitions * + ******************************** + */ + +#define MSHV_RUN_VP_BUF_SZ 256 + +/* + * VP state pages may be mapped to userspace via mmap(). + * To specify which state page, use MSHV_VP_MMAP_OFFSET_ values multiplied by + * the system page size. + * e.g. + * long page_size = sysconf(_SC_PAGE_SIZE); + * void *reg_page = mmap(NULL, MSHV_HV_PAGE_SIZE, PROT_READ|PROT_WRITE, + * MAP_SHARED, vp_fd, + * MSHV_VP_MMAP_OFFSET_REGISTERS * page_size); + */ +enum { + MSHV_VP_MMAP_OFFSET_REGISTERS, + MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE, + MSHV_VP_MMAP_OFFSET_GHCB, + MSHV_VP_MMAP_OFFSET_COUNT +}; + +/** + * struct mshv_run_vp - argument for MSHV_RUN_VP + * @msg_buf: On success, the intercept message is copied here. It can be + * interpreted using the relevant hypervisor definitions. + */ +struct mshv_run_vp { + __u8 msg_buf[MSHV_RUN_VP_BUF_SZ]; +}; + +enum { + MSHV_VP_STATE_LAPIC, /* Local interrupt controller state (either arch) */ + MSHV_VP_STATE_XSAVE, /* XSAVE data in compacted form (x86_64) */ + MSHV_VP_STATE_SIMP, + MSHV_VP_STATE_SIEFP, + MSHV_VP_STATE_SYNTHETIC_TIMERS, + MSHV_VP_STATE_COUNT, +}; + +/** + * struct mshv_get_set_vp_state - arguments for MSHV_[GET,SET]_VP_STATE + * @type: MSHV_VP_STATE_* + * @rsvd: MBZ + * @buf_sz: in: 4k page-aligned size of buffer + * out: Actual size of data (on EINVAL, check this to see if buffer + * was too small) + * @buf_ptr: 4k page-aligned data buffer + */ +struct mshv_get_set_vp_state { + __u8 type; + __u8 rsvd[3]; + __u32 buf_sz; + __u64 buf_ptr; +}; + +/* VP fds created with MSHV_CREATE_VP */ +#define MSHV_RUN_VP _IOR(MSHV_IOCTL, 0x00, struct mshv_run_vp) +#define MSHV_GET_VP_STATE _IOWR(MSHV_IOCTL, 0x01, struct mshv_get_set_vp_state) +#define MSHV_SET_VP_STATE _IOWR(MSHV_IOCTL, 0x02, struct mshv_get_set_vp_state) +/* + * Generic hypercall + * Defined above in partition IOCTLs, avoid redefining it here + * #define MSHV_ROOT_HVCALL _IOWR(MSHV_IOCTL, 0x07, struct mshv_root_hvcall) + */ + +#endif diff --git a/include/uapi/linux/nilfs2_ondisk.h b/include/uapi/linux/nilfs2_ondisk.h index c23f91ae5fe8..3196cc44a002 100644 --- a/include/uapi/linux/nilfs2_ondisk.h +++ b/include/uapi/linux/nilfs2_ondisk.h @@ -188,7 +188,8 @@ struct nilfs_super_block { __le16 s_segment_usage_size; /* Size of a segment usage */ /*98*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ -/*A8*/ char s_volume_name[80]; /* volume name */ +/*A8*/ char s_volume_name[80] /* volume name */ + __kernel_nonstring; /*F8*/ __le32 s_c_interval; /* Commit interval of segment */ __le32 s_c_block_max; /* diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 0524d541d4e3..5fc753c23734 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -385,6 +385,8 @@ enum perf_event_read_format { * * @sample_max_stack: Max number of frame pointers in a callchain, * should be < /proc/sys/kernel/perf_event_max_stack + * Max number of entries of branch stack + * should be < hardware limit */ struct perf_event_attr { diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h index 4540f6301b8c..2970ef44655a 100644 --- a/include/uapi/linux/pidfd.h +++ b/include/uapi/linux/pidfd.h @@ -10,6 +10,10 @@ /* Flags for pidfd_open(). */ #define PIDFD_NONBLOCK O_NONBLOCK #define PIDFD_THREAD O_EXCL +#ifdef __KERNEL__ +#include <linux/sched.h> +#define PIDFD_CLONE CLONE_PIDFD +#endif /* Flags for pidfd_send_signal(). */ #define PIDFD_SIGNAL_THREAD (1UL << 0) @@ -20,9 +24,34 @@ #define PIDFD_INFO_PID (1UL << 0) /* Always returned, even if not requested */ #define PIDFD_INFO_CREDS (1UL << 1) /* Always returned, even if not requested */ #define PIDFD_INFO_CGROUPID (1UL << 2) /* Always returned if available, even if not requested */ +#define PIDFD_INFO_EXIT (1UL << 3) /* Only returned if requested. */ #define PIDFD_INFO_SIZE_VER0 64 /* sizeof first published struct */ +/* + * The concept of process and threads in userland and the kernel is a confusing + * one - within the kernel every thread is a 'task' with its own individual PID, + * however from userland's point of view threads are grouped by a single PID, + * which is that of the 'thread group leader', typically the first thread + * spawned. + * + * To cut the Gideon knot, for internal kernel usage, we refer to + * PIDFD_SELF_THREAD to refer to the current thread (or task from a kernel + * perspective), and PIDFD_SELF_THREAD_GROUP to refer to the current thread + * group leader... + */ +#define PIDFD_SELF_THREAD -10000 /* Current thread. */ +#define PIDFD_SELF_THREAD_GROUP -20000 /* Current thread group leader. */ + +/* + * ...and for userland we make life simpler - PIDFD_SELF refers to the current + * thread, PIDFD_SELF_PROCESS refers to the process thread group leader. + * + * For nearly all practical uses, a user will want to use PIDFD_SELF. + */ +#define PIDFD_SELF PIDFD_SELF_THREAD +#define PIDFD_SELF_PROCESS PIDFD_SELF_THREAD_GROUP + struct pidfd_info { /* * This mask is similar to the request_mask in statx(2). @@ -62,7 +91,7 @@ struct pidfd_info { __u32 sgid; __u32 fsuid; __u32 fsgid; - __u32 spare0[1]; + __s32 exit_code; }; #define PIDFS_IOCTL_MAGIC 0xFF diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 5c6080680cb2..15c18ef4eb11 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -353,4 +353,15 @@ struct prctl_mm_map { */ #define PR_LOCK_SHADOW_STACK_STATUS 76 +/* + * Controls the mode of timer_create() for CRIU restore operations. + * Enabling this allows CRIU to restore timers with explicit IDs. + * + * Don't use for normal operations as the result might be undefined. + */ +#define PR_TIMER_CREATE_RESTORE_IDS 77 +# define PR_TIMER_CREATE_RESTORE_IDS_OFF 0 +# define PR_TIMER_CREATE_RESTORE_IDS_ON 1 +# define PR_TIMER_CREATE_RESTORE_IDS_GET 2 + #endif /* _LINUX_PRCTL_H */ diff --git a/include/uapi/linux/stddef.h b/include/uapi/linux/stddef.h index a6fce46aeb37..b87df1b485c2 100644 --- a/include/uapi/linux/stddef.h +++ b/include/uapi/linux/stddef.h @@ -70,4 +70,10 @@ #define __counted_by_be(m) #endif +#ifdef __KERNEL__ +#define __kernel_nonstring __nonstring +#else +#define __kernel_nonstring +#endif + #endif /* _UAPI_LINUX_STDDEF_H */ diff --git a/include/vdso/align.h b/include/vdso/align.h new file mode 100644 index 000000000000..02dd8626b5c5 --- /dev/null +++ b/include/vdso/align.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __VDSO_ALIGN_H +#define __VDSO_ALIGN_H + +#include <vdso/const.h> + +/* @a is a power of 2 value */ +#define ALIGN(x, a) __ALIGN_KERNEL((x), (a)) +#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a)) +#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask)) +#define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a))) +#define PTR_ALIGN_DOWN(p, a) ((typeof(p))ALIGN_DOWN((unsigned long)(p), (a))) +#define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0) + +#endif /* __VDSO_ALIGN_H */ diff --git a/include/vdso/cache.h b/include/vdso/cache.h new file mode 100644 index 000000000000..f89d48304bf8 --- /dev/null +++ b/include/vdso/cache.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __VDSO_CACHE_H +#define __VDSO_CACHE_H + +#include <asm/cache.h> + +#ifndef SMP_CACHE_BYTES +#define SMP_CACHE_BYTES L1_CACHE_BYTES +#endif + +#ifndef ____cacheline_aligned +#define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) +#endif + +#endif /* __VDSO_ALIGN_H */ diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h index d967baa0cd0c..1864e76e8f69 100644 --- a/include/vdso/datapage.h +++ b/include/vdso/datapage.h @@ -9,11 +9,14 @@ #include <uapi/linux/types.h> #include <uapi/asm-generic/errno-base.h> +#include <vdso/align.h> #include <vdso/bits.h> +#include <vdso/cache.h> #include <vdso/clocksource.h> #include <vdso/ktime.h> #include <vdso/limits.h> #include <vdso/math64.h> +#include <vdso/page.h> #include <vdso/processor.h> #include <vdso/time.h> #include <vdso/time32.h> @@ -25,6 +28,15 @@ struct arch_vdso_time_data {}; #endif +#if defined(CONFIG_ARCH_HAS_VDSO_ARCH_DATA) +#include <asm/vdso/arch_data.h> +#elif defined(CONFIG_GENERIC_VDSO_DATA_STORE) +struct vdso_arch_data { + /* Needed for the generic code, never actually used at runtime */ + char __unused; +}; +#endif + #define VDSO_BASES (CLOCK_TAI + 1) #define VDSO_HRES (BIT(CLOCK_REALTIME) | \ BIT(CLOCK_MONOTONIC) | \ @@ -45,11 +57,11 @@ struct arch_vdso_time_data {}; * * There is one vdso_timestamp object in vvar for each vDSO-accelerated * clock_id. For high-resolution clocks, this encodes the time - * corresponding to vdso_data.cycle_last. For coarse clocks this encodes + * corresponding to vdso_time_data.cycle_last. For coarse clocks this encodes * the actual time. * * To be noticed that for highres clocks nsec is left-shifted by - * vdso_data.cs[x].shift. + * vdso_time_data[x].shift. */ struct vdso_timestamp { u64 sec; @@ -57,7 +69,7 @@ struct vdso_timestamp { }; /** - * struct vdso_data - vdso datapage representation + * struct vdso_clock - vdso per clocksource datapage representation * @seq: timebase sequence counter * @clock_mode: clock mode * @cycle_last: timebase at clocksource init @@ -67,19 +79,9 @@ struct vdso_timestamp { * @shift: clocksource shift * @basetime[clock_id]: basetime per clock_id * @offset[clock_id]: time namespace offset per clock_id - * @tz_minuteswest: minutes west of Greenwich - * @tz_dsttime: type of DST correction - * @hrtimer_res: hrtimer resolution - * @__unused: unused - * @arch_data: architecture specific data (optional, defaults - * to an empty struct) * - * vdso_data will be accessed by 64 bit and compat code at the same time - * so we should be careful before modifying this structure. - * - * The ordering of the struct members is optimized to have fast access to the - * often required struct members which are related to CLOCK_REALTIME and - * CLOCK_MONOTONIC. This information is stored in the first cache lines. + * See also struct vdso_time_data for basic access and ordering information as + * struct vdso_clock is used there. * * @basetime is used to store the base time for the system wide time getter * VVAR page. @@ -92,7 +94,7 @@ struct vdso_timestamp { * For clocks which are not affected by time namespace adjustment the * offset must be zero. */ -struct vdso_data { +struct vdso_clock { u32 seq; s32 clock_mode; @@ -108,14 +110,35 @@ struct vdso_data { struct vdso_timestamp basetime[VDSO_BASES]; struct timens_offset offset[VDSO_BASES]; }; +}; + +/** + * struct vdso_time_data - vdso datapage representation + * @arch_data: architecture specific data (optional, defaults + * to an empty struct) + * @clock_data: clocksource related data (array) + * @tz_minuteswest: minutes west of Greenwich + * @tz_dsttime: type of DST correction + * @hrtimer_res: hrtimer resolution + * @__unused: unused + * + * vdso_time_data will be accessed by 64 bit and compat code at the same time + * so we should be careful before modifying this structure. + * + * The ordering of the struct members is optimized to have fast acces to the + * often required struct members which are related to CLOCK_REALTIME and + * CLOCK_MONOTONIC. This information is stored in the first cache lines. + */ +struct vdso_time_data { + struct arch_vdso_time_data arch_data; - s32 tz_minuteswest; - s32 tz_dsttime; - u32 hrtimer_res; - u32 __unused; + struct vdso_clock clock_data[CS_BASES]; - struct arch_vdso_time_data arch_data; -}; + s32 tz_minuteswest; + s32 tz_dsttime; + u32 hrtimer_res; + u32 __unused; +} ____cacheline_aligned; /** * struct vdso_rng_data - vdso RNG state information @@ -136,22 +159,32 @@ struct vdso_rng_data { * With the hidden visibility, the compiler simply generates a PC-relative * relocation, and this is what we need. */ -extern struct vdso_data _vdso_data[CS_BASES] __attribute__((visibility("hidden"))); -extern struct vdso_data _timens_data[CS_BASES] __attribute__((visibility("hidden"))); -extern struct vdso_rng_data _vdso_rng_data __attribute__((visibility("hidden"))); - -/** - * union vdso_data_store - Generic vDSO data page - */ -union vdso_data_store { - struct vdso_data data[CS_BASES]; - u8 page[1U << CONFIG_PAGE_SHIFT]; +#ifdef CONFIG_GENERIC_VDSO_DATA_STORE +extern struct vdso_time_data vdso_u_time_data __attribute__((visibility("hidden"))); +extern struct vdso_rng_data vdso_u_rng_data __attribute__((visibility("hidden"))); +extern struct vdso_arch_data vdso_u_arch_data __attribute__((visibility("hidden"))); + +extern struct vdso_time_data *vdso_k_time_data; +extern struct vdso_rng_data *vdso_k_rng_data; +extern struct vdso_arch_data *vdso_k_arch_data; + +#define VDSO_ARCH_DATA_SIZE ALIGN(sizeof(struct vdso_arch_data), PAGE_SIZE) +#define VDSO_ARCH_DATA_PAGES (VDSO_ARCH_DATA_SIZE >> PAGE_SHIFT) + +enum vdso_pages { + VDSO_TIME_PAGE_OFFSET, + VDSO_TIMENS_PAGE_OFFSET, + VDSO_RNG_PAGE_OFFSET, + VDSO_ARCH_PAGES_START, + VDSO_ARCH_PAGES_END = VDSO_ARCH_PAGES_START + VDSO_ARCH_DATA_PAGES - 1, + VDSO_NR_PAGES }; +#endif /* CONFIG_GENERIC_VDSO_DATA_STORE */ + /* * The generic vDSO implementation requires that gettimeofday.h * provides: - * - __arch_get_vdso_data(): to get the vdso datapage. * - __arch_get_hw_counter(): to get the hw counter based on the * clock_mode. * - gettimeofday_fallback(): fallback for gettimeofday. @@ -164,6 +197,27 @@ union vdso_data_store { #include <asm/vdso/gettimeofday.h> #endif /* ENABLE_COMPAT_VDSO */ +#else /* !__ASSEMBLY__ */ + +#ifdef CONFIG_VDSO_GETRANDOM +#define __vdso_u_rng_data PROVIDE(vdso_u_rng_data = vdso_u_data + 2 * PAGE_SIZE); +#else +#define __vdso_u_rng_data +#endif + +#ifdef CONFIG_ARCH_HAS_VDSO_ARCH_DATA +#define __vdso_u_arch_data PROVIDE(vdso_u_arch_data = vdso_u_data + 3 * PAGE_SIZE); +#else +#define __vdso_u_arch_data +#endif + +#define VDSO_VVAR_SYMS \ + PROVIDE(vdso_u_data = . - __VDSO_PAGES * PAGE_SIZE); \ + PROVIDE(vdso_u_time_data = vdso_u_data); \ + __vdso_u_rng_data \ + __vdso_u_arch_data \ + + #endif /* !__ASSEMBLY__ */ #endif /* __VDSO_DATAPAGE_H */ diff --git a/include/vdso/helpers.h b/include/vdso/helpers.h index 3ddb03bb05cb..0a98fed550ba 100644 --- a/include/vdso/helpers.h +++ b/include/vdso/helpers.h @@ -7,49 +7,53 @@ #include <asm/barrier.h> #include <vdso/datapage.h> -static __always_inline u32 vdso_read_begin(const struct vdso_data *vd) +static __always_inline u32 vdso_read_begin(const struct vdso_clock *vc) { u32 seq; - while (unlikely((seq = READ_ONCE(vd->seq)) & 1)) + while (unlikely((seq = READ_ONCE(vc->seq)) & 1)) cpu_relax(); smp_rmb(); return seq; } -static __always_inline u32 vdso_read_retry(const struct vdso_data *vd, +static __always_inline u32 vdso_read_retry(const struct vdso_clock *vc, u32 start) { u32 seq; smp_rmb(); - seq = READ_ONCE(vd->seq); + seq = READ_ONCE(vc->seq); return seq != start; } -static __always_inline void vdso_write_begin(struct vdso_data *vd) +static __always_inline void vdso_write_begin(struct vdso_time_data *vd) { + struct vdso_clock *vc = vd->clock_data; + /* * WRITE_ONCE() is required otherwise the compiler can validly tear * updates to vd[x].seq and it is possible that the value seen by the * reader is inconsistent. */ - WRITE_ONCE(vd[CS_HRES_COARSE].seq, vd[CS_HRES_COARSE].seq + 1); - WRITE_ONCE(vd[CS_RAW].seq, vd[CS_RAW].seq + 1); + WRITE_ONCE(vc[CS_HRES_COARSE].seq, vc[CS_HRES_COARSE].seq + 1); + WRITE_ONCE(vc[CS_RAW].seq, vc[CS_RAW].seq + 1); smp_wmb(); } -static __always_inline void vdso_write_end(struct vdso_data *vd) +static __always_inline void vdso_write_end(struct vdso_time_data *vd) { + struct vdso_clock *vc = vd->clock_data; + smp_wmb(); /* * WRITE_ONCE() is required otherwise the compiler can validly tear * updates to vd[x].seq and it is possible that the value seen by the * reader is inconsistent. */ - WRITE_ONCE(vd[CS_HRES_COARSE].seq, vd[CS_HRES_COARSE].seq + 1); - WRITE_ONCE(vd[CS_RAW].seq, vd[CS_RAW].seq + 1); + WRITE_ONCE(vc[CS_HRES_COARSE].seq, vc[CS_HRES_COARSE].seq + 1); + WRITE_ONCE(vc[CS_RAW].seq, vc[CS_RAW].seq + 1); } #endif /* !__ASSEMBLY__ */ diff --git a/include/xen/interface/xen-mca.h b/include/xen/interface/xen-mca.h index 464aa6b3a5f9..1c9afbe8cc26 100644 --- a/include/xen/interface/xen-mca.h +++ b/include/xen/interface/xen-mca.h @@ -372,7 +372,7 @@ struct xen_mce { #define XEN_MCE_LOG_LEN 32 struct xen_mce_log { - char signature[12]; /* "MACHINECHECK" */ + char signature[12] __nonstring; /* "MACHINECHECK" */ unsigned len; /* = XEN_MCE_LOG_LEN */ unsigned next; unsigned flags; |