Diffstat (limited to 'arch/x86/include/asm')
90 files changed, 1140 insertions, 1058 deletions
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 4f1ce5fc4e19..a192bdea69e2 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -10,5 +10,4 @@ generated-y += unistd_64_x32.h generated-y += xen-hypercalls.h generic-y += early_ioremap.h -generic-y += export.h generic-y += mcs_spinlock.h diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index c8a7fc23f63c..f896eed4516c 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -16,6 +16,9 @@ #include <asm/x86_init.h> #include <asm/cpufeature.h> #include <asm/irq_vectors.h> +#include <asm/xen/hypervisor.h> + +#include <xen/xen.h> #ifdef CONFIG_ACPI_APEI # include <asm/pgtable_types.h> @@ -127,6 +130,17 @@ static inline void arch_acpi_set_proc_cap_bits(u32 *cap) if (!cpu_has(c, X86_FEATURE_MWAIT) || boot_option_idle_override == IDLE_NOMWAIT) *cap &= ~(ACPI_PROC_CAP_C_C1_FFH | ACPI_PROC_CAP_C_C2C3_FFH); + + if (xen_initial_domain()) { + /* + * When Linux is running as Xen dom0, the hypervisor is the + * entity in charge of the processor power management, and so + * Xen needs to check the OS capabilities reported in the + * processor capabilities buffer matches what the hypervisor + * driver supports. + */ + xen_sanitize_proc_cap_bits(cap); + } } static inline bool acpi_has_cpu_in_madt(void) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 9c4da699e11a..fcd20c6dc7f9 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -10,6 +10,9 @@ #define ALT_FLAG_NOT (1 << 0) #define ALT_NOT(feature) ((ALT_FLAG_NOT << ALT_FLAGS_SHIFT) | (feature)) +#define ALT_FLAG_DIRECT_CALL (1 << 1) +#define ALT_DIRECT_CALL(feature) ((ALT_FLAG_DIRECT_CALL << ALT_FLAGS_SHIFT) | (feature)) +#define ALT_CALL_ALWAYS ALT_DIRECT_CALL(X86_FEATURE_ALWAYS) #ifndef __ASSEMBLY__ @@ -58,7 +61,7 @@ #define ANNOTATE_IGNORE_ALTERNATIVE \ "999:\n\t" \ ".pushsection .discard.ignore_alts\n\t" \ - ".long 999b - .\n\t" \ + ".long 999b\n\t" \ ".popsection\n\t" /* @@ -86,6 +89,8 @@ struct alt_instr { u8 replacementlen; /* length of new instruction */ } __packed; +extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; + /* * Debug flag that can be tested to see whether alternative * instructions were patched in already: @@ -101,11 +106,10 @@ extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine, s32 *start_cfi, s32 *end_cfi); struct module; -struct paravirt_patch_site; struct callthunk_sites { s32 *call_start, *call_end; - struct paravirt_patch_site *pv_start, *pv_end; + struct alt_instr *alt_start, *alt_end; }; #ifdef CONFIG_CALL_THUNKS @@ -150,6 +154,8 @@ static inline int alternatives_text_reserved(void *start, void *end) } #endif /* CONFIG_SMP */ +#define ALT_CALL_INSTR "call BUG_func" + #define b_replacement(num) "664"#num #define e_replacement(num) "665"#num @@ -330,6 +336,22 @@ static inline int alternatives_text_reserved(void *start, void *end) */ #define ASM_NO_INPUT_CLOBBER(clbr...) "i" (0) : clbr +/* Macro for creating assembler functions avoiding any C magic. */ +#define DEFINE_ASM_FUNC(func, instr, sec) \ + asm (".pushsection " #sec ", \"ax\"\n" \ + ".global " #func "\n\t" \ + ".type " #func ", @function\n\t" \ + ASM_FUNC_ALIGN "\n" \ + #func ":\n\t" \ + ASM_ENDBR \ + instr "\n\t" \ + ASM_RET \ + ".size " #func ", . 
- " #func "\n\t" \ + ".popsection") + +void BUG_func(void); +void nop_func(void); + #else /* __ASSEMBLY__ */ #ifdef CONFIG_SMP @@ -352,7 +374,7 @@ static inline int alternatives_text_reserved(void *start, void *end) .macro ANNOTATE_IGNORE_ALTERNATIVE .Lannotate_\@: .pushsection .discard.ignore_alts - .long .Lannotate_\@ - . + .long .Lannotate_\@ .popsection .endm @@ -370,6 +392,10 @@ static inline int alternatives_text_reserved(void *start, void *end) .byte \alt_len .endm +.macro ALT_CALL_INSTR + call BUG_func +.endm + /* * Define an alternative between two instructions. If @feature is * present, early code in apply_alternatives() replaces @oldinstr with diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index ed0eaf65c437..5c37944c8a5e 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -104,7 +104,7 @@ static inline bool amd_gart_present(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) return false; - /* GART present only on Fam15h, upto model 0fh */ + /* GART present only on Fam15h, up to model 0fh */ if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10)) return true; diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 5af4ec1a0f71..9d159b771dc8 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -54,7 +54,7 @@ extern int local_apic_timer_c2_ok; extern bool apic_is_disabled; extern unsigned int lapic_timer_period; -extern int cpuid_to_apicid[]; +extern u32 cpuid_to_apicid[]; extern enum apic_intr_mode_id apic_intr_mode; enum apic_intr_mode_id { @@ -272,11 +272,10 @@ struct apic { void (*send_IPI_all)(int vector); void (*send_IPI_self)(int vector); - enum apic_delivery_modes delivery_mode; - u32 disable_esr : 1, dest_mode_logical : 1, - x2apic_set_max_apicid : 1; + x2apic_set_max_apicid : 1, + nmi_to_offline_cpu : 1; u32 (*calc_dest_apicid)(unsigned int cpu); @@ -292,19 +291,19 @@ struct apic { int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); bool (*apic_id_registered)(void); - bool (*check_apicid_used)(physid_mask_t *map, int apicid); + bool (*check_apicid_used)(physid_mask_t *map, u32 apicid); void (*init_apic_ldr)(void); void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap); - int (*cpu_present_to_apicid)(int mps_cpu); - int (*phys_pkg_id)(int cpuid_apic, int index_msb); + u32 (*cpu_present_to_apicid)(int mps_cpu); + u32 (*phys_pkg_id)(u32 cpuid_apic, int index_msb); - u32 (*get_apic_id)(unsigned long x); - u32 (*set_apic_id)(unsigned int id); + u32 (*get_apic_id)(u32 id); + u32 (*set_apic_id)(u32 apicid); /* wakeup_secondary_cpu */ - int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip); + int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip); /* wakeup secondary CPU using 64-bit wakeup point */ - int (*wakeup_secondary_cpu_64)(int apicid, unsigned long start_eip); + int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip); char *name; }; @@ -322,8 +321,8 @@ struct apic_override { void (*send_IPI_self)(int vector); u64 (*icr_read)(void); void (*icr_write)(u32 low, u32 high); - int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip); - int (*wakeup_secondary_cpu_64)(int apicid, unsigned long start_eip); + int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip); + int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip); }; /* @@ -493,16 +492,6 @@ static inline bool lapic_vector_set_in_irr(unsigned int vector) 
return !!(irr & (1U << (vector % 32))); } -static inline unsigned default_get_apic_id(unsigned long x) -{ - unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - - if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID)) - return (x >> 24) & 0xFF; - else - return (x >> 24) & 0x0F; -} - /* * Warm reset vector position: */ @@ -517,9 +506,9 @@ extern void generic_bigsmp_probe(void); extern struct apic apic_noop; -static inline unsigned int read_apic_id(void) +static inline u32 read_apic_id(void) { - unsigned int reg = apic_read(APIC_ID); + u32 reg = apic_read(APIC_ID); return apic->get_apic_id(reg); } @@ -538,13 +527,14 @@ extern int default_apic_id_valid(u32 apicid); extern u32 apic_default_calc_apicid(unsigned int cpu); extern u32 apic_flat_calc_apicid(unsigned int cpu); -extern bool default_check_apicid_used(physid_mask_t *map, int apicid); extern void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap); -extern int default_cpu_present_to_apicid(int mps_cpu); +extern u32 default_cpu_present_to_apicid(int mps_cpu); + +void apic_send_nmi_to_offline_cpu(unsigned int cpu); #else /* CONFIG_X86_LOCAL_APIC */ -static inline unsigned int read_apic_id(void) { return 0; } +static inline u32 read_apic_id(void) { return 0; } #endif /* !CONFIG_X86_LOCAL_APIC */ diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 4b125e5b3187..094106b6a538 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -20,6 +20,13 @@ */ #define IO_APIC_SLOT_SIZE 1024 +#define APIC_DELIVERY_MODE_FIXED 0 +#define APIC_DELIVERY_MODE_LOWESTPRIO 1 +#define APIC_DELIVERY_MODE_SMI 2 +#define APIC_DELIVERY_MODE_NMI 4 +#define APIC_DELIVERY_MODE_INIT 5 +#define APIC_DELIVERY_MODE_EXTINT 7 + #define APIC_ID 0x20 #define APIC_LVR 0x30 @@ -165,279 +172,10 @@ #define APIC_CPUID(apicid) ((apicid) & XAPIC_DEST_CPUS_MASK) #define NUM_APIC_CLUSTERS ((BAD_APICID + 1) >> XAPIC_DEST_CPUS_SHIFT) -#ifndef __ASSEMBLY__ -/* - * the local APIC register structure, memory mapped. Not terribly well - * tested, but we might eventually use this one in the future - the - * problem why we cannot use it right now is the P5 APIC, it has an - * errata which cannot take 8-bit reads and writes, only 32-bit ones ... 
- */ -#define u32 unsigned int - -struct local_apic { - -/*000*/ struct { u32 __reserved[4]; } __reserved_01; - -/*010*/ struct { u32 __reserved[4]; } __reserved_02; - -/*020*/ struct { /* APIC ID Register */ - u32 __reserved_1 : 24, - phys_apic_id : 4, - __reserved_2 : 4; - u32 __reserved[3]; - } id; - -/*030*/ const - struct { /* APIC Version Register */ - u32 version : 8, - __reserved_1 : 8, - max_lvt : 8, - __reserved_2 : 8; - u32 __reserved[3]; - } version; - -/*040*/ struct { u32 __reserved[4]; } __reserved_03; - -/*050*/ struct { u32 __reserved[4]; } __reserved_04; - -/*060*/ struct { u32 __reserved[4]; } __reserved_05; - -/*070*/ struct { u32 __reserved[4]; } __reserved_06; - -/*080*/ struct { /* Task Priority Register */ - u32 priority : 8, - __reserved_1 : 24; - u32 __reserved_2[3]; - } tpr; - -/*090*/ const - struct { /* Arbitration Priority Register */ - u32 priority : 8, - __reserved_1 : 24; - u32 __reserved_2[3]; - } apr; - -/*0A0*/ const - struct { /* Processor Priority Register */ - u32 priority : 8, - __reserved_1 : 24; - u32 __reserved_2[3]; - } ppr; - -/*0B0*/ struct { /* End Of Interrupt Register */ - u32 eoi; - u32 __reserved[3]; - } eoi; - -/*0C0*/ struct { u32 __reserved[4]; } __reserved_07; - -/*0D0*/ struct { /* Logical Destination Register */ - u32 __reserved_1 : 24, - logical_dest : 8; - u32 __reserved_2[3]; - } ldr; - -/*0E0*/ struct { /* Destination Format Register */ - u32 __reserved_1 : 28, - model : 4; - u32 __reserved_2[3]; - } dfr; - -/*0F0*/ struct { /* Spurious Interrupt Vector Register */ - u32 spurious_vector : 8, - apic_enabled : 1, - focus_cpu : 1, - __reserved_2 : 22; - u32 __reserved_3[3]; - } svr; - -/*100*/ struct { /* In Service Register */ -/*170*/ u32 bitfield; - u32 __reserved[3]; - } isr [8]; - -/*180*/ struct { /* Trigger Mode Register */ -/*1F0*/ u32 bitfield; - u32 __reserved[3]; - } tmr [8]; - -/*200*/ struct { /* Interrupt Request Register */ -/*270*/ u32 bitfield; - u32 __reserved[3]; - } irr [8]; - -/*280*/ union { /* Error Status Register */ - struct { - u32 send_cs_error : 1, - receive_cs_error : 1, - send_accept_error : 1, - receive_accept_error : 1, - __reserved_1 : 1, - send_illegal_vector : 1, - receive_illegal_vector : 1, - illegal_register_address : 1, - __reserved_2 : 24; - u32 __reserved_3[3]; - } error_bits; - struct { - u32 errors; - u32 __reserved_3[3]; - } all_errors; - } esr; - -/*290*/ struct { u32 __reserved[4]; } __reserved_08; - -/*2A0*/ struct { u32 __reserved[4]; } __reserved_09; - -/*2B0*/ struct { u32 __reserved[4]; } __reserved_10; - -/*2C0*/ struct { u32 __reserved[4]; } __reserved_11; - -/*2D0*/ struct { u32 __reserved[4]; } __reserved_12; - -/*2E0*/ struct { u32 __reserved[4]; } __reserved_13; - -/*2F0*/ struct { u32 __reserved[4]; } __reserved_14; - -/*300*/ struct { /* Interrupt Command Register 1 */ - u32 vector : 8, - delivery_mode : 3, - destination_mode : 1, - delivery_status : 1, - __reserved_1 : 1, - level : 1, - trigger : 1, - __reserved_2 : 2, - shorthand : 2, - __reserved_3 : 12; - u32 __reserved_4[3]; - } icr1; - -/*310*/ struct { /* Interrupt Command Register 2 */ - union { - u32 __reserved_1 : 24, - phys_dest : 4, - __reserved_2 : 4; - u32 __reserved_3 : 24, - logical_dest : 8; - } dest; - u32 __reserved_4[3]; - } icr2; - -/*320*/ struct { /* LVT - Timer */ - u32 vector : 8, - __reserved_1 : 4, - delivery_status : 1, - __reserved_2 : 3, - mask : 1, - timer_mode : 1, - __reserved_3 : 14; - u32 __reserved_4[3]; - } lvt_timer; - -/*330*/ struct { /* LVT - Thermal Sensor */ - u32 vector : 8, - 
delivery_mode : 3, - __reserved_1 : 1, - delivery_status : 1, - __reserved_2 : 3, - mask : 1, - __reserved_3 : 15; - u32 __reserved_4[3]; - } lvt_thermal; - -/*340*/ struct { /* LVT - Performance Counter */ - u32 vector : 8, - delivery_mode : 3, - __reserved_1 : 1, - delivery_status : 1, - __reserved_2 : 3, - mask : 1, - __reserved_3 : 15; - u32 __reserved_4[3]; - } lvt_pc; - -/*350*/ struct { /* LVT - LINT0 */ - u32 vector : 8, - delivery_mode : 3, - __reserved_1 : 1, - delivery_status : 1, - polarity : 1, - remote_irr : 1, - trigger : 1, - mask : 1, - __reserved_2 : 15; - u32 __reserved_3[3]; - } lvt_lint0; - -/*360*/ struct { /* LVT - LINT1 */ - u32 vector : 8, - delivery_mode : 3, - __reserved_1 : 1, - delivery_status : 1, - polarity : 1, - remote_irr : 1, - trigger : 1, - mask : 1, - __reserved_2 : 15; - u32 __reserved_3[3]; - } lvt_lint1; - -/*370*/ struct { /* LVT - Error */ - u32 vector : 8, - __reserved_1 : 4, - delivery_status : 1, - __reserved_2 : 3, - mask : 1, - __reserved_3 : 15; - u32 __reserved_4[3]; - } lvt_error; - -/*380*/ struct { /* Timer Initial Count Register */ - u32 initial_count; - u32 __reserved_2[3]; - } timer_icr; - -/*390*/ const - struct { /* Timer Current Count Register */ - u32 curr_count; - u32 __reserved_2[3]; - } timer_ccr; - -/*3A0*/ struct { u32 __reserved[4]; } __reserved_16; - -/*3B0*/ struct { u32 __reserved[4]; } __reserved_17; - -/*3C0*/ struct { u32 __reserved[4]; } __reserved_18; - -/*3D0*/ struct { u32 __reserved[4]; } __reserved_19; - -/*3E0*/ struct { /* Timer Divide Configuration Register */ - u32 divisor : 4, - __reserved_1 : 28; - u32 __reserved_2[3]; - } timer_dcr; - -/*3F0*/ struct { u32 __reserved[4]; } __reserved_20; - -} __attribute__ ((packed)); - -#undef u32 - #ifdef CONFIG_X86_32 #define BAD_APICID 0xFFu #else #define BAD_APICID 0xFFFFu #endif -enum apic_delivery_modes { - APIC_DELIVERY_MODE_FIXED = 0, - APIC_DELIVERY_MODE_LOWESTPRIO = 1, - APIC_DELIVERY_MODE_SMI = 2, - APIC_DELIVERY_MODE_NMI = 4, - APIC_DELIVERY_MODE_INIT = 5, - APIC_DELIVERY_MODE_EXTINT = 7, -}; - -#endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_APICDEF_H */ diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 35389b2af88e..0216f63a366b 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -81,22 +81,4 @@ do { \ #include <asm-generic/barrier.h> -/* - * Make previous memory operations globally visible before - * a WRMSR. - * - * MFENCE makes writes visible, but only affects load/store - * instructions. WRMSR is unfortunately not a load/store - * instruction and is unaffected by MFENCE. The LFENCE ensures - * that the WRMSR is not reordered. - * - * Most WRMSRs are full serializing instructions themselves and - * do not require this barrier. This is only required for the - * IA32_TSC_DEADLINE and X2APIC MSRs. 
- */ -static inline void weak_wrmsr_fence(void) -{ - asm volatile("mfence; lfence" : : : "memory"); -} - #endif /* _ASM_X86_BARRIER_H */ diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 50e5ebf9d0a0..990eb686ca67 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -94,18 +94,17 @@ arch___clear_bit(unsigned long nr, volatile unsigned long *addr) asm volatile(__ASM_SIZE(btr) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } -static __always_inline bool -arch_clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr) +static __always_inline bool arch_xor_unlock_is_negative_byte(unsigned long mask, + volatile unsigned long *addr) { bool negative; - asm volatile(LOCK_PREFIX "andb %2,%1" + asm volatile(LOCK_PREFIX "xorb %2,%1" CC_SET(s) : CC_OUT(s) (negative), WBYTE_ADDR(addr) - : "ir" ((char) ~(1 << nr)) : "memory"); + : "iq" ((char)mask) : "memory"); return negative; } -#define arch_clear_bit_unlock_is_negative_byte \ - arch_clear_bit_unlock_is_negative_byte +#define arch_xor_unlock_is_negative_byte arch_xor_unlock_is_negative_byte static __always_inline void arch___clear_bit_unlock(long nr, volatile unsigned long *addr) diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 4ae14339cb8c..a38cc0afc90a 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -40,23 +40,40 @@ #ifdef CONFIG_X86_64 # define BOOT_STACK_SIZE 0x4000 +/* + * Used by decompressor's startup_32() to allocate page tables for identity + * mapping of the 4G of RAM in 4-level paging mode: + * - 1 level4 table; + * - 1 level3 table; + * - 4 level2 table that maps everything with 2M pages; + * + * The additional level5 table needed for 5-level paging is allocated from + * trampoline_32bit memory. + */ # define BOOT_INIT_PGT_SIZE (6*4096) -# ifdef CONFIG_RANDOMIZE_BASE + /* - * Assuming all cross the 512GB boundary: - * 1 page for level4 - * (2+2)*4 pages for kernel, param, cmd_line, and randomized kernel - * 2 pages for first 2M (video RAM: CONFIG_X86_VERBOSE_BOOTUP). - * Total is 19 pages. + * Total number of page tables kernel_add_identity_map() can allocate, + * including page tables consumed by startup_32(). + * + * Worst-case scenario: + * - 5-level paging needs 1 level5 table; + * - KASLR needs to map kernel, boot_params, cmdline and randomized kernel, + * assuming all of them cross 256T boundary: + * + 4*2 level4 table; + * + 4*2 level3 table; + * + 4*2 level2 table; + * - X86_VERBOSE_BOOTUP needs to map the first 2M (video RAM): + * + 1 level4 table; + * + 1 level3 table; + * + 1 level2 table; + * Total: 28 tables + * + * Add 4 spare table in case decompressor touches anything beyond what is + * accounted above. Warn if it happens. 
*/ -# ifdef CONFIG_X86_VERBOSE_BOOTUP -# define BOOT_PGT_SIZE (19*4096) -# else /* !CONFIG_X86_VERBOSE_BOOTUP */ -# define BOOT_PGT_SIZE (17*4096) -# endif -# else /* !CONFIG_RANDOMIZE_BASE */ -# define BOOT_PGT_SIZE BOOT_INIT_PGT_SIZE -# endif +# define BOOT_PGT_SIZE_WARN (28*4096) +# define BOOT_PGT_SIZE (32*4096) #else /* !CONFIG_X86_64 */ # define BOOT_STACK_SIZE 0x1000 @@ -68,6 +85,8 @@ extern const unsigned long kernel_total_size; unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr, void (*error)(char *x)); + +extern struct boot_params *boot_params_ptr; #endif #endif /* _ASM_X86_BOOT_H */ diff --git a/arch/x86/include/asm/cacheinfo.h b/arch/x86/include/asm/cacheinfo.h index ce9685fc78d8..5aa061199866 100644 --- a/arch/x86/include/asm/cacheinfo.h +++ b/arch/x86/include/asm/cacheinfo.h @@ -7,9 +7,6 @@ extern unsigned int memory_caching_control; #define CACHE_MTRR 0x01 #define CACHE_PAT 0x02 -void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu); -void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu); - void cache_disable(void); void cache_enable(void); void set_cache_aps_delayed_init(bool val); diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h index 58dacd90daef..7cd752557905 100644 --- a/arch/x86/include/asm/cfi.h +++ b/arch/x86/include/asm/cfi.h @@ -7,16 +7,140 @@ * * Copyright (C) 2022 Google LLC */ +#include <linux/bug.h> +#include <asm/ibt.h> -#include <linux/cfi.h> +/* + * An overview of the various calling conventions... + * + * Traditional: + * + * foo: + * ... code here ... + * ret + * + * direct caller: + * call foo + * + * indirect caller: + * lea foo(%rip), %r11 + * ... + * call *%r11 + * + * + * IBT: + * + * foo: + * endbr64 + * ... code here ... + * ret + * + * direct caller: + * call foo / call foo+4 + * + * indirect caller: + * lea foo(%rip), %r11 + * ... + * call *%r11 + * + * + * kCFI: + * + * __cfi_foo: + * movl $0x12345678, %eax + * # 11 nops when CONFIG_CALL_PADDING + * foo: + * endbr64 # when IBT + * ... code here ... + * ret + * + * direct call: + * call foo # / call foo+4 when IBT + * + * indirect call: + * lea foo(%rip), %r11 + * ... + * movl $(-0x12345678), %r10d + * addl -4(%r11), %r10d # -15 when CONFIG_CALL_PADDING + * jz 1f + * ud2 + * 1:call *%r11 + * + * + * FineIBT (builds as kCFI + CALL_PADDING + IBT + RETPOLINE and runtime patches into): + * + * __cfi_foo: + * endbr64 + * subl 0x12345678, %r10d + * jz foo + * ud2 + * nop + * foo: + * osp nop3 # was endbr64 + * ... code here ... + * ret + * + * direct caller: + * call foo / call foo+4 + * + * indirect caller: + * lea foo(%rip), %r11 + * ... 
+ * movl $0x12345678, %r10d + * subl $16, %r11 + * nop4 + * call *%r11 + * + */ +enum cfi_mode { + CFI_DEFAULT, /* FineIBT if hardware has IBT, otherwise kCFI */ + CFI_OFF, /* Taditional / IBT depending on .config */ + CFI_KCFI, /* Optionally CALL_PADDING, IBT, RETPOLINE */ + CFI_FINEIBT, /* see arch/x86/kernel/alternative.c */ +}; + +extern enum cfi_mode cfi_mode; + +struct pt_regs; #ifdef CONFIG_CFI_CLANG enum bug_trap_type handle_cfi_failure(struct pt_regs *regs); +#define __bpfcall +extern u32 cfi_bpf_hash; +extern u32 cfi_bpf_subprog_hash; + +static inline int cfi_get_offset(void) +{ + switch (cfi_mode) { + case CFI_FINEIBT: + return 16; + case CFI_KCFI: + if (IS_ENABLED(CONFIG_CALL_PADDING)) + return 16; + return 5; + default: + return 0; + } +} +#define cfi_get_offset cfi_get_offset + +extern u32 cfi_get_func_hash(void *func); + #else static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs) { return BUG_TRAP_TYPE_NONE; } +#define cfi_bpf_hash 0U +#define cfi_bpf_subprog_hash 0U +static inline u32 cfi_get_func_hash(void *func) +{ + return 0; +} #endif /* CONFIG_CFI_CLANG */ +#if HAS_KERNEL_IBT == 1 +#define CFI_NOSEAL(x) asm(IBT_NOSEAL(__stringify(x))) +#endif + #endif /* _ASM_X86_CFI_H */ diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index d53636506134..5612648b0202 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -221,12 +221,18 @@ extern void __add_wrong_size(void) #define __try_cmpxchg(ptr, pold, new, size) \ __raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX) +#define __sync_try_cmpxchg(ptr, pold, new, size) \ + __raw_try_cmpxchg((ptr), (pold), (new), (size), "lock; ") + #define __try_cmpxchg_local(ptr, pold, new, size) \ __raw_try_cmpxchg((ptr), (pold), (new), (size), "") #define arch_try_cmpxchg(ptr, pold, new) \ __try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr))) +#define arch_sync_try_cmpxchg(ptr, pold, new) \ + __sync_try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr))) + #define arch_try_cmpxchg_local(ptr, pold, new) \ __try_cmpxchg_local((ptr), (pold), (new), sizeof(*(ptr))) diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h index 6ae2d16a7613..76c310b19b11 100644 --- a/arch/x86/include/asm/coco.h +++ b/arch/x86/include/asm/coco.h @@ -10,13 +10,14 @@ enum cc_vendor { CC_VENDOR_INTEL, }; -extern enum cc_vendor cc_vendor; - #ifdef CONFIG_ARCH_HAS_CC_PLATFORM +extern enum cc_vendor cc_vendor; void cc_set_mask(u64 mask); u64 cc_mkenc(u64 val); u64 cc_mkdec(u64 val); #else +#define cc_vendor (CC_VENDOR_NONE) + static inline u64 cc_mkenc(u64 val) { return val; diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 3a233ebff712..f8f9a9b79395 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -23,13 +23,7 @@ static inline void prefill_possible_map(void) {} #endif /* CONFIG_SMP */ -struct x86_cpu { - struct cpu cpu; -}; - #ifdef CONFIG_HOTPLUG_CPU -extern int arch_register_cpu(int num); -extern void arch_unregister_cpu(int); extern void soft_restart_cpu(void); #endif @@ -73,26 +67,12 @@ static inline void init_ia32_feat_ctl(struct cpuinfo_x86 *c) {} extern __noendbr void cet_disable(void); -struct ucode_cpu_info; - -int intel_cpu_collect_info(struct ucode_cpu_info *uci); - -static inline bool intel_cpu_signatures_match(unsigned int s1, unsigned int p1, - unsigned int s2, unsigned int p2) -{ - if (s1 != s2) - return false; - - /* Processor flags are either both 0 ... 
*/ - if (!p1 && !p2) - return true; +struct cpu_signature; - /* ... or they intersect. */ - return p1 & p2; -} +void intel_collect_cpu_info(struct cpu_signature *sig); extern u64 x86_read_arch_cap_msr(void); -int intel_find_matching_signature(void *mc, unsigned int csig, int cpf); +bool intel_find_matching_signature(void *mc, struct cpu_signature *sig); int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type); extern struct cpumask cpus_stop_mask; diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index a26bebbdff87..a1273698fc43 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -168,7 +168,7 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); */ static __always_inline bool _static_cpu_has(u16 bit) { - asm_volatile_goto( + asm goto( ALTERNATIVE_TERNARY("jmp 6f", %P[feature], "", "jmp %l[t_no]") ".pushsection .altinstr_aux,\"ax\"\n" "6:\n" diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 2061ed1c398f..fdf723b6f6d0 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -81,10 +81,8 @@ #define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ #define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ #define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ - -/* CPU types for specific tunings: */ #define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ -/* FREE, was #define X86_FEATURE_K7 ( 3*32+ 5) "" Athlon */ +#define X86_FEATURE_ZEN5 ( 3*32+ 5) /* "" CPU based on Zen5 microarchitecture */ #define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ #define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ #define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ @@ -198,6 +196,7 @@ #define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ #define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ #define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ +#define X86_FEATURE_TDX_HOST_PLATFORM ( 7*32+ 7) /* Platform supports being a TDX host */ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ #define X86_FEATURE_XCOMPACTED ( 7*32+10) /* "" Use compacted XSTATE (XSAVES or XSAVEC) */ @@ -218,7 +217,7 @@ #define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */ #define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ #define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_ZEN (7*32+28) /* "" CPU based on Zen microarchitecture */ +#define X86_FEATURE_ZEN ( 7*32+28) /* "" Generic flag for all Zen and newer */ #define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */ #define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */ #define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* "" MSR IA32_FEAT_CTL configured */ @@ -308,10 +307,14 @@ #define X86_FEATURE_SMBA (11*32+21) /* "" Slow Memory Bandwidth Allocation */ #define X86_FEATURE_BMEC (11*32+22) /* "" Bandwidth Monitoring Event Configuration */ #define X86_FEATURE_USER_SHSTK (11*32+23) /* Shadow stack support for user mode applications */ - #define X86_FEATURE_SRSO (11*32+24) /* "" AMD BTB untrain RETs */ #define X86_FEATURE_SRSO_ALIAS (11*32+25) /* "" AMD BTB untrain RETs through aliasing */ #define X86_FEATURE_IBPB_ON_VMEXIT (11*32+26) /* "" Issue an IBPB only on VMEXIT */ 
+#define X86_FEATURE_APIC_MSRS_FENCE (11*32+27) /* "" IA32_TSC_DEADLINE and X2APIC MSRs need fencing */ +#define X86_FEATURE_ZEN2 (11*32+28) /* "" CPU based on Zen2 microarchitecture */ +#define X86_FEATURE_ZEN3 (11*32+29) /* "" CPU based on Zen3 microarchitecture */ +#define X86_FEATURE_ZEN4 (11*32+30) /* "" CPU based on Zen4 microarchitecture */ +#define X86_FEATURE_ZEN1 (11*32+31) /* "" CPU based on Zen1 microarchitecture */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ @@ -439,9 +442,11 @@ #define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */ #define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* "" Virtual TSC_AUX */ #define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */ +#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* AMD SEV-ES full debug state swap support */ /* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */ #define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* "" No Nested Data Breakpoints */ +#define X86_FEATURE_WRMSR_XX_BASE_NS (20*32+ 1) /* "" WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing */ #define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* "" LFENCE always serializing / synchronizes RDTSC */ #define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* "" Null Selector Clears Base */ #define X86_FEATURE_AUTOIBRS (20*32+ 8) /* "" Automatic IBRS */ @@ -493,6 +498,7 @@ #define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */ #define X86_BUG_SMT_RSB X86_BUG(29) /* CPU is vulnerable to Cross-Thread Return Address Predictions */ #define X86_BUG_GDS X86_BUG(30) /* CPU is affected by Gather Data Sampling */ +#define X86_BUG_TDX_PW_MCE X86_BUG(31) /* CPU may incur #MC if non-TD software does partial write to TDX private memory */ /* BUG word 2 */ #define X86_BUG_SRSO X86_BUG(1*32 + 0) /* AMD SRSO bug */ diff --git a/arch/x86/include/asm/crash_core.h b/arch/x86/include/asm/crash_core.h new file mode 100644 index 000000000000..76af98f4e801 --- /dev/null +++ b/arch/x86/include/asm/crash_core.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _X86_CRASH_CORE_H +#define _X86_CRASH_CORE_H + +/* 16M alignment for crash kernel regions */ +#define CRASH_ALIGN SZ_16M + +/* + * Keep the crash kernel below this limit. + * + * Earlier 32-bits kernels would limit the kernel to the low 512 MB range + * due to mapping restrictions. + * + * 64-bit kdump kernels need to be restricted to be under 64 TB, which is + * the upper limit of system RAM in 4-level paging mode. Since the kdump + * jump could be from 5-level paging to 4-level paging, the jump will fail if + * the kernel is put above 64 TB, and during the 1st kernel bootup there's + * no good way to detect the paging mode of the target kernel which will be + * loaded for dumping. 
+ */ +extern unsigned long swiotlb_size_or_default(void); + +#ifdef CONFIG_X86_32 +# define CRASH_ADDR_LOW_MAX SZ_512M +# define CRASH_ADDR_HIGH_MAX SZ_512M +#else +# define CRASH_ADDR_LOW_MAX SZ_4G +# define CRASH_ADDR_HIGH_MAX SZ_64T +#endif + +# define DEFAULT_CRASH_KERNEL_LOW_SIZE crash_low_size_default() + +static inline unsigned long crash_low_size_default(void) +{ +#ifdef CONFIG_X86_64 + return max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20); +#else + return 0; +#endif +} + +#endif /* _X86_CRASH_CORE_H */ diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index c8c5674d69f6..fb7702d4170c 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -2,6 +2,7 @@ #ifndef _ASM_X86_CURRENT_H #define _ASM_X86_CURRENT_H +#include <linux/build_bug.h> #include <linux/compiler.h> #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 66eb5e1ac4fb..0cec92c430cc 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -5,6 +5,7 @@ #include <linux/bug.h> #include <linux/percpu.h> #include <uapi/asm/debugreg.h> +#include <asm/cpufeature.h> DECLARE_PER_CPU(unsigned long, cpu_dr7); diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index f7e7099af595..d440a65af8f3 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -8,6 +8,56 @@ * archs. */ +/* + * Low-level interface mapping flags/field names to bits + */ + +/* Flags for _DESC_S (non-system) descriptors */ +#define _DESC_ACCESSED 0x0001 +#define _DESC_DATA_WRITABLE 0x0002 +#define _DESC_CODE_READABLE 0x0002 +#define _DESC_DATA_EXPAND_DOWN 0x0004 +#define _DESC_CODE_CONFORMING 0x0004 +#define _DESC_CODE_EXECUTABLE 0x0008 + +/* Common flags */ +#define _DESC_S 0x0010 +#define _DESC_DPL(dpl) ((dpl) << 5) +#define _DESC_PRESENT 0x0080 + +#define _DESC_LONG_CODE 0x2000 +#define _DESC_DB 0x4000 +#define _DESC_GRANULARITY_4K 0x8000 + +/* System descriptors have a numeric "type" field instead of flags */ +#define _DESC_SYSTEM(code) (code) + +/* + * High-level interface mapping intended usage to low-level combinations + * of flags + */ + +#define _DESC_DATA (_DESC_S | _DESC_PRESENT | _DESC_ACCESSED | \ + _DESC_DATA_WRITABLE) +#define _DESC_CODE (_DESC_S | _DESC_PRESENT | _DESC_ACCESSED | \ + _DESC_CODE_READABLE | _DESC_CODE_EXECUTABLE) + +#define DESC_DATA16 (_DESC_DATA) +#define DESC_CODE16 (_DESC_CODE) + +#define DESC_DATA32 (_DESC_DATA | _DESC_GRANULARITY_4K | _DESC_DB) +#define DESC_DATA32_BIOS (_DESC_DATA | _DESC_DB) + +#define DESC_CODE32 (_DESC_CODE | _DESC_GRANULARITY_4K | _DESC_DB) +#define DESC_CODE32_BIOS (_DESC_CODE | _DESC_DB) + +#define DESC_TSS32 (_DESC_SYSTEM(9) | _DESC_PRESENT) + +#define DESC_DATA64 (_DESC_DATA | _DESC_GRANULARITY_4K | _DESC_DB) +#define DESC_CODE64 (_DESC_CODE | _DESC_GRANULARITY_4K | _DESC_LONG_CODE) + +#define DESC_USER (_DESC_DPL(3)) + #ifndef __ASSEMBLY__ #include <linux/types.h> @@ -22,19 +72,19 @@ struct desc_struct { #define GDT_ENTRY_INIT(flags, base, limit) \ { \ - .limit0 = (u16) (limit), \ - .limit1 = ((limit) >> 16) & 0x0F, \ - .base0 = (u16) (base), \ - .base1 = ((base) >> 16) & 0xFF, \ - .base2 = ((base) >> 24) & 0xFF, \ - .type = (flags & 0x0f), \ - .s = (flags >> 4) & 0x01, \ - .dpl = (flags >> 5) & 0x03, \ - .p = (flags >> 7) & 0x01, \ - .avl = (flags >> 12) & 0x01, \ - .l = (flags >> 13) & 0x01, \ - .d = (flags >> 14) & 0x01, \ - .g = (flags >> 15) & 0x01, \ + .limit0 = ((limit) >> 0) & 0xFFFF, \ + 
.limit1 = ((limit) >> 16) & 0x000F, \ + .base0 = ((base) >> 0) & 0xFFFF, \ + .base1 = ((base) >> 16) & 0x00FF, \ + .base2 = ((base) >> 24) & 0x00FF, \ + .type = ((flags) >> 0) & 0x000F, \ + .s = ((flags) >> 4) & 0x0001, \ + .dpl = ((flags) >> 5) & 0x0003, \ + .p = ((flags) >> 7) & 0x0001, \ + .avl = ((flags) >> 12) & 0x0001, \ + .l = ((flags) >> 13) & 0x0001, \ + .d = ((flags) >> 14) & 0x0001, \ + .g = ((flags) >> 15) & 0x0001, \ } enum { @@ -94,6 +144,7 @@ struct gate_struct { typedef struct gate_struct gate_desc; +#ifndef _SETUP static inline unsigned long gate_offset(const gate_desc *g) { #ifdef CONFIG_X86_64 @@ -108,6 +159,7 @@ static inline unsigned long gate_segment(const gate_desc *g) { return g->segment; } +#endif struct desc_ptr { unsigned short size; diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index b0994ae3bc23..c4555b269a1b 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -91,19 +91,6 @@ static inline void efi_fpu_end(void) #ifdef CONFIG_X86_32 #define EFI_X86_KERNEL_ALLOC_LIMIT (SZ_512M - 1) - -#define arch_efi_call_virt_setup() \ -({ \ - efi_fpu_begin(); \ - firmware_restrict_branch_speculation_start(); \ -}) - -#define arch_efi_call_virt_teardown() \ -({ \ - firmware_restrict_branch_speculation_end(); \ - efi_fpu_end(); \ -}) - #else /* !CONFIG_X86_32 */ #define EFI_X86_KERNEL_ALLOC_LIMIT EFI_ALLOC_LIMIT @@ -116,14 +103,6 @@ extern bool efi_disable_ibt_for_runtime; __efi_call(__VA_ARGS__); \ }) -#define arch_efi_call_virt_setup() \ -({ \ - efi_sync_low_kernel_mappings(); \ - efi_fpu_begin(); \ - firmware_restrict_branch_speculation_start(); \ - efi_enter_mm(); \ -}) - #undef arch_efi_call_virt #define arch_efi_call_virt(p, f, args...) ({ \ u64 ret, ibt = ibt_save(efi_disable_ibt_for_runtime); \ @@ -132,13 +111,6 @@ extern bool efi_disable_ibt_for_runtime; ret; \ }) -#define arch_efi_call_virt_teardown() \ -({ \ - efi_leave_mm(); \ - firmware_restrict_branch_speculation_end(); \ - efi_fpu_end(); \ -}) - #ifdef CONFIG_KASAN /* * CONFIG_KASAN may redefine memset to __memset. __memset function is present @@ -168,8 +140,8 @@ extern void efi_delete_dummy_variable(void); extern void efi_crash_gracefully_on_page_fault(unsigned long phys_addr); extern void efi_free_boot_services(void); -void efi_enter_mm(void); -void efi_leave_mm(void); +void arch_efi_call_virt_setup(void); +void arch_efi_call_virt_teardown(void); /* kexec external ABI */ struct efi_setup_data { diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 18fd06f7936a..1e16bd5ac781 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -7,6 +7,7 @@ */ #include <linux/thread_info.h> +#include <asm/ia32.h> #include <asm/ptrace.h> #include <asm/user.h> #include <asm/auxvec.h> @@ -149,7 +150,7 @@ do { \ ((x)->e_machine == EM_X86_64) #define compat_elf_check_arch(x) \ - (elf_check_arch_ia32(x) || \ + ((elf_check_arch_ia32(x) && ia32_enabled_verbose()) || \ (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64)) static inline void elf_common_init(struct thread_struct *t, diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h index 991e31cfde94..fe6312045042 100644 --- a/arch/x86/include/asm/extable_fixup_types.h +++ b/arch/x86/include/asm/extable_fixup_types.h @@ -4,7 +4,7 @@ /* * Our IMM is signed, as such it must live at the top end of the word. 
Also, - * since C99 hex constants are of ambigious type, force cast the mask to 'int' + * since C99 hex constants are of ambiguous type, force cast the mask to 'int' * so that FIELD_GET() will DTRT and sign extend the value when it extracts it. */ #define EX_DATA_TYPE_MASK ((int)0x000000FF) diff --git a/arch/x86/include/asm/fb.h b/arch/x86/include/asm/fb.h index 23873da8fb77..c3b9582de7ef 100644 --- a/arch/x86/include/asm/fb.h +++ b/arch/x86/include/asm/fb.h @@ -2,12 +2,14 @@ #ifndef _ASM_X86_FB_H #define _ASM_X86_FB_H +#include <asm/page.h> + struct fb_info; -struct file; -struct vm_area_struct; -void fb_pgprotect(struct file *file, struct vm_area_struct *vma, unsigned long off); -#define fb_pgprotect fb_pgprotect +pgprot_t pgprot_framebuffer(pgprot_t prot, + unsigned long vm_start, unsigned long vm_end, + unsigned long offset); +#define pgprot_framebuffer pgprot_framebuffer int fb_is_primary_device(struct fb_info *info); #define fb_is_primary_device fb_is_primary_device diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 31089b851c4f..a2be3aefff9f 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -157,7 +157,8 @@ static inline void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) { static inline void fpu_sync_guest_vmexit_xfd_state(void) { } #endif -extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u32 pkru); +extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, + unsigned int size, u64 xfeatures, u32 pkru); extern int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru); static inline void fpstate_set_confidential(struct fpu_guest *gfpu) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index eb810074f1e7..ace9aa3b78a3 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -5,6 +5,8 @@ #ifndef _ASM_X86_FPU_H #define _ASM_X86_FPU_H +#include <asm/page_types.h> + /* * The legacy x87 FPU state format, as saved by FSAVE and * restored by the FRSTOR instructions: @@ -415,7 +417,7 @@ struct fpu_state_perm { * * This master permission field is only to be used when * task.fpu.fpstate based checks fail to validate whether the task - * is allowed to expand it's xfeatures set which requires to + * is allowed to expand its xfeatures set which requires to * allocate a larger sized fpstate buffer. * * Do not access this field directly. 
Use the provided helper diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 551829884734..b02c3cd3c0f6 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -28,7 +28,7 @@ #include <asm/irq.h> #include <asm/sections.h> -#ifdef CONFIG_X86_LOCAL_APIC +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY struct irq_data; struct pci_dev; struct msi_desc; @@ -105,10 +105,10 @@ static inline void irq_complete_move(struct irq_cfg *c) { } #endif extern void apic_ack_edge(struct irq_data *data); -#else /* CONFIG_X86_LOCAL_APIC */ +#else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ static inline void lock_vector_lock(void) {} static inline void unlock_vector_lock(void) {} -#endif /* CONFIG_X86_LOCAL_APIC */ +#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ /* Statistics */ extern atomic_t irq_err_count; diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index 637fa1df3512..c715097e92fd 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -69,6 +69,8 @@ struct legacy_pic { void (*make_irq)(unsigned int irq); }; +void legacy_pic_pcat_compat(void); + extern struct legacy_pic *legacy_pic; extern struct legacy_pic null_legacy_pic; diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index fada857f0a1e..c7ef6ea2fa99 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -2,7 +2,6 @@ #ifndef _ASM_X86_IA32_H #define _ASM_X86_IA32_H - #ifdef CONFIG_IA32_EMULATION #include <linux/compat.h> @@ -68,6 +67,37 @@ extern void ia32_pick_mmap_layout(struct mm_struct *mm); #endif -#endif /* CONFIG_IA32_EMULATION */ +extern bool __ia32_enabled; + +static inline bool ia32_enabled(void) +{ + return __ia32_enabled; +} + +static inline void ia32_disable(void) +{ + __ia32_enabled = false; +} + +#else /* !CONFIG_IA32_EMULATION */ + +static inline bool ia32_enabled(void) +{ + return IS_ENABLED(CONFIG_X86_32); +} + +static inline void ia32_disable(void) {} + +#endif + +static inline bool ia32_enabled_verbose(void) +{ + bool enabled = ia32_enabled(); + + if (IS_ENABLED(CONFIG_IA32_EMULATION) && !enabled) + pr_notice_once("32-bit emulation disabled. 
You can reenable with ia32_emulation=on\n"); + + return enabled; +} #endif /* _ASM_X86_IA32_H */ diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 05fd175cec7d..13639e57e1f8 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -569,6 +569,10 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_UD, exc_invalid_op); DECLARE_IDTENTRY_RAW(X86_TRAP_BP, exc_int3); DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_PF, exc_page_fault); +#if defined(CONFIG_IA32_EMULATION) +DECLARE_IDTENTRY_RAW(IA32_SYSCALL_VECTOR, int80_emulation); +#endif + #ifdef CONFIG_X86_MCE #ifdef CONFIG_X86_64 DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check); diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 5f1d3c421f68..cc9ccf61b6bd 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_INIT_H #define _ASM_X86_INIT_H +#define __head __section(".head.text") + struct x86_mapping_info { void *(*alloc_pgt_page)(void *); /* allocate buf for page table */ void *context; /* context for alloc_pgt_page */ diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 5fcd85fd64fd..b65e9c46b922 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -27,6 +27,7 @@ * _X - regular server parts * _D - micro server parts * _N,_P - other mobile parts + * _H - premium mobile parts * _S - other client parts * * Historical OPTDIFFs: @@ -124,6 +125,7 @@ #define INTEL_FAM6_METEORLAKE 0xAC #define INTEL_FAM6_METEORLAKE_L 0xAA +#define INTEL_FAM6_ARROWLAKE_H 0xC5 #define INTEL_FAM6_ARROWLAKE 0xC6 #define INTEL_FAM6_LUNARLAKE_M 0xBD @@ -160,6 +162,8 @@ #define INTEL_FAM6_ATOM_CRESTMONT_X 0xAF /* Sierra Forest */ #define INTEL_FAM6_ATOM_CRESTMONT 0xB6 /* Grand Ridge */ +#define INTEL_FAM6_ATOM_DARKMONT_X 0xDD /* Clearwater Forest */ + /* Xeon Phi */ #define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 76238842406a..3814a9263d64 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -242,7 +242,7 @@ static inline void slow_down_io(void) #endif -#define BUILDIO(bwl, bw, type) \ +#define BUILDIO(bwl, type) \ static inline void out##bwl##_p(type value, u16 port) \ { \ out##bwl(value, port); \ @@ -288,9 +288,9 @@ static inline void ins##bwl(u16 port, void *addr, unsigned long count) \ } \ } -BUILDIO(b, b, u8) -BUILDIO(w, w, u16) -BUILDIO(l, , u32) +BUILDIO(b, u8) +BUILDIO(w, u16) +BUILDIO(l, u32) #undef BUILDIO #define inb_p inb_p diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h index a1911fea8739..af7541c11821 100644 --- a/arch/x86/include/asm/iosf_mbi.h +++ b/arch/x86/include/asm/iosf_mbi.h @@ -111,7 +111,7 @@ int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask); * This function will block all kernel access to the PMIC I2C bus, so that the * P-Unit can safely access the PMIC over the shared I2C bus. * - * Note on these systems the i2c-bus driver will request a sempahore from the + * Note on these systems the i2c-bus driver will request a semaphore from the * P-Unit for exclusive access to the PMIC bus when i2c drivers are accessing * it, but this does not appear to be sufficient, we still need to avoid making * certain P-Unit requests during the access window to avoid problems. 
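As a rough usage sketch (not part of the patch itself), the ia32_enabled()/ia32_enabled_verbose() helpers added to asm/ia32.h above could be used to gate 32-bit compat paths as follows; setup_compat_layer() is a hypothetical caller, shown only for illustration:

#include <linux/errno.h>
#include <asm/ia32.h>

/* Hypothetical caller, for illustration only -- not taken from the patch. */
static int setup_compat_layer(void)
{
	/* Quietly skip 32-bit-only setup when IA-32 emulation is unavailable. */
	if (!ia32_enabled())
		return -ENOSYS;

	/* ... 32-bit specific setup would go here ... */
	return 0;
}

ELF loading uses the verbose variant instead, so a one-time hint ("32-bit emulation disabled. You can reenable with ia32_emulation=on") is printed when a 32-bit binary is rejected; see the compat_elf_check_arch() change in asm/elf.h earlier in this diff.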
diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h index 800ffce0db29..6b4d36c95165 100644 --- a/arch/x86/include/asm/irq_work.h +++ b/arch/x86/include/asm/irq_work.h @@ -9,7 +9,6 @@ static inline bool arch_irq_work_has_interrupt(void) { return boot_cpu_has(X86_FEATURE_APIC); } -extern void arch_irq_work_raise(void); #else static inline bool arch_irq_work_has_interrupt(void) { diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 071572e23d3a..cbbef32517f0 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -24,7 +24,7 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("1:" + asm goto("1:" "jmp %l[l_yes] # objtool NOPs this \n\t" JUMP_TABLE_ENTRY : : "i" (key), "i" (2 | branch) : : l_yes); @@ -38,7 +38,7 @@ l_yes: static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { - asm_volatile_goto("1:" + asm goto("1:" ".byte " __stringify(BYTES_NOP5) "\n\t" JUMP_TABLE_ENTRY : : "i" (key), "i" (branch) : : l_yes); @@ -52,7 +52,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { - asm_volatile_goto("1:" + asm goto("1:" "jmp %l[l_yes]\n\t" JUMP_TABLE_ENTRY : : "i" (key), "i" (branch) : : l_yes); diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 3be6a98751f0..c9f6a6c5de3c 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -205,8 +205,6 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image); #endif #endif -typedef void crash_vmclear_fn(void); -extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss; extern void kdump_nmi_shootdown_cpus(void); #ifdef CONFIG_CRASH_HOTPLUG diff --git a/arch/x86/include/asm/kmsan.h b/arch/x86/include/asm/kmsan.h index 8fa6ac0e2d76..d91b37f5b4bb 100644 --- a/arch/x86/include/asm/kmsan.h +++ b/arch/x86/include/asm/kmsan.h @@ -64,6 +64,7 @@ static inline bool kmsan_virt_addr_valid(void *addr) { unsigned long x = (unsigned long)addr; unsigned long y = x - __START_KERNEL_map; + bool ret; /* use the carry flag to determine if x was < __START_KERNEL_map */ if (unlikely(x > y)) { @@ -79,7 +80,21 @@ static inline bool kmsan_virt_addr_valid(void *addr) return false; } - return pfn_valid(x >> PAGE_SHIFT); + /* + * pfn_valid() relies on RCU, and may call into the scheduler on exiting + * the critical section. However, this would result in recursion with + * KMSAN. Therefore, disable preemption here, and re-enable preemption + * below while suppressing reschedules to avoid recursion. + * + * Note, this sacrifices occasionally breaking scheduling guarantees. + * Although, a kernel compiled with KMSAN has already given up on any + * performance guarantees due to being heavily instrumented. 
+ */ + preempt_disable(); + ret = pfn_valid(x >> PAGE_SHIFT); + preempt_enable_no_resched(); + + return ret; } #endif /* !MODULE */ diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index a2e9317aad49..5939694dfb28 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -113,8 +113,6 @@ struct kprobe_ctlblk { }; extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr); -extern int kprobe_exceptions_notify(struct notifier_block *self, - unsigned long val, void *data); extern int kprobe_int3_handler(struct pt_regs *regs); #else diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index e3054e3e46d5..378ed944b849 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -55,8 +55,10 @@ KVM_X86_OP(set_rflags) KVM_X86_OP(get_if_flag) KVM_X86_OP(flush_tlb_all) KVM_X86_OP(flush_tlb_current) +#if IS_ENABLED(CONFIG_HYPERV) KVM_X86_OP_OPTIONAL(flush_remote_tlbs) KVM_X86_OP_OPTIONAL(flush_remote_tlbs_range) +#endif KVM_X86_OP(flush_tlb_gva) KVM_X86_OP(flush_tlb_guest) KVM_X86_OP(vcpu_pre_run) @@ -108,6 +110,7 @@ KVM_X86_OP_OPTIONAL(vcpu_blocking) KVM_X86_OP_OPTIONAL(vcpu_unblocking) KVM_X86_OP_OPTIONAL(pi_update_irte) KVM_X86_OP_OPTIONAL(pi_start_assignment) +KVM_X86_OP_OPTIONAL(apicv_pre_state_restore) KVM_X86_OP_OPTIONAL(apicv_post_state_restore) KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt) KVM_X86_OP_OPTIONAL(set_hv_timer) @@ -126,7 +129,7 @@ KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from) KVM_X86_OP_OPTIONAL(vm_move_enc_context_from) KVM_X86_OP_OPTIONAL(guest_memory_reclaimed) KVM_X86_OP(get_msr_feature) -KVM_X86_OP(can_emulate_instruction) +KVM_X86_OP(check_emulate_instruction) KVM_X86_OP(apic_init_signal_blocked) KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush) KVM_X86_OP_OPTIONAL(migrate_timers) @@ -134,6 +137,7 @@ KVM_X86_OP(msr_filter_changed) KVM_X86_OP(complete_emulated_msr) KVM_X86_OP(vcpu_deliver_sipi_vector) KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons); +KVM_X86_OP_OPTIONAL(get_untagged_addr) #undef KVM_X86_OP #undef KVM_X86_OP_OPTIONAL diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h index 6c98f4bb4228..058bc636356a 100644 --- a/arch/x86/include/asm/kvm-x86-pmu-ops.h +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h @@ -22,7 +22,7 @@ KVM_X86_PMU_OP(get_msr) KVM_X86_PMU_OP(set_msr) KVM_X86_PMU_OP(refresh) KVM_X86_PMU_OP(init) -KVM_X86_PMU_OP(reset) +KVM_X86_PMU_OP_OPTIONAL(reset) KVM_X86_PMU_OP_OPTIONAL(deliver_pmi) KVM_X86_PMU_OP_OPTIONAL(cleanup) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3bc146dfd38d..d271ba20a0b2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -39,7 +39,15 @@ #define __KVM_HAVE_ARCH_VCPU_DEBUGFS +/* + * CONFIG_KVM_MAX_NR_VCPUS is defined iff CONFIG_KVM!=n, provide a dummy max if + * KVM is disabled (arbitrarily use the default from CONFIG_KVM_MAX_NR_VCPUS). 
+ */ +#ifdef CONFIG_KVM_MAX_NR_VCPUS +#define KVM_MAX_VCPUS CONFIG_KVM_MAX_NR_VCPUS +#else #define KVM_MAX_VCPUS 1024 +#endif /* * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs @@ -125,7 +133,8 @@ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \ - | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP)) + | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP \ + | X86_CR4_LAM_SUP)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) @@ -288,13 +297,13 @@ struct kvm_kernel_irq_routing_entry; * kvm_mmu_page_role tracks the properties of a shadow page (where shadow page * also includes TDP pages) to determine whether or not a page can be used in * the given MMU context. This is a subset of the overall kvm_cpu_role to - * minimize the size of kvm_memory_slot.arch.gfn_track, i.e. allows allocating - * 2 bytes per gfn instead of 4 bytes per gfn. + * minimize the size of kvm_memory_slot.arch.gfn_write_track, i.e. allows + * allocating 2 bytes per gfn instead of 4 bytes per gfn. * * Upper-level shadow pages having gptes are tracked for write-protection via - * gfn_track. As above, gfn_track is a 16 bit counter, so KVM must not create - * more than 2^16-1 upper-level shadow pages at a single gfn, otherwise - * gfn_track will overflow and explosions will ensure. + * gfn_write_track. As above, gfn_write_track is a 16 bit counter, so KVM must + * not create more than 2^16-1 upper-level shadow pages at a single gfn, + * otherwise gfn_write_track will overflow and explosions will ensue. * * A unique shadow page (SP) for a gfn is created if and only if an existing SP * cannot be reused. The ability to reuse a SP is tracked by its role, which @@ -492,8 +501,23 @@ struct kvm_pmc { u8 idx; bool is_paused; bool intr; + /* + * Base value of the PMC counter, relative to the *consumed* count in + * the associated perf_event. This value includes counter updates from + * the perf_event and emulated_count since the last time the counter + * was reprogrammed, but it is *not* the current value as seen by the + * guest or userspace. + * + * The count is relative to the associated perf_event so that KVM + * doesn't need to reprogram the perf_event every time the guest writes + * to the counter. + */ u64 counter; - u64 prev_counter; + /* + * PMC events triggered by KVM emulation that haven't been fully + * processed, i.e. haven't undergone overflow detection. 
+ */ + u64 emulated_counter; u64 eventsel; struct perf_event *perf_event; struct kvm_vcpu *vcpu; @@ -528,7 +552,6 @@ struct kvm_pmu { u64 raw_event_mask; struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC]; struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED]; - struct irq_work irq_work; /* * Overlay the bitmap with a 64-bit atomic so that all bits can be @@ -680,6 +703,7 @@ struct kvm_hypervisor_cpuid { u32 limit; }; +#ifdef CONFIG_KVM_XEN /* Xen HVM per vcpu emulation context */ struct kvm_vcpu_xen { u64 hypercall_rip; @@ -702,6 +726,7 @@ struct kvm_vcpu_xen { struct timer_list poll_timer; struct kvm_hypervisor_cpuid cpuid; }; +#endif struct kvm_queued_exception { bool pending; @@ -746,7 +771,6 @@ struct kvm_vcpu_arch { u64 smi_count; bool at_instruction_boundary; bool tpr_access_reporting; - bool xsaves_enabled; bool xfd_no_write_intercept; u64 ia32_xss; u64 microcode_version; @@ -831,6 +855,25 @@ struct kvm_vcpu_arch { struct kvm_cpuid_entry2 *cpuid_entries; struct kvm_hypervisor_cpuid kvm_cpuid; + /* + * FIXME: Drop this macro and use KVM_NR_GOVERNED_FEATURES directly + * when "struct kvm_vcpu_arch" is no longer defined in an + * arch/x86/include/asm header. The max is mostly arbitrary, i.e. + * can be increased as necessary. + */ +#define KVM_MAX_NR_GOVERNED_FEATURES BITS_PER_LONG + + /* + * Track whether or not the guest is allowed to use features that are + * governed by KVM, where "governed" means KVM needs to manage state + * and/or explicitly enable the feature in hardware. Typically, but + * not always, governed features can be used by the guest if and only + * if both KVM and userspace want to expose the feature to the guest. + */ + struct { + DECLARE_BITMAP(enabled, KVM_MAX_NR_GOVERNED_FEATURES); + } governed_features; + u64 reserved_gpa_bits; int maxphyaddr; @@ -910,10 +953,13 @@ struct kvm_vcpu_arch { /* used for guest single stepping over the given code position */ unsigned long singlestep_rip; +#ifdef CONFIG_KVM_HYPERV bool hyperv_enabled; struct kvm_vcpu_hv *hyperv; +#endif +#ifdef CONFIG_KVM_XEN struct kvm_vcpu_xen xen; - +#endif cpumask_var_t wbinvd_dirty_mask; unsigned long last_retry_eip; @@ -1005,7 +1051,7 @@ struct kvm_lpage_info { struct kvm_arch_memory_slot { struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES]; struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; - unsigned short *gfn_track[KVM_PAGE_TRACK_MAX]; + unsigned short *gfn_write_track; }; /* @@ -1067,6 +1113,7 @@ enum hv_tsc_page_status { HV_TSC_PAGE_BROKEN, }; +#ifdef CONFIG_KVM_HYPERV /* Hyper-V emulation context */ struct kvm_hv { struct mutex hv_lock; @@ -1097,9 +1144,11 @@ struct kvm_hv { */ unsigned int synic_auto_eoi_used; - struct hv_partition_assist_pg *hv_pa_pg; struct kvm_hv_syndbg hv_syndbg; + + bool xsaves_xsavec_checked; }; +#endif struct msr_bitmap_range { u32 flags; @@ -1108,6 +1157,7 @@ struct msr_bitmap_range { unsigned long *bitmap; }; +#ifdef CONFIG_KVM_XEN /* Xen emulation context */ struct kvm_xen { struct mutex xen_lock; @@ -1119,6 +1169,7 @@ struct kvm_xen { struct idr evtchn_ports; unsigned long poll_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)]; }; +#endif enum kvm_irqchip_mode { KVM_IRQCHIP_NONE, @@ -1227,6 +1278,7 @@ enum kvm_apicv_inhibit { }; struct kvm_arch { + unsigned long vm_type; unsigned long n_used_mmu_pages; unsigned long n_requested_mmu_pages; unsigned long n_max_mmu_pages; @@ -1247,8 +1299,9 @@ struct kvm_arch { * create an NX huge page (without hanging the guest). 
*/ struct list_head possible_nx_huge_pages; - struct kvm_page_track_notifier_node mmu_sp_tracker; +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING struct kvm_page_track_notifier_head track_notifier_head; +#endif /* * Protects marking pages unsync during page faults, as TDP MMU page * faults only take mmu_lock for read. For simplicity, the unsync @@ -1257,7 +1310,6 @@ struct kvm_arch { */ spinlock_t mmu_unsync_pages_lock; - struct list_head assigned_dev_head; struct iommu_domain *iommu_domain; bool iommu_noncoherent; #define __KVM_HAVE_ARCH_NONCOHERENT_DMA @@ -1305,6 +1357,7 @@ struct kvm_arch { int nr_vcpus_matched_tsc; u32 default_tsc_khz; + bool user_set_tsc; seqcount_raw_spinlock_t pvclock_sc; bool use_master_clock; @@ -1318,8 +1371,13 @@ struct kvm_arch { /* reads protected by irq_srcu, writes by irq_lock */ struct hlist_head mask_notifier_list; +#ifdef CONFIG_KVM_HYPERV struct kvm_hv hyperv; +#endif + +#ifdef CONFIG_KVM_XEN struct kvm_xen xen; +#endif bool backwards_tsc_observed; bool boot_vcpu_runs_old_kvmclock; @@ -1377,9 +1435,8 @@ struct kvm_arch { * the MMU lock in read mode + RCU or * the MMU lock in write mode * - * For writes, this list is protected by: - * the MMU lock in read mode + the tdp_mmu_pages_lock or - * the MMU lock in write mode + * For writes, this list is protected by tdp_mmu_pages_lock; see + * below for the details. * * Roots will remain in the list until their tdp_mmu_root_count * drops to zero, at which point the thread that decremented the @@ -1396,11 +1453,12 @@ struct kvm_arch { * - possible_nx_huge_pages; * - the possible_nx_huge_page_link field of kvm_mmu_page structs used * by the TDP MMU - * It is acceptable, but not necessary, to acquire this lock when - * the thread holds the MMU lock in write mode. + * Because the lock is only taken within the MMU lock, strictly + * speaking it is redundant to acquire this lock when the thread + * holds the MMU lock in write mode. However it often simplifies + * the code to do so. */ spinlock_t tdp_mmu_pages_lock; - struct workqueue_struct *tdp_mmu_zap_wq; #endif /* CONFIG_X86_64 */ /* @@ -1413,6 +1471,7 @@ struct kvm_arch { #if IS_ENABLED(CONFIG_HYPERV) hpa_t hv_root_tdp; spinlock_t hv_root_tdp_lock; + struct hv_partition_assist_pg *hv_pa_pg; #endif /* * VM-scope maximum vCPU ID. Used to determine the size of structures @@ -1585,9 +1644,11 @@ struct kvm_x86_ops { void (*flush_tlb_all)(struct kvm_vcpu *vcpu); void (*flush_tlb_current)(struct kvm_vcpu *vcpu); +#if IS_ENABLED(CONFIG_HYPERV) int (*flush_remote_tlbs)(struct kvm *kvm); int (*flush_remote_tlbs_range)(struct kvm *kvm, gfn_t gfn, gfn_t nr_pages); +#endif /* * Flush any TLB entries associated with the given GVA. @@ -1624,7 +1685,7 @@ struct kvm_x86_ops { /* Whether or not a virtual NMI is pending in hardware. */ bool (*is_vnmi_pending)(struct kvm_vcpu *vcpu); /* - * Attempt to pend a virtual NMI in harware. Returns %true on success + * Attempt to pend a virtual NMI in hardware. Returns %true on success * to allow using static_call_ret0 as the fallback. */ bool (*set_vnmi_pending)(struct kvm_vcpu *vcpu); @@ -1655,8 +1716,8 @@ struct kvm_x86_ops { u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu); u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu); - void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); - void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu, u64 multiplier); + void (*write_tsc_offset)(struct kvm_vcpu *vcpu); + void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu); /* * Retrieve somewhat arbitrary exit information. 
Intended to @@ -1674,7 +1735,7 @@ struct kvm_x86_ops { void (*request_immediate_exit)(struct kvm_vcpu *vcpu); - void (*sched_in)(struct kvm_vcpu *kvm, int cpu); + void (*sched_in)(struct kvm_vcpu *vcpu, int cpu); /* * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A zero @@ -1691,6 +1752,7 @@ struct kvm_x86_ops { int (*pi_update_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void (*pi_start_assignment)(struct kvm *kvm); + void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu); @@ -1716,8 +1778,8 @@ struct kvm_x86_ops { int (*get_msr_feature)(struct kvm_msr_entry *entry); - bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type, - void *insn, int insn_len); + int (*check_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len); bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); int (*enable_l2_tlb_flush)(struct kvm_vcpu *vcpu); @@ -1732,6 +1794,8 @@ struct kvm_x86_ops { * Returns vCPU specific APICv inhibit reasons */ unsigned long (*vcpu_get_apicv_inhibit_reasons)(struct kvm_vcpu *vcpu); + + gva_t (*get_untagged_addr)(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags); }; struct kvm_x86_nested_ops { @@ -1795,8 +1859,9 @@ static inline struct kvm *kvm_arch_alloc_vm(void) #define __KVM_HAVE_ARCH_VM_FREE void kvm_arch_free_vm(struct kvm *kvm); -#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB -static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) +#if IS_ENABLED(CONFIG_HYPERV) +#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS +static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm) { if (kvm_x86_ops.flush_remote_tlbs && !static_call(kvm_x86_flush_remote_tlbs)(kvm)) @@ -1805,6 +1870,17 @@ static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) return -ENOTSUPP; } +#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE +static inline int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, + u64 nr_pages) +{ + if (!kvm_x86_ops.flush_remote_tlbs_range) + return -EOPNOTSUPP; + + return static_call(kvm_x86_flush_remote_tlbs_range)(kvm, gfn, nr_pages); +} +#endif /* CONFIG_HYPERV */ + #define kvm_arch_pmi_in_guest(vcpu) \ ((vcpu) && (vcpu)->arch.handling_intr_from_guest) @@ -1814,9 +1890,12 @@ void kvm_mmu_vendor_module_exit(void); void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); -int kvm_mmu_init_vm(struct kvm *kvm); +void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); +void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm, + struct kvm_memory_slot *slot); + void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, @@ -1833,7 +1912,6 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, const struct kvm_memory_slot *memslot); -void kvm_mmu_zap_all(struct kvm *kvm); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages); @@ -2056,6 +2134,12 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd); void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, int tdp_max_root_level, int tdp_huge_page_level); +#ifdef CONFIG_KVM_PRIVATE_MEM +#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.vm_type != KVM_X86_DEFAULT_VM) +#else 
+#define kvm_arch_has_private_mem(kvm) false +#endif + static inline u16 kvm_read_ldt(void) { u16 ldt; @@ -2103,16 +2187,15 @@ enum { #define HF_SMM_MASK (1 << 1) #define HF_SMM_INSIDE_NMI_MASK (1 << 2) -# define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE -# define KVM_ADDRESS_SPACE_NUM 2 +# define KVM_MAX_NR_ADDRESS_SPACES 2 +/* SMM is currently unsupported for guests with private memory. */ +# define kvm_arch_nr_memslot_as_ids(kvm) (kvm_arch_has_private_mem(kvm) ? 1 : 2) # define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) # define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) #else # define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, 0) #endif -#define KVM_ARCH_WANT_MMU_NOTIFIER - int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_cpu_has_extint(struct kvm_vcpu *v); diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h index eb186bc57f6a..3d040741044b 100644 --- a/arch/x86/include/asm/kvm_page_track.h +++ b/arch/x86/include/asm/kvm_page_track.h @@ -2,11 +2,9 @@ #ifndef _ASM_X86_KVM_PAGE_TRACK_H #define _ASM_X86_KVM_PAGE_TRACK_H -enum kvm_page_track_mode { - KVM_PAGE_TRACK_WRITE, - KVM_PAGE_TRACK_MAX, -}; +#include <linux/kvm_types.h> +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING /* * The notifier represented by @kvm_page_track_notifier_node is linked into * the head which will be notified when guest is triggering the track event. @@ -26,54 +24,39 @@ struct kvm_page_track_notifier_node { * It is called when guest is writing the write-tracked page * and write emulation is finished at that time. * - * @vcpu: the vcpu where the write access happened. * @gpa: the physical address written by guest. * @new: the data was written to the address. * @bytes: the written length. * @node: this node */ - void (*track_write)(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, - int bytes, struct kvm_page_track_notifier_node *node); + void (*track_write)(gpa_t gpa, const u8 *new, int bytes, + struct kvm_page_track_notifier_node *node); + /* - * It is called when memory slot is being moved or removed - * users can drop write-protection for the pages in that memory slot + * Invoked when a memory region is removed from the guest. Or in KVM + * terms, when a memslot is deleted. 
* - * @kvm: the kvm where memory slot being moved or removed - * @slot: the memory slot being moved or removed - * @node: this node + * @gfn: base gfn of the region being removed + * @nr_pages: number of pages in the to-be-removed region + * @node: this node */ - void (*track_flush_slot)(struct kvm *kvm, struct kvm_memory_slot *slot, - struct kvm_page_track_notifier_node *node); + void (*track_remove_region)(gfn_t gfn, unsigned long nr_pages, + struct kvm_page_track_notifier_node *node); }; -int kvm_page_track_init(struct kvm *kvm); -void kvm_page_track_cleanup(struct kvm *kvm); +int kvm_page_track_register_notifier(struct kvm *kvm, + struct kvm_page_track_notifier_node *n); +void kvm_page_track_unregister_notifier(struct kvm *kvm, + struct kvm_page_track_notifier_node *n); -bool kvm_page_track_write_tracking_enabled(struct kvm *kvm); -int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot); - -void kvm_page_track_free_memslot(struct kvm_memory_slot *slot); -int kvm_page_track_create_memslot(struct kvm *kvm, - struct kvm_memory_slot *slot, - unsigned long npages); - -void kvm_slot_page_track_add_page(struct kvm *kvm, - struct kvm_memory_slot *slot, gfn_t gfn, - enum kvm_page_track_mode mode); -void kvm_slot_page_track_remove_page(struct kvm *kvm, - struct kvm_memory_slot *slot, gfn_t gfn, - enum kvm_page_track_mode mode); -bool kvm_slot_page_track_is_active(struct kvm *kvm, - const struct kvm_memory_slot *slot, - gfn_t gfn, enum kvm_page_track_mode mode); +int kvm_write_track_add_gfn(struct kvm *kvm, gfn_t gfn); +int kvm_write_track_remove_gfn(struct kvm *kvm, gfn_t gfn); +#else +/* + * Allow defining a node in a structure even if page tracking is disabled, e.g. + * to play nice with testing headers via direct inclusion from the command line. + */ +struct kvm_page_track_notifier_node {}; +#endif /* CONFIG_KVM_EXTERNAL_WRITE_TRACKING */ -void -kvm_page_track_register_notifier(struct kvm *kvm, - struct kvm_page_track_notifier_node *n); -void -kvm_page_track_unregister_notifier(struct kvm *kvm, - struct kvm_page_track_notifier_node *n); -void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, - int bytes); -void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot); #endif diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 97a3de7892d3..571fe4d2d232 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -8,6 +8,14 @@ #undef notrace #define notrace __attribute__((no_instrument_function)) +#ifdef CONFIG_64BIT +/* + * The generic version tends to create spurious ENDBR instructions under + * certain conditions. + */ +#define _THIS_IP_ ({ unsigned long __here; asm ("lea 0(%%rip), %0" : "=r" (__here)); __here; }) +#endif + #ifdef CONFIG_X86_32 #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) #endif /* CONFIG_X86_32 */ @@ -97,6 +105,13 @@ CFI_POST_PADDING \ SYM_FUNC_END(__cfi_##name) +/* UML needs to be able to override memcpy() and friends for KASAN. 
*/ +#ifdef CONFIG_UML +# define SYM_FUNC_ALIAS_MEMFUNC SYM_FUNC_ALIAS_WEAK +#else +# define SYM_FUNC_ALIAS_MEMFUNC SYM_FUNC_ALIAS +#endif + /* SYM_TYPED_FUNC_START -- use for indirectly called globals, w/ CFI type */ #define SYM_TYPED_FUNC_START(name) \ SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_F_ALIGN) \ diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index 635132a12778..73dba8b94443 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -135,28 +135,27 @@ static inline bool local_try_cmpxchg(local_t *l, long *old, long new) #define local_xchg(l, n) (xchg(&((l)->a.counter), (n))) /** - * local_add_unless - add unless the number is a given value + * local_add_unless - add unless the number is already a given value * @l: pointer of type local_t * @a: the amount to add to l... * @u: ...unless l is equal to u. * - * Atomically adds @a to @l, so long as it was not @u. - * Returns non-zero if @l was not @u, and zero otherwise. + * Atomically adds @a to @l, if @v was not already @u. + * Returns true if the addition was done. */ -#define local_add_unless(l, a, u) \ -({ \ - long c, old; \ - c = local_read((l)); \ - for (;;) { \ - if (unlikely(c == (u))) \ - break; \ - old = local_cmpxchg((l), c, c + (a)); \ - if (likely(old == c)) \ - break; \ - c = old; \ - } \ - c != (u); \ -}) +static __always_inline bool +local_add_unless(local_t *l, long a, long u) +{ + long c = local_read(l); + + do { + if (unlikely(c == u)) + return false; + } while (!local_try_cmpxchg(l, &c, c + a)); + + return true; +} + #define local_inc_not_zero(l) local_add_unless((l), 1, 0) /* On x86_32, these are no better than the atomic variants. diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 180b1cbfcc4e..de3118305838 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -245,7 +245,7 @@ static inline void cmci_recheck(void) {} int mce_available(struct cpuinfo_x86 *c); bool mce_is_memory_error(struct mce *m); bool mce_is_correctable(struct mce *m); -int mce_usable_address(struct mce *m); +bool mce_usable_address(struct mce *m); DECLARE_PER_CPU(unsigned, mce_exception_count); DECLARE_PER_CPU(unsigned, mce_poll_count); @@ -311,6 +311,7 @@ enum smca_bank_types { SMCA_PIE, /* Power, Interrupts, etc. 
*/ SMCA_UMC, /* Unified Memory Controller */ SMCA_UMC_V2, + SMCA_MA_LLC, /* Memory Attached Last Level Cache */ SMCA_PB, /* Parameter Block */ SMCA_PSP, /* Platform Security Processor */ SMCA_PSP_V2, @@ -326,6 +327,8 @@ enum smca_bank_types { SMCA_SHUB, /* System HUB Unit */ SMCA_SATA, /* SATA Unit */ SMCA_USB, /* USB Unit */ + SMCA_USR_DP, /* Ultra Short Reach Data Plane Controller */ + SMCA_USR_CP, /* Ultra Short Reach Control Plane Controller */ SMCA_GMI_PCS, /* GMI PCS Unit */ SMCA_XGMI_PHY, /* xGMI PHY Unit */ SMCA_WAFL_PHY, /* WAFL PHY Unit */ @@ -333,7 +336,6 @@ enum smca_bank_types { N_SMCA_BANK_TYPES }; -extern const char *smca_get_long_name(enum smca_bank_types t); extern bool amd_mce_is_memory_error(struct mce *m); extern int mce_threshold_create_device(unsigned int cpu); diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 473b16d73b47..359ada486fa9 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -19,8 +19,10 @@ #ifdef CONFIG_X86_MEM_ENCRYPT void __init mem_encrypt_init(void); +void __init mem_encrypt_setup_arch(void); #else static inline void mem_encrypt_init(void) { } +static inline void __init mem_encrypt_setup_arch(void) { } #endif #ifdef CONFIG_AMD_MEM_ENCRYPT @@ -43,7 +45,6 @@ void __init sme_map_bootdata(char *real_mode_data); void __init sme_unmap_bootdata(char *real_mode_data); void __init sme_early_init(void); -void __init sev_setup_arch(void); void __init sme_encrypt_kernel(struct boot_params *bp); void __init sme_enable(struct boot_params *bp); @@ -73,7 +74,6 @@ static inline void __init sme_map_bootdata(char *real_mode_data) { } static inline void __init sme_unmap_bootdata(char *real_mode_data) { } static inline void __init sme_early_init(void) { } -static inline void __init sev_setup_arch(void) { } static inline void __init sme_encrypt_kernel(struct boot_params *bp) { } static inline void __init sme_enable(struct boot_params *bp) { } diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index bbbe9d744977..695e569159c1 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -23,6 +23,8 @@ static inline void load_ucode_ap(void) { } static inline void microcode_bsp_resume(void) { } #endif +extern unsigned long initrd_start_early; + #ifdef CONFIG_CPU_SUP_INTEL /* Intel specific microcode defines. 
Public for IFS */ struct microcode_header_intel { @@ -36,7 +38,8 @@ struct microcode_header_intel { unsigned int datasize; unsigned int totalsize; unsigned int metasize; - unsigned int reserved[2]; + unsigned int min_req_ver; + unsigned int reserved; }; struct microcode_intel { @@ -68,11 +71,19 @@ static inline u32 intel_get_microcode_revision(void) return rev; } +#endif /* !CONFIG_CPU_SUP_INTEL */ -void show_ucode_info_early(void); +bool microcode_nmi_handler(void); +void microcode_offline_nmi_handler(void); -#else /* CONFIG_CPU_SUP_INTEL */ -static inline void show_ucode_info_early(void) { } -#endif /* !CONFIG_CPU_SUP_INTEL */ +#ifdef CONFIG_MICROCODE_LATE_LOADING +DECLARE_STATIC_KEY_FALSE(microcode_nmi_handler_enable); +static __always_inline bool microcode_nmi_handler_enabled(void) +{ + return static_branch_unlikely(&microcode_nmi_handler_enable); +} +#else +static __always_inline bool microcode_nmi_handler_enabled(void) { return false; } +#endif #endif /* _ASM_X86_MICROCODE_H */ diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h new file mode 100644 index 000000000000..12b820259b9f --- /dev/null +++ b/arch/x86/include/asm/mman.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_MMAN_H__ +#define __ASM_MMAN_H__ + +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS +#define arch_calc_vm_prot_bits(prot, key) ( \ + ((key) & 0x1 ? VM_PKEY_BIT0 : 0) | \ + ((key) & 0x2 ? VM_PKEY_BIT1 : 0) | \ + ((key) & 0x4 ? VM_PKEY_BIT2 : 0) | \ + ((key) & 0x8 ? VM_PKEY_BIT3 : 0)) +#endif + +#include <uapi/asm/mman.h> + +#endif /* __ASM_MMAN_H__ */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 416901d406f8..8dac45a2c7fc 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -186,8 +186,7 @@ do { \ #else #define deactivate_mm(tsk, mm) \ do { \ - if (!tsk->vfork_done) \ - shstk_free(tsk); \ + shstk_free(tsk); \ load_gs_index(0); \ loadsegment(fs, 0); \ } while (0) diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index f46df8349e86..4b0f98a8d338 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -37,7 +37,7 @@ extern int mp_bus_id_to_type[MAX_MP_BUSSES]; extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); -extern unsigned int boot_cpu_physical_apicid; +extern u32 boot_cpu_physical_apicid; extern u8 boot_cpu_apic_version; #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 033b53f993c6..ce4ce8720d55 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -276,11 +276,11 @@ int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry); #ifdef CONFIG_AMD_MEM_ENCRYPT bool hv_ghcb_negotiate_protocol(void); void __noreturn hv_ghcb_terminate(unsigned int set, unsigned int reason); -int hv_snp_boot_ap(int cpu, unsigned long start_ip); +int hv_snp_boot_ap(u32 cpu, unsigned long start_ip); #else static inline bool hv_ghcb_negotiate_protocol(void) { return false; } static inline void hv_ghcb_terminate(unsigned int set, unsigned int reason) {} -static inline int hv_snp_boot_ap(int cpu, unsigned long start_ip) { return 0; } +static inline int hv_snp_boot_ap(u32 cpu, unsigned long start_ip) { return 0; } #endif #if defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST) @@ -340,8 +340,10 @@ static inline u64 hv_get_non_nested_register(unsigned int reg) { return 0; } #ifdef CONFIG_HYPERV_VTL_MODE void __init hv_vtl_init_platform(void);
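As an aside to the arch_calc_vm_prot_bits() macro added in <asm/mman.h> above: the snippet below is a standalone user-space sketch of the pkey-to-VM-flag mapping it encodes. The VM_PKEY_BIT* values and the helper name are illustrative stand-ins, not the kernel's definitions.

#include <stdio.h>

#define VM_PKEY_BIT0 (1ULL << 32)	/* illustrative stand-in values */
#define VM_PKEY_BIT1 (1ULL << 33)
#define VM_PKEY_BIT2 (1ULL << 34)
#define VM_PKEY_BIT3 (1ULL << 35)

static unsigned long long calc_vm_prot_bits_sketch(int key)
{
	/* Each bit of the 4-bit protection key selects one VM_PKEY_BIT* flag. */
	return ((key & 0x1) ? VM_PKEY_BIT0 : 0) |
	       ((key & 0x2) ? VM_PKEY_BIT1 : 0) |
	       ((key & 0x4) ? VM_PKEY_BIT2 : 0) |
	       ((key & 0x8) ? VM_PKEY_BIT3 : 0);
}

int main(void)
{
	/* pkey 5 (binary 0101) sets VM_PKEY_BIT0 and VM_PKEY_BIT2. */
	printf("pkey 5 -> vm_flags 0x%llx\n", calc_vm_prot_bits_sketch(5));
	return 0;
}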
+int __init hv_vtl_early_init(void); #else static inline void __init hv_vtl_init_platform(void) {} +static inline int __init hv_vtl_early_init(void) { return 0; } #endif #include <asm-generic/mshyperv.h> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1d111350197f..f1bd7b91b3c6 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -222,6 +222,7 @@ #define MSR_INTEGRITY_CAPS_ARRAY_BIST BIT(MSR_INTEGRITY_CAPS_ARRAY_BIST_BIT) #define MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT 4 #define MSR_INTEGRITY_CAPS_PERIODIC_BIST BIT(MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT) +#define MSR_INTEGRITY_CAPS_SAF_GEN_MASK GENMASK_ULL(10, 9) #define MSR_LBR_NHM_FROM 0x00000680 #define MSR_LBR_NHM_TO 0x000006c0 @@ -236,6 +237,11 @@ #define LBR_INFO_CYCLES 0xffff #define LBR_INFO_BR_TYPE_OFFSET 56 #define LBR_INFO_BR_TYPE (0xfull << LBR_INFO_BR_TYPE_OFFSET) +#define LBR_INFO_BR_CNTR_OFFSET 32 +#define LBR_INFO_BR_CNTR_NUM 4 +#define LBR_INFO_BR_CNTR_BITS 2 +#define LBR_INFO_BR_CNTR_MASK GENMASK_ULL(LBR_INFO_BR_CNTR_BITS - 1, 0) +#define LBR_INFO_BR_CNTR_FULL_MASK GENMASK_ULL(LBR_INFO_BR_CNTR_NUM * LBR_INFO_BR_CNTR_BITS - 1, 0) #define MSR_ARCH_LBR_CTL 0x000014ce #define ARCH_LBR_CTL_LBREN BIT(0) @@ -535,6 +541,9 @@ #define MSR_RELOAD_PMC0 0x000014c1 #define MSR_RELOAD_FIXED_CTR0 0x00001309 +/* KeyID partitioning between MKTME and TDX */ +#define MSR_IA32_MKTME_KEYID_PARTITIONING 0x00000087 + /* * AMD64 MSRs. Not complete. See the architecture manual for a more * complete list. @@ -553,6 +562,7 @@ #define MSR_AMD64_CPUID_FN_1 0xc0011004 #define MSR_AMD64_LS_CFG 0xc0011020 #define MSR_AMD64_DC_CFG 0xc0011022 +#define MSR_AMD64_TW_CFG 0xc0011023 #define MSR_AMD64_DE_CFG 0xc0011029 #define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT 1 @@ -637,12 +647,21 @@ /* AMD Last Branch Record MSRs */ #define MSR_AMD64_LBR_SELECT 0xc000010e +/* Zen4 */ +#define MSR_ZEN4_BP_CFG 0xc001102e +#define MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT 5 + +/* Fam 19h MSRs */ +#define MSR_F19H_UMC_PERF_CTL 0xc0010800 +#define MSR_F19H_UMC_PERF_CTR 0xc0010801 + +/* Zen 2 */ +#define MSR_ZEN2_SPECTRAL_CHICKEN 0xc00110e3 +#define MSR_ZEN2_SPECTRAL_CHICKEN_BIT BIT_ULL(1) + /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 -#define MSR_ZEN2_SPECTRAL_CHICKEN 0xc00110e3 -#define MSR_ZEN2_SPECTRAL_CHICKEN_BIT BIT_ULL(1) - /* Fam 16h MSRs */ #define MSR_F16H_L2I_PERF_CTL 0xc0010230 #define MSR_F16H_L2I_PERF_CTR 0xc0010231 @@ -1112,12 +1131,16 @@ #define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14) #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) #define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F -/* AMD-V MSRs */ +/* AMD-V MSRs */ #define MSR_VM_CR 0xc0010114 #define MSR_VM_IGNNE 0xc0010115 #define MSR_VM_HSAVE_PA 0xc0010117 +#define SVM_VM_CR_VALID_MASK 0x001fULL +#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL +#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL + /* Hardware Feedback Interface */ #define MSR_IA32_HW_FEEDBACK_PTR 0x17d0 #define MSR_IA32_HW_FEEDBACK_CONFIG 0x17d1 diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 778df05f8539..920426d691ce 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -87,6 +87,15 @@ static __always_inline void __mwaitx(unsigned long eax, unsigned long ebx, :: "a" (eax), "b" (ebx), "c" (ecx)); } +/* + * Re-enable interrupts right upon calling mwait in such a way that + * no interrupt can fire _before_ the execution of mwait, ie: no + * instruction must be placed between "sti" and "mwait". 
+ * + * This is necessary because if an interrupt queues a timer before + * executing mwait, it would otherwise go unnoticed and the next tick + * would not be reprogrammed accordingly before mwait ever wakes up. + */ static __always_inline void __sti_mwait(unsigned long eax, unsigned long ecx) { mds_idle_clear_cpu_buffers(); @@ -115,8 +124,15 @@ static __always_inline void mwait_idle_with_hints(unsigned long eax, unsigned lo } __monitor((void *)&current_thread_info()->flags, 0, 0); - if (!need_resched()) - __mwait(eax, ecx); + + if (!need_resched()) { + if (ecx & 1) { + __mwait(eax, ecx); + } else { + __sti_mwait(eax, ecx); + raw_local_irq_disable(); + } + } } current_clr_polling(); } diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 65fbf6b853af..691ff1ef701b 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -49,7 +49,7 @@ * but there is still a cushion vs. the RSB depth. The algorithm does not * claim to be perfect and it can be speculated around by the CPU, but it * is considered that it obfuscates the problem enough to make exploitation - * extremly difficult. + * extremely difficult. */ #define RET_DEPTH_SHIFT 5 #define RSB_RET_STUFF_LOOPS 16 @@ -187,7 +187,7 @@ .macro ANNOTATE_RETPOLINE_SAFE .Lhere_\@: .pushsection .discard.retpoline_safe - .long .Lhere_\@ - . + .long .Lhere_\@ .popsection .endm @@ -199,7 +199,7 @@ /* * Abuse ANNOTATE_RETPOLINE_SAFE on a NOP to indicate UNRET_END, should - * eventually turn into it's own annotation. + * eventually turn into its own annotation. */ .macro VALIDATE_UNRET_END #if defined(CONFIG_NOINSTR_VALIDATION) && \ @@ -262,7 +262,7 @@ .Lskip_rsb_\@: .endm -#ifdef CONFIG_CPU_UNRET_ENTRY +#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO) #define CALL_UNTRAIN_RET "call entry_untrain_ret" #else #define CALL_UNTRAIN_RET "" @@ -279,38 +279,24 @@ * As such, this must be placed after every *SWITCH_TO_KERNEL_CR3 at a point * where we have a stack but before any RET instruction.
*/ -.macro UNTRAIN_RET -#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ - defined(CONFIG_CALL_DEPTH_TRACKING) || defined(CONFIG_CPU_SRSO) +.macro __UNTRAIN_RET ibpb_feature, call_depth_insns +#if defined(CONFIG_RETHUNK) || defined(CONFIG_CPU_IBPB_ENTRY) VALIDATE_UNRET_END ALTERNATIVE_3 "", \ CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \ - "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \ - __stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH + "call entry_ibpb", \ibpb_feature, \ + __stringify(\call_depth_insns), X86_FEATURE_CALL_DEPTH #endif .endm -.macro UNTRAIN_RET_VM -#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ - defined(CONFIG_CALL_DEPTH_TRACKING) || defined(CONFIG_CPU_SRSO) - VALIDATE_UNRET_END - ALTERNATIVE_3 "", \ - CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \ - "call entry_ibpb", X86_FEATURE_IBPB_ON_VMEXIT, \ - __stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH -#endif -.endm +#define UNTRAIN_RET \ + __UNTRAIN_RET X86_FEATURE_ENTRY_IBPB, __stringify(RESET_CALL_DEPTH) -.macro UNTRAIN_RET_FROM_CALL -#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ - defined(CONFIG_CALL_DEPTH_TRACKING) - VALIDATE_UNRET_END - ALTERNATIVE_3 "", \ - CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \ - "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \ - __stringify(RESET_CALL_DEPTH_FROM_CALL), X86_FEATURE_CALL_DEPTH -#endif -.endm +#define UNTRAIN_RET_VM \ + __UNTRAIN_RET X86_FEATURE_IBPB_ON_VMEXIT, __stringify(RESET_CALL_DEPTH) + +#define UNTRAIN_RET_FROM_CALL \ + __UNTRAIN_RET X86_FEATURE_ENTRY_IBPB, __stringify(RESET_CALL_DEPTH_FROM_CALL) .macro CALL_DEPTH_ACCOUNT @@ -325,7 +311,7 @@ #define ANNOTATE_RETPOLINE_SAFE \ "999:\n\t" \ ".pushsection .discard.retpoline_safe\n\t" \ - ".long 999b - .\n\t" \ + ".long 999b\n\t" \ ".popsection\n\t" typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE]; @@ -339,13 +325,23 @@ extern void __x86_return_thunk(void); static inline void __x86_return_thunk(void) {} #endif +#ifdef CONFIG_CPU_UNRET_ENTRY extern void retbleed_return_thunk(void); +#else +static inline void retbleed_return_thunk(void) {} +#endif + +#ifdef CONFIG_CPU_SRSO extern void srso_return_thunk(void); extern void srso_alias_return_thunk(void); +#else +static inline void srso_return_thunk(void) {} +static inline void srso_alias_return_thunk(void) {} +#endif -extern void retbleed_untrain_ret(void); -extern void srso_untrain_ret(void); -extern void srso_alias_untrain_ret(void); +extern void retbleed_return_thunk(void); +extern void srso_return_thunk(void); +extern void srso_alias_return_thunk(void); extern void entry_untrain_ret(void); extern void entry_ibpb(void); @@ -353,12 +349,7 @@ extern void entry_ibpb(void); extern void (*x86_return_thunk)(void); #ifdef CONFIG_CALL_DEPTH_TRACKING -extern void __x86_return_skl(void); - -static inline void x86_set_skl_return_thunk(void) -{ - x86_return_thunk = &__x86_return_skl; -} +extern void call_depth_return_thunk(void); #define CALL_DEPTH_ACCOUNT \ ALTERNATIVE("", \ @@ -371,12 +362,12 @@ DECLARE_PER_CPU(u64, __x86_ret_count); DECLARE_PER_CPU(u64, __x86_stuffs_count); DECLARE_PER_CPU(u64, __x86_ctxsw_count); #endif -#else -static inline void x86_set_skl_return_thunk(void) {} +#else /* !CONFIG_CALL_DEPTH_TRACKING */ +static inline void call_depth_return_thunk(void) {} #define CALL_DEPTH_ACCOUNT "" -#endif +#endif /* CONFIG_CALL_DEPTH_TRACKING */ #ifdef CONFIG_RETPOLINE diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index e3bae2b60a0d..ef2844d69173 100644 --- a/arch/x86/include/asm/numa.h +++ 
b/arch/x86/include/asm/numa.h @@ -12,13 +12,6 @@ #define NR_NODE_MEMBLKS (MAX_NUMNODES*2) -/* - * Too small node sizes may confuse the VM badly. Usually they - * result from BIOS bugs. So dont recognize nodes as standalone - * NUMA entities that have less than this amount of RAM listed: - */ -#define NODE_MIN_SIZE (4*1024*1024) - extern int numa_off; /* diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 6c8ff12140ae..d4eb9e1d61b8 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -6,6 +6,10 @@ #include <asm/paravirt_types.h> +#ifndef __ASSEMBLY__ +struct mm_struct; +#endif + #ifdef CONFIG_PARAVIRT #include <asm/pgtable_types.h> #include <asm/asm.h> @@ -142,8 +146,7 @@ static inline void write_cr0(unsigned long x) static __always_inline unsigned long read_cr2(void) { return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2, - "mov %%cr2, %%rax;", - ALT_NOT(X86_FEATURE_XENPV)); + "mov %%cr2, %%rax;", ALT_NOT_XEN); } static __always_inline void write_cr2(unsigned long x) @@ -154,13 +157,12 @@ static __always_inline void write_cr2(unsigned long x) static inline unsigned long __read_cr3(void) { return PVOP_ALT_CALL0(unsigned long, mmu.read_cr3, - "mov %%cr3, %%rax;", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%cr3, %%rax;", ALT_NOT_XEN); } static inline void write_cr3(unsigned long x) { - PVOP_ALT_VCALL1(mmu.write_cr3, x, - "mov %%rdi, %%cr3", ALT_NOT(X86_FEATURE_XENPV)); + PVOP_ALT_VCALL1(mmu.write_cr3, x, "mov %%rdi, %%cr3", ALT_NOT_XEN); } static inline void __write_cr4(unsigned long x) @@ -182,7 +184,7 @@ extern noinstr void pv_native_wbinvd(void); static __always_inline void wbinvd(void) { - PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT(X86_FEATURE_XENPV)); + PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT_XEN); } static inline u64 paravirt_read_msr(unsigned msr) @@ -390,27 +392,25 @@ static inline void paravirt_release_p4d(unsigned long pfn) static inline pte_t __pte(pteval_t val) { return (pte_t) { PVOP_ALT_CALLEE1(pteval_t, mmu.make_pte, val, - "mov %%rdi, %%rax", - ALT_NOT(X86_FEATURE_XENPV)) }; + "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pteval_t pte_val(pte_t pte) { return PVOP_ALT_CALLEE1(pteval_t, mmu.pte_val, pte.pte, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline pgd_t __pgd(pgdval_t val) { return (pgd_t) { PVOP_ALT_CALLEE1(pgdval_t, mmu.make_pgd, val, - "mov %%rdi, %%rax", - ALT_NOT(X86_FEATURE_XENPV)) }; + "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pgdval_t pgd_val(pgd_t pgd) { return PVOP_ALT_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); } #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION @@ -444,14 +444,13 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) static inline pmd_t __pmd(pmdval_t val) { return (pmd_t) { PVOP_ALT_CALLEE1(pmdval_t, mmu.make_pmd, val, - "mov %%rdi, %%rax", - ALT_NOT(X86_FEATURE_XENPV)) }; + "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pmdval_t pmd_val(pmd_t pmd) { return PVOP_ALT_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void set_pud(pud_t *pudp, pud_t pud) @@ -464,7 +463,7 @@ static inline pud_t __pud(pudval_t val) pudval_t ret; ret = PVOP_ALT_CALLEE1(pudval_t, mmu.make_pud, val, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); return (pud_t) { ret }; } @@ -472,7 +471,7 @@ static inline pud_t 
__pud(pudval_t val) static inline pudval_t pud_val(pud_t pud) { return PVOP_ALT_CALLEE1(pudval_t, mmu.pud_val, pud.pud, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void pud_clear(pud_t *pudp) @@ -492,8 +491,7 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) static inline p4d_t __p4d(p4dval_t val) { p4dval_t ret = PVOP_ALT_CALLEE1(p4dval_t, mmu.make_p4d, val, - "mov %%rdi, %%rax", - ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); return (p4d_t) { ret }; } @@ -501,7 +499,7 @@ static inline p4d_t __p4d(p4dval_t val) static inline p4dval_t p4d_val(p4d_t p4d) { return PVOP_ALT_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd) @@ -687,17 +685,17 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu); static __always_inline unsigned long arch_local_save_flags(void) { return PVOP_ALT_CALLEE0(unsigned long, irq.save_fl, "pushf; pop %%rax;", - ALT_NOT(X86_FEATURE_XENPV)); + ALT_NOT_XEN); } static __always_inline void arch_local_irq_disable(void) { - PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT(X86_FEATURE_XENPV)); + PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT_XEN); } static __always_inline void arch_local_irq_enable(void) { - PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT(X86_FEATURE_XENPV)); + PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT_XEN); } static __always_inline unsigned long arch_local_irq_save(void) @@ -726,52 +724,25 @@ static __always_inline unsigned long arch_local_irq_save(void) #undef PVOP_VCALL4 #undef PVOP_CALL4 -#define DEFINE_PARAVIRT_ASM(func, instr, sec) \ - asm (".pushsection " #sec ", \"ax\"\n" \ - ".global " #func "\n\t" \ - ".type " #func ", @function\n\t" \ - ASM_FUNC_ALIGN "\n" \ - #func ":\n\t" \ - ASM_ENDBR \ - instr "\n\t" \ - ASM_RET \ - ".size " #func ", . - " #func "\n\t" \ - ".popsection") - extern void default_banner(void); void native_pv_lock_init(void) __init; #else /* __ASSEMBLY__ */ -#define _PVSITE(ptype, ops, word, algn) \ -771:; \ - ops; \ -772:; \ - .pushsection .parainstructions,"a"; \ - .align algn; \ - word 771b; \ - .byte ptype; \ - .byte 772b-771b; \ - _ASM_ALIGN; \ - .popsection - - #ifdef CONFIG_X86_64 #ifdef CONFIG_PARAVIRT_XXL +#ifdef CONFIG_DEBUG_ENTRY -#define PARA_PATCH(off) ((off) / 8) -#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .quad, 8) #define PARA_INDIRECT(addr) *addr(%rip) -#ifdef CONFIG_DEBUG_ENTRY .macro PARA_IRQ_save_fl - PARA_SITE(PARA_PATCH(PV_IRQ_save_fl), - ANNOTATE_RETPOLINE_SAFE; - call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);) + ANNOTATE_RETPOLINE_SAFE; + call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl); .endm -#define SAVE_FLAGS ALTERNATIVE "PARA_IRQ_save_fl;", "pushf; pop %rax;", \ - ALT_NOT(X86_FEATURE_XENPV) +#define SAVE_FLAGS ALTERNATIVE_2 "PARA_IRQ_save_fl;", \ + "ALT_CALL_INSTR;", ALT_CALL_ALWAYS, \ + "pushf; pop %rax;", ALT_NOT_XEN #endif #endif /* CONFIG_PARAVIRT_XXL */ #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 4acbcddddc29..8d4fbe1be489 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -2,25 +2,10 @@ #ifndef _ASM_X86_PARAVIRT_TYPES_H #define _ASM_X86_PARAVIRT_TYPES_H -#ifndef __ASSEMBLY__ -/* These all sit in the .parainstructions section to tell us what to patch. 
*/ -struct paravirt_patch_site { - u8 *instr; /* original instructions */ - u8 type; /* type of this instruction */ - u8 len; /* length of original instruction */ -}; - -/* Lazy mode for batching updates / context switch */ -enum paravirt_lazy_mode { - PARAVIRT_LAZY_NONE, - PARAVIRT_LAZY_MMU, - PARAVIRT_LAZY_CPU, -}; -#endif - #ifdef CONFIG_PARAVIRT #ifndef __ASSEMBLY__ +#include <linux/types.h> #include <asm/desc_defs.h> #include <asm/pgtable_types.h> @@ -257,43 +242,11 @@ struct paravirt_patch_template { extern struct pv_info pv_info; extern struct paravirt_patch_template pv_ops; -#define PARAVIRT_PATCH(x) \ - (offsetof(struct paravirt_patch_template, x) / sizeof(void *)) - -#define paravirt_type(op) \ - [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ - [paravirt_opptr] "m" (pv_ops.op) -/* - * Generate some code, and mark it as patchable by the - * apply_paravirt() alternate instruction patcher. - */ -#define _paravirt_alt(insn_string, type) \ - "771:\n\t" insn_string "\n" "772:\n" \ - ".pushsection .parainstructions,\"a\"\n" \ - _ASM_ALIGN "\n" \ - _ASM_PTR " 771b\n" \ - " .byte " type "\n" \ - " .byte 772b-771b\n" \ - _ASM_ALIGN "\n" \ - ".popsection\n" - -/* Generate patchable code, with the default asm parameters. */ -#define paravirt_alt(insn_string) \ - _paravirt_alt(insn_string, "%c[paravirt_typenum]") - -/* Simple instruction patching code. */ -#define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t" - -unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, unsigned int len); +#define paravirt_ptr(op) [paravirt_opptr] "m" (pv_ops.op) int paravirt_disable_iospace(void); -/* - * This generates an indirect call based on the operation type number. - * The type number, computed in PARAVIRT_PATCH, is derived from the - * offset into the paravirt_patch_template structure, and can therefore be - * freely converted back into a structure offset. - */ +/* This generates an indirect call based on the operation type number. */ #define PARAVIRT_CALL \ ANNOTATE_RETPOLINE_SAFE \ "call *%[paravirt_opptr];" @@ -326,12 +279,6 @@ int paravirt_disable_iospace(void); * However, x86_64 also has to clobber all caller saved registers, which * unfortunately, are quite a bit (r8 - r11) * - * The call instruction itself is marked by placing its start address - * and size into the .parainstructions section, so that - * apply_paravirt() in arch/i386/kernel/alternative.c can do the - * appropriate patching under the control of the backend pv_init_ops - * implementation. - * * Unfortunately there's no way to get gcc to generate the args setup * for the call, and then allow the call itself to be generated by an * inline asm. Because of this, we must do the complete arg setup and @@ -430,14 +377,27 @@ int paravirt_disable_iospace(void); __mask & __eax; \ }) - +/* + * Use alternative patching for paravirt calls: + * - For replacing an indirect call with a direct one, use the "normal" + * ALTERNATIVE() macro with the indirect call as the initial code sequence, + * which will be replaced with the related direct call by using the + * ALT_FLAG_DIRECT_CALL special case and the "always on" feature. + * - In case the replacement is either a direct call or a short code sequence + * depending on a feature bit, the ALTERNATIVE_2() macro is being used. + * The indirect call is the initial code sequence again, while the special + * code sequence is selected with the specified feature bit. 
In case the + * feature is not active, the direct call is used as above via the + * ALT_FLAG_DIRECT_CALL special case and the "always on" feature. + */ #define ____PVOP_CALL(ret, op, call_clbr, extra_clbr, ...) \ ({ \ PVOP_CALL_ARGS; \ PVOP_TEST_NULL(op); \ - asm volatile(paravirt_alt(PARAVIRT_CALL) \ + asm volatile(ALTERNATIVE(PARAVIRT_CALL, ALT_CALL_INSTR, \ + ALT_CALL_ALWAYS) \ : call_clbr, ASM_CALL_CONSTRAINT \ - : paravirt_type(op), \ + : paravirt_ptr(op), \ ##__VA_ARGS__ \ : "memory", "cc" extra_clbr); \ ret; \ @@ -448,10 +408,11 @@ int paravirt_disable_iospace(void); ({ \ PVOP_CALL_ARGS; \ PVOP_TEST_NULL(op); \ - asm volatile(ALTERNATIVE(paravirt_alt(PARAVIRT_CALL), \ - alt, cond) \ + asm volatile(ALTERNATIVE_2(PARAVIRT_CALL, \ + ALT_CALL_INSTR, ALT_CALL_ALWAYS, \ + alt, cond) \ : call_clbr, ASM_CALL_CONSTRAINT \ - : paravirt_type(op), \ + : paravirt_ptr(op), \ ##__VA_ARGS__ \ : "memory", "cc" extra_clbr); \ ret; \ @@ -549,16 +510,6 @@ int paravirt_disable_iospace(void); __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) -enum paravirt_lazy_mode paravirt_get_lazy_mode(void); -void paravirt_start_context_switch(struct task_struct *prev); -void paravirt_end_context_switch(struct task_struct *next); - -void paravirt_enter_lazy_mmu(void); -void paravirt_leave_lazy_mmu(void); -void paravirt_flush_lazy_mmu(void); - -void _paravirt_nop(void); -void paravirt_BUG(void); unsigned long paravirt_ret0(void); #ifdef CONFIG_PARAVIRT_XXL u64 _paravirt_ident_64(u64); @@ -568,11 +519,11 @@ void pv_native_irq_enable(void); unsigned long pv_native_read_cr2(void); #endif -#define paravirt_nop ((void *)_paravirt_nop) - -extern struct paravirt_patch_site __parainstructions[], - __parainstructions_end[]; +#define paravirt_nop ((void *)nop_func) #endif /* __ASSEMBLY__ */ + +#define ALT_NOT_XEN ALT_NOT(X86_FEATURE_XENPV) + #endif /* CONFIG_PARAVIRT */ #endif /* _ASM_X86_PARAVIRT_TYPES_H */ diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index e56a37886143..44958ebaf626 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -29,8 +29,8 @@ #else /* ...!ASSEMBLY */ #include <linux/build_bug.h> -#include <linux/kernel.h> #include <linux/stringify.h> +#include <asm/asm.h> #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 85a9fd5a3ec3..3736b8a46c04 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -31,6 +31,7 @@ #define ARCH_PERFMON_EVENTSEL_ENABLE (1ULL << 22) #define ARCH_PERFMON_EVENTSEL_INV (1ULL << 23) #define ARCH_PERFMON_EVENTSEL_CMASK 0xFF000000ULL +#define ARCH_PERFMON_EVENTSEL_BR_CNTR (1ULL << 35) #define INTEL_FIXED_BITS_MASK 0xFULL #define INTEL_FIXED_BITS_STRIDE 4 @@ -112,6 +113,13 @@ (AMD64_PERFMON_V2_EVENTSEL_EVENT_NB | \ AMD64_PERFMON_V2_EVENTSEL_UMASK_NB) +#define AMD64_PERFMON_V2_ENABLE_UMC BIT_ULL(31) +#define AMD64_PERFMON_V2_EVENTSEL_EVENT_UMC GENMASK_ULL(7, 0) +#define AMD64_PERFMON_V2_EVENTSEL_RDWRMASK_UMC GENMASK_ULL(9, 8) +#define AMD64_PERFMON_V2_RAW_EVENT_MASK_UMC \ + (AMD64_PERFMON_V2_EVENTSEL_EVENT_UMC | \ + AMD64_PERFMON_V2_EVENTSEL_RDWRMASK_UMC) + #define AMD64_NUM_COUNTERS 4 #define AMD64_NUM_COUNTERS_CORE 6 #define AMD64_NUM_COUNTERS_NB 4 @@ -216,6 +224,9 @@ union cpuid28_ecx { unsigned int lbr_timed_lbr:1; /* Branch Type Field Supported */ unsigned int lbr_br_type:1; + unsigned int reserved:13; + /* Branch counters (Event Logging) Supported */ + unsigned int lbr_counters:4; } 
split; unsigned int full; }; @@ -232,6 +243,8 @@ union cpuid_0x80000022_ebx { unsigned int lbr_v2_stack_sz:6; /* Number of Data Fabric Counters */ unsigned int num_df_pmc:6; + /* Number of Unified Memory Controller Counters */ + unsigned int num_umc_pmc:6; } split; unsigned int full; }; diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index d6ad98ca1288..9d077bca6a10 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -141,6 +141,7 @@ static inline int pte_young(pte_t pte) return pte_flags(pte) & _PAGE_ACCESSED; } +#define pmd_dirty pmd_dirty static inline bool pmd_dirty(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_DIRTY_BITS; @@ -955,6 +956,14 @@ static inline int pte_same(pte_t a, pte_t b) return a.pte == b.pte; } +static inline pte_t pte_next_pfn(pte_t pte) +{ + if (__pte_needs_invert(pte_val(pte))) + return __pte(pte_val(pte) - (1UL << PFN_PTE_SHIFT)); + return __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT)); +} +#define pte_next_pfn pte_next_pfn + static inline int pte_present(pte_t a) { return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); @@ -1671,12 +1680,6 @@ static inline bool arch_has_pfn_modify_check(void) return boot_cpu_has_bug(X86_BUG_L1TF); } -#define arch_has_hw_pte_young arch_has_hw_pte_young -static inline bool arch_has_hw_pte_young(void) -{ - return true; -} - #define arch_check_zapped_pte arch_check_zapped_pte void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte); @@ -1708,6 +1711,14 @@ static inline bool pud_user_accessible_page(pud_t pud) } #endif +#ifdef CONFIG_X86_SGX +int arch_memory_failure(unsigned long pfn, int flags); +#define arch_memory_failure arch_memory_failure + +bool arch_is_platform_page(u64 paddr); +#define arch_is_platform_page arch_is_platform_page +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_H */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index a629b1b9f65a..24af25b1551a 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -203,7 +203,7 @@ static inline void native_pgd_clear(pgd_t *pgd) * F (2) in swp entry is used to record when a pagetable is * writeprotected by userfaultfd WP support. * - * E (3) in swp entry is used to rememeber PG_anon_exclusive. + * E (3) in swp entry is used to remember PG_anon_exclusive. * * Bit 7 in swp entry should be 0 because pmd_present checks not only P, * but also L and G. diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 4b2a35d8d56a..919909d8cb77 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -6,7 +6,6 @@ #include <asm/percpu.h> #include <asm/current.h> -#include <linux/thread_info.h> #include <linux/static_call_types.h> /* We use the MSB mostly because its available */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index a94a857152c4..1188e8bf76a2 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -75,11 +75,36 @@ extern u16 __read_mostly tlb_lld_4m[NR_INFO]; extern u16 __read_mostly tlb_lld_1g[NR_INFO]; /* - * CPU type and hardware bug flags. Kept separately for each CPU. - * Members of this structure are referenced in head_32.S, so think twice - * before touching them. [mj] + * CPU type and hardware bug flags. Kept separately for each CPU. 
*/ +struct cpuinfo_topology { + // Real APIC ID read from the local APIC + u32 apicid; + // The initial APIC ID provided by CPUID + u32 initial_apicid; + + // Physical package ID + u32 pkg_id; + + // Physical die ID on AMD, Relative on Intel + u32 die_id; + + // Compute unit ID - AMD specific + u32 cu_id; + + // Core ID relative to the package + u32 core_id; + + // Logical ID mappings + u32 logical_pkg_id; + u32 logical_die_id; + + // Cache level topology IDs + u32 llc_id; + u32 l2c_id; +}; + struct cpuinfo_x86 { __u8 x86; /* CPU family */ __u8 x86_vendor; /* CPU vendor */ @@ -96,7 +121,6 @@ struct cpuinfo_x86 { __u8 x86_phys_bits; /* CPUID returned core id bits: */ __u8 x86_coreid_bits; - __u8 cu_id; /* Max extended CPUID function supported: */ __u32 extended_cpuid_level; /* Maximum supported CPUID level, -1=no CPUID: */ @@ -112,6 +136,7 @@ struct cpuinfo_x86 { }; char x86_vendor_id[16]; char x86_model_id[64]; + struct cpuinfo_topology topo; /* in KB - valid for CPUS which support this call: */ unsigned int x86_cache_size; int x86_cache_alignment; /* In bytes */ @@ -125,19 +150,9 @@ struct cpuinfo_x86 { u64 ppin; /* cpuid returned max cores value: */ u16 x86_max_cores; - u16 apicid; - u16 initial_apicid; u16 x86_clflush_size; /* number of cores as seen by the OS: */ u16 booted_cores; - /* Physical processor id: */ - u16 phys_proc_id; - /* Logical processor id: */ - u16 logical_proc_id; - /* Core id: */ - u16 cpu_core_id; - u16 cpu_die_id; - u16 logical_die_id; /* Index into per_cpu list: */ u16 cpu_index; /* Is SMT active on this core? */ @@ -399,7 +414,7 @@ static inline unsigned long cpu_kernelmode_gs_base(int cpu) return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu); } -extern asmlinkage void ignore_sysret(void); +extern asmlinkage void entry_SYSCALL32_ignore(void); /* Save actual FS/GS selectors and bases to current->thread */ void current_save_fsgs(void); @@ -681,18 +696,24 @@ extern int set_tsc_mode(unsigned int val); DECLARE_PER_CPU(u64, msr_misc_features_shadow); -extern u16 get_llc_id(unsigned int cpu); +static inline u32 per_cpu_llc_id(unsigned int cpu) +{ + return per_cpu(cpu_info.topo.llc_id, cpu); +} + +static inline u32 per_cpu_l2c_id(unsigned int cpu) +{ + return per_cpu(cpu_info.topo.l2c_id, cpu); +} #ifdef CONFIG_CPU_SUP_AMD extern u32 amd_get_nodes_per_socket(void); extern u32 amd_get_highest_perf(void); -extern bool cpu_has_ibpb_brtype_microcode(void); extern void amd_clear_divider(void); extern void amd_check_microcode(void); #else static inline u32 amd_get_nodes_per_socket(void) { return 0; } static inline u32 amd_get_highest_perf(void) { return 0; } -static inline bool cpu_has_ibpb_brtype_microcode(void) { return false; } static inline void amd_clear_divider(void) { } static inline void amd_check_microcode(void) { } #endif @@ -729,14 +750,24 @@ enum mds_mitigations { MDS_MITIGATION_VMWERV, }; -#ifdef CONFIG_X86_SGX -int arch_memory_failure(unsigned long pfn, int flags); -#define arch_memory_failure arch_memory_failure - -bool arch_is_platform_page(u64 paddr); -#define arch_is_platform_page arch_is_platform_page -#endif - extern bool gds_ucode_mitigated(void); +/* + * Make previous memory operations globally visible before + * a WRMSR. + * + * MFENCE makes writes visible, but only affects load/store + * instructions. WRMSR is unfortunately not a load/store + * instruction and is unaffected by MFENCE. The LFENCE ensures + * that the WRMSR is not reordered. + * + * Most WRMSRs are full serializing instructions themselves and + * do not require this barrier. 
This is only required for the + * IA32_TSC_DEADLINE and X2APIC MSRs. + */ +static inline void weak_wrmsr_fence(void) +{ + alternative("mfence; lfence", "", ALT_NOT(X86_FEATURE_APIC_MSRS_FENCE)); +} + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index b716d291d0d4..65dee2420624 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -31,6 +31,11 @@ static inline void x86_dtb_init(void) { } #define of_ioapic 0 #endif +#ifdef CONFIG_OF_EARLY_FLATTREE +void x86_flattree_get_config(void); +#else +static inline void x86_flattree_get_config(void) { } +#endif extern char cmd_line[COMMAND_LINE_SIZE]; #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 12ef86b19910..484f4f0131a5 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -32,10 +32,9 @@ void entry_SYSCALL_compat(void); void entry_SYSCALL_compat_safe_stack(void); void entry_SYSRETL_compat_unsafe_stack(void); void entry_SYSRETL_compat_end(void); -void entry_INT80_compat(void); -#ifdef CONFIG_XEN_PV -void xen_entry_INT80_compat(void); -#endif +#else /* !CONFIG_IA32_EMULATION */ +#define entry_SYSCALL_compat NULL +#define entry_SYSENTER_compat NULL #endif void x86_configure_nx(void); diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index 85b6e3609cb9..ef9697f20129 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -56,8 +56,8 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text"); "pop %rdx\n\t" \ FRAME_END -DEFINE_PARAVIRT_ASM(__raw_callee_save___pv_queued_spin_unlock, - PV_UNLOCK_ASM, .spinlock.text); +DEFINE_ASM_FUNC(__raw_callee_save___pv_queued_spin_unlock, + PV_UNLOCK_ASM, .spinlock.text); #else /* CONFIG_64BIT */ diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index 9177b4354c3f..6536873f8fc0 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -25,7 +25,14 @@ void __noreturn machine_real_restart(unsigned int type); #define MRR_BIOS 0 #define MRR_APM 1 +#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD) +typedef void (cpu_emergency_virt_cb)(void); +void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback); +void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback); void cpu_emergency_disable_virtualization(void); +#else +static inline void cpu_emergency_disable_virtualization(void) {} +#endif /* CONFIG_KVM_INTEL || CONFIG_KVM_AMD */ typedef void (*nmi_shootdown_cb)(int, struct pt_regs*); void nmi_shootdown_cpus(nmi_shootdown_cb callback); diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index 4b081e0d3306..363266cbcada 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -13,7 +13,7 @@ #define __GEN_RMWcc(fullop, _var, cc, clobbers, ...) 
\ ({ \ bool c = false; \ - asm_volatile_goto (fullop "; j" #cc " %l[cc_label]" \ + asm goto (fullop "; j" #cc " %l[cc_label]" \ : : [var] "m" (_var), ## __VA_ARGS__ \ : clobbers : cc_label); \ if (0) { \ diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index f3495623ac99..5c83729c8e71 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -31,8 +31,6 @@ #include <asm/bootparam.h> #include <asm/x86_init.h> -extern u64 relocated_ramdisk; - /* Interrupt control for vSMPowered x86_64 systems */ #ifdef CONFIG_X86_64 void vsmp_init(void); @@ -126,6 +124,7 @@ void clear_bss(void); #ifdef __i386__ asmlinkage void __init __noreturn i386_start_kernel(void); +void __init mk_early_pgtbl_32(void); #else asmlinkage void __init __noreturn x86_64_start_kernel(char *real_mode); diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index 7513b3bb69b7..fdfd41511b02 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -11,45 +11,91 @@ #define TDX_IDENT "IntelTDX " /* TDX module Call Leaf IDs */ -#define TDX_GET_INFO 1 -#define TDX_GET_VEINFO 3 -#define TDX_GET_REPORT 4 -#define TDX_ACCEPT_PAGE 6 -#define TDX_WR 8 +#define TDG_VP_VMCALL 0 +#define TDG_VP_INFO 1 +#define TDG_VP_VEINFO_GET 3 +#define TDG_MR_REPORT 4 +#define TDG_MEM_PAGE_ACCEPT 6 +#define TDG_VM_WR 8 /* TDCS fields. To be used by TDG.VM.WR and TDG.VM.RD module calls */ #define TDCS_NOTIFY_ENABLES 0x9100000000000010 /* TDX hypercall Leaf IDs */ #define TDVMCALL_MAP_GPA 0x10001 +#define TDVMCALL_GET_QUOTE 0x10002 #define TDVMCALL_REPORT_FATAL_ERROR 0x10003 +#define TDVMCALL_STATUS_RETRY 1 + +/* + * Bitmasks of exposed registers (with VMM). + */ +#define TDX_RDX BIT(2) +#define TDX_RBX BIT(3) +#define TDX_RSI BIT(6) +#define TDX_RDI BIT(7) +#define TDX_R8 BIT(8) +#define TDX_R9 BIT(9) +#define TDX_R10 BIT(10) +#define TDX_R11 BIT(11) +#define TDX_R12 BIT(12) +#define TDX_R13 BIT(13) +#define TDX_R14 BIT(14) +#define TDX_R15 BIT(15) + +/* + * These registers are clobbered to hold arguments for each + * TDVMCALL. They are safe to expose to the VMM. + * Each bit in this mask represents a register ID. Bit field + * details can be found in TDX GHCI specification, section + * titled "TDCALL [TDG.VP.VMCALL] leaf". + */ +#define TDVMCALL_EXPOSE_REGS_MASK \ + (TDX_RDX | TDX_RBX | TDX_RSI | TDX_RDI | TDX_R8 | TDX_R9 | \ + TDX_R10 | TDX_R11 | TDX_R12 | TDX_R13 | TDX_R14 | TDX_R15) + +/* TDX supported page sizes from the TDX module ABI. */ +#define TDX_PS_4K 0 +#define TDX_PS_2M 1 +#define TDX_PS_1G 2 +#define TDX_PS_NR (TDX_PS_1G + 1) + #ifndef __ASSEMBLY__ +#include <linux/compiler_attributes.h> + /* - * Used in __tdx_hypercall() to pass down and get back registers' values of - * the TDCALL instruction when requesting services from the VMM. - * - * This is a software only structure and not part of the TDX module/VMM ABI. + * Used in __tdcall*() to gather the input/output registers' values of the + * TDCALL instruction when requesting services from the TDX module. 
This is a + * software only structure and not part of the TDX module/VMM ABI */ -struct tdx_hypercall_args { +struct tdx_module_args { + /* callee-clobbered */ + u64 rcx; + u64 rdx; u64 r8; u64 r9; + /* extra callee-clobbered */ u64 r10; u64 r11; + /* callee-saved + rdi/rsi */ u64 r12; u64 r13; u64 r14; u64 r15; + u64 rbx; u64 rdi; u64 rsi; - u64 rbx; - u64 rdx; }; +/* Used to communicate with the TDX module */ +u64 __tdcall(u64 fn, struct tdx_module_args *args); +u64 __tdcall_ret(u64 fn, struct tdx_module_args *args); +u64 __tdcall_saved_ret(u64 fn, struct tdx_module_args *args); + /* Used to request services from the VMM */ -u64 __tdx_hypercall(struct tdx_hypercall_args *args); -u64 __tdx_hypercall_ret(struct tdx_hypercall_args *args); +u64 __tdx_hypercall(struct tdx_module_args *args); /* * Wrapper for standard use of __tdx_hypercall with no output aside from @@ -57,7 +103,7 @@ u64 __tdx_hypercall_ret(struct tdx_hypercall_args *args); */ static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15) { - struct tdx_hypercall_args args = { + struct tdx_module_args args = { .r10 = TDX_HYPERCALL_STANDARD, .r11 = fn, .r12 = r12, @@ -71,25 +117,7 @@ static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15) /* Called from __tdx_hypercall() for unrecoverable failure */ -void __tdx_hypercall_failed(void); - -/* - * Used in __tdx_module_call() to gather the output registers' values of the - * TDCALL instruction when requesting services from the TDX module. This is a - * software only structure and not part of the TDX module/VMM ABI - */ -struct tdx_module_output { - u64 rcx; - u64 rdx; - u64 r8; - u64 r9; - u64 r10; - u64 r11; -}; - -/* Used to communicate with the TDX module */ -u64 __tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, - struct tdx_module_output *out); +void __noreturn __tdx_hypercall_failed(void); bool tdx_accept_memory(phys_addr_t start, phys_addr_t end); diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index ad98dd1d9cfb..4fab2ed454f3 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -17,10 +17,8 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map); /* cpus sharing the last level cache: */ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map); -DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id); -DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id); -DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid); +DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid); DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid); struct task_struct; @@ -129,7 +127,6 @@ void native_smp_send_reschedule(int cpu); void native_send_call_func_ipi(const struct cpumask *mask); void native_send_call_func_single_ipi(int cpu); -bool smp_park_other_cpus_in_init(void); void smp_store_cpu_info(int id); asmlinkage __visible void smp_reboot_interrupt(void); diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index 64df897c0ee3..1be13b2dfe8b 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h @@ -37,6 +37,8 @@ extern int phys_to_target_node(phys_addr_t start); #define phys_to_target_node phys_to_target_node extern int memory_add_physaddr_to_nid(u64 start); #define memory_add_physaddr_to_nid memory_add_physaddr_to_nid +extern int numa_fill_memblks(u64 start, u64 end); +#define numa_fill_memblks numa_fill_memblks #endif #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/spec-ctrl.h 
b/arch/x86/include/asm/spec-ctrl.h index cb0386fc4dc3..c648502e4535 100644 --- a/arch/x86/include/asm/spec-ctrl.h +++ b/arch/x86/include/asm/spec-ctrl.h @@ -4,6 +4,7 @@ #include <linux/thread_info.h> #include <asm/nospec-branch.h> +#include <asm/msr.h> /* * On VMENTER we must preserve whatever view of the SPEC_CTRL MSR @@ -76,6 +77,16 @@ static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn) return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL; } +/* + * This can be used in noinstr functions & should only be called in bare + * metal context. + */ +static __always_inline void __update_spec_ctrl(u64 val) +{ + __this_cpu_write(x86_spec_ctrl_current, val); + native_wrmsrl(MSR_IA32_SPEC_CTRL, val); +} + #ifdef CONFIG_SMP extern void speculative_store_bypass_ht_init(void); #else diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index d6cd9344f6c7..48f8dd47cf68 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -205,7 +205,7 @@ static inline void clwb(volatile void *__p) #ifdef CONFIG_X86_USER_SHADOW_STACK static inline int write_user_shstk_64(u64 __user *addr, u64 val) { - asm_volatile_goto("1: wrussq %[val], (%[addr])\n" + asm goto("1: wrussq %[val], (%[addr])\n" _ASM_EXTABLE(1b, %l[fail]) :: [addr] "r" (addr), [val] "r" (val) :: fail); diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index e7c7379d6ac7..87a7b917d30e 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -229,10 +229,6 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT) #define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT) -#define SVM_VM_CR_VALID_MASK 0x001fULL -#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL -#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL - #define SVM_NESTED_CTL_NP_ENABLE BIT(0) #define SVM_NESTED_CTL_SEV_ENABLE BIT(1) #define SVM_NESTED_CTL_SEV_ES_ENABLE BIT(2) @@ -268,6 +264,7 @@ enum avic_ipi_failure_cause { AVIC_IPI_FAILURE_TARGET_NOT_RUNNING, AVIC_IPI_FAILURE_INVALID_TARGET, AVIC_IPI_FAILURE_INVALID_BACKING_PAGE, + AVIC_IPI_FAILURE_INVALID_IPI_VECTOR, }; #define AVIC_PHYSICAL_MAX_INDEX_MASK GENMASK_ULL(8, 0) @@ -288,6 +285,7 @@ static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_ #define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) +#define SVM_SEV_FEAT_DEBUG_SWAP BIT(5) struct vmcb_seg { u16 selector; @@ -345,7 +343,7 @@ struct vmcb_save_area { u64 last_excp_from; u64 last_excp_to; u8 reserved_0x298[72]; - u32 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */ + u64 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */ } __packed; /* Save area definition for SEV-ES and SEV-SNP guests */ @@ -512,7 +510,7 @@ struct ghcb { } __packed; -#define EXPECTED_VMCB_SAVE_AREA_SIZE 740 +#define EXPECTED_VMCB_SAVE_AREA_SIZE 744 #define EXPECTED_GHCB_SAVE_AREA_SIZE 1032 #define EXPECTED_SEV_ES_SAVE_AREA_SIZE 1648 #define EXPECTED_VMCB_CONTROL_AREA_SIZE 1024 @@ -570,8 +568,6 @@ struct vmcb { #define SVM_CPUID_FUNC 0x8000000a -#define SVM_VM_CR_SVM_DISABLE 4 - #define SVM_SELECTOR_S_SHIFT 4 #define SVM_SELECTOR_DPL_SHIFT 5 #define SVM_SELECTOR_P_SHIFT 7 diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 4fb36fba4b5a..f44e2f9ab65d 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -126,12 +126,12 @@ static inline int syscall_get_arch(struct task_struct *task) ? 
AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; } -void do_syscall_64(struct pt_regs *regs, int nr); +bool do_syscall_64(struct pt_regs *regs, int nr); #endif /* CONFIG_X86_32 */ void do_int80_syscall_32(struct pt_regs *regs); -long do_fast_syscall_32(struct pt_regs *regs); -long do_SYSENTER_32(struct pt_regs *regs); +bool do_fast_syscall_32(struct pt_regs *regs); +bool do_SYSENTER_32(struct pt_regs *regs); #endif /* _ASM_X86_SYSCALL_H */ diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h index fd2669b1cb2d..7e88705e907f 100644 --- a/arch/x86/include/asm/syscall_wrapper.h +++ b/arch/x86/include/asm/syscall_wrapper.h @@ -58,12 +58,29 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); ,,regs->di,,regs->si,,regs->dx \ ,,regs->r10,,regs->r8,,regs->r9) \ + +/* SYSCALL_PT_ARGS is Adapted from s390x */ +#define SYSCALL_PT_ARG6(m, t1, t2, t3, t4, t5, t6) \ + SYSCALL_PT_ARG5(m, t1, t2, t3, t4, t5), m(t6, (regs->bp)) +#define SYSCALL_PT_ARG5(m, t1, t2, t3, t4, t5) \ + SYSCALL_PT_ARG4(m, t1, t2, t3, t4), m(t5, (regs->di)) +#define SYSCALL_PT_ARG4(m, t1, t2, t3, t4) \ + SYSCALL_PT_ARG3(m, t1, t2, t3), m(t4, (regs->si)) +#define SYSCALL_PT_ARG3(m, t1, t2, t3) \ + SYSCALL_PT_ARG2(m, t1, t2), m(t3, (regs->dx)) +#define SYSCALL_PT_ARG2(m, t1, t2) \ + SYSCALL_PT_ARG1(m, t1), m(t2, (regs->cx)) +#define SYSCALL_PT_ARG1(m, t1) m(t1, (regs->bx)) +#define SYSCALL_PT_ARGS(x, ...) SYSCALL_PT_ARG##x(__VA_ARGS__) + +#define __SC_COMPAT_CAST(t, a) \ + (__typeof(__builtin_choose_expr(__TYPE_IS_L(t), 0, 0U))) \ + (unsigned int)a + /* Mapping of registers to parameters for syscalls on i386 */ #define SC_IA32_REGS_TO_ARGS(x, ...) \ - __MAP(x,__SC_ARGS \ - ,,(unsigned int)regs->bx,,(unsigned int)regs->cx \ - ,,(unsigned int)regs->dx,,(unsigned int)regs->si \ - ,,(unsigned int)regs->di,,(unsigned int)regs->bp) + SYSCALL_PT_ARGS(x, __SC_COMPAT_CAST, \ + __MAP(x, __SC_TYPE, __VA_ARGS__)) \ #define __SYS_STUB0(abi, name) \ long __##abi##_##name(const struct pt_regs *regs); \ @@ -86,9 +103,6 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); return sys_ni_syscall(); \ } -#define __SYS_NI(abi, name) \ - SYSCALL_ALIAS(__##abi##_##name, sys_ni_posix_timers); - #ifdef CONFIG_X86_64 #define __X64_SYS_STUB0(name) \ __SYS_STUB0(x64, sys_##name) @@ -100,13 +114,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); #define __X64_COND_SYSCALL(name) \ __COND_SYSCALL(x64, sys_##name) -#define __X64_SYS_NI(name) \ - __SYS_NI(x64, sys_##name) #else /* CONFIG_X86_64 */ #define __X64_SYS_STUB0(name) #define __X64_SYS_STUBx(x, name, ...) #define __X64_COND_SYSCALL(name) -#define __X64_SYS_NI(name) #endif /* CONFIG_X86_64 */ #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) @@ -120,13 +131,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); #define __IA32_COND_SYSCALL(name) \ __COND_SYSCALL(ia32, sys_##name) -#define __IA32_SYS_NI(name) \ - __SYS_NI(ia32, sys_##name) #else /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ #define __IA32_SYS_STUB0(name) #define __IA32_SYS_STUBx(x, name, ...) #define __IA32_COND_SYSCALL(name) -#define __IA32_SYS_NI(name) #endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ #ifdef CONFIG_IA32_EMULATION @@ -135,8 +143,7 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); * additional wrappers (aptly named __ia32_sys_xyzzy) which decode the * ia32 regs in the proper order for shared or "common" syscalls. 
As some * syscalls may not be implemented, we need to expand COND_SYSCALL in - * kernel/sys_ni.c and SYS_NI in kernel/time/posix-stubs.c to cover this - * case as well. + * kernel/sys_ni.c to cover this case as well. */ #define __IA32_COMPAT_SYS_STUB0(name) \ __SYS_STUB0(ia32, compat_sys_##name) @@ -148,14 +155,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); #define __IA32_COMPAT_COND_SYSCALL(name) \ __COND_SYSCALL(ia32, compat_sys_##name) -#define __IA32_COMPAT_SYS_NI(name) \ - __SYS_NI(ia32, compat_sys_##name) - #else /* CONFIG_IA32_EMULATION */ #define __IA32_COMPAT_SYS_STUB0(name) #define __IA32_COMPAT_SYS_STUBx(x, name, ...) #define __IA32_COMPAT_COND_SYSCALL(name) -#define __IA32_COMPAT_SYS_NI(name) #endif /* CONFIG_IA32_EMULATION */ @@ -175,13 +178,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); #define __X32_COMPAT_COND_SYSCALL(name) \ __COND_SYSCALL(x64, compat_sys_##name) -#define __X32_COMPAT_SYS_NI(name) \ - __SYS_NI(x64, compat_sys_##name) #else /* CONFIG_X86_X32_ABI */ #define __X32_COMPAT_SYS_STUB0(name) #define __X32_COMPAT_SYS_STUBx(x, name, ...) #define __X32_COMPAT_COND_SYSCALL(name) -#define __X32_COMPAT_SYS_NI(name) #endif /* CONFIG_X86_X32_ABI */ @@ -212,17 +212,12 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); /* * As some compat syscalls may not be implemented, we need to expand - * COND_SYSCALL_COMPAT in kernel/sys_ni.c and COMPAT_SYS_NI in - * kernel/time/posix-stubs.c to cover this case as well. + * COND_SYSCALL_COMPAT in kernel/sys_ni.c to cover this case as well. */ #define COND_SYSCALL_COMPAT(name) \ __IA32_COMPAT_COND_SYSCALL(name) \ __X32_COMPAT_COND_SYSCALL(name) -#define COMPAT_SYS_NI(name) \ - __IA32_COMPAT_SYS_NI(name) \ - __X32_COMPAT_SYS_NI(name) - #endif /* CONFIG_COMPAT */ #define __SYSCALL_DEFINEx(x, name, ...) \ @@ -243,8 +238,8 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); * As the generic SYSCALL_DEFINE0() macro does not decode any parameters for * obvious reasons, and passing struct pt_regs *regs to it in %rdi does not * hurt, we only need to re-define it here to keep the naming congruent to - * SYSCALL_DEFINEx() -- which is essential for the COND_SYSCALL() and SYS_NI() - * macros to work correctly. + * SYSCALL_DEFINEx() -- which is essential for the COND_SYSCALL() macro + * to work correctly. */ #define SYSCALL_DEFINE0(sname) \ SYSCALL_METADATA(_##sname, 0); \ @@ -257,10 +252,6 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); __X64_COND_SYSCALL(name) \ __IA32_COND_SYSCALL(name) -#define SYS_NI(name) \ - __X64_SYS_NI(name) \ - __IA32_SYS_NI(name) - /* * For VSYSCALLS, we need to declare these three syscalls with the new diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 603e6d1e9d4a..eba178996d84 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -8,6 +8,7 @@ #include <asm/errno.h> #include <asm/ptrace.h> +#include <asm/trapnr.h> #include <asm/shared/tdx.h> /* @@ -20,8 +21,19 @@ #define TDX_SW_ERROR (TDX_ERROR | GENMASK_ULL(47, 40)) #define TDX_SEAMCALL_VMFAILINVALID (TDX_SW_ERROR | _UL(0xFFFF0000)) +#define TDX_SEAMCALL_GP (TDX_SW_ERROR | X86_TRAP_GP) +#define TDX_SEAMCALL_UD (TDX_SW_ERROR | X86_TRAP_UD) + +/* + * TDX module SEAMCALL leaf function error codes + */ +#define TDX_SUCCESS 0ULL +#define TDX_RND_NO_ENTROPY 0x8000020300000000ULL + #ifndef __ASSEMBLY__ +#include <uapi/asm/mce.h> + /* * Used by the #VE exception handler to gather the #VE exception * info from the TDX module. 
This is a software only structure @@ -52,6 +64,8 @@ bool tdx_early_handle_ve(struct pt_regs *regs); int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport); +u64 tdx_hcall_get_quote(u8 *buf, size_t size); + #else static inline void tdx_early_init(void) { }; @@ -72,5 +86,42 @@ static inline long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, return -ENODEV; } #endif /* CONFIG_INTEL_TDX_GUEST && CONFIG_KVM_GUEST */ + +#ifdef CONFIG_INTEL_TDX_HOST +u64 __seamcall(u64 fn, struct tdx_module_args *args); +u64 __seamcall_ret(u64 fn, struct tdx_module_args *args); +u64 __seamcall_saved_ret(u64 fn, struct tdx_module_args *args); +void tdx_init(void); + +#include <asm/archrandom.h> + +typedef u64 (*sc_func_t)(u64 fn, struct tdx_module_args *args); + +static inline u64 sc_retry(sc_func_t func, u64 fn, + struct tdx_module_args *args) +{ + int retry = RDRAND_RETRY_LOOPS; + u64 ret; + + do { + ret = func(fn, args); + } while (ret == TDX_RND_NO_ENTROPY && --retry); + + return ret; +} + +#define seamcall(_fn, _args) sc_retry(__seamcall, (_fn), (_args)) +#define seamcall_ret(_fn, _args) sc_retry(__seamcall_ret, (_fn), (_args)) +#define seamcall_saved_ret(_fn, _args) sc_retry(__seamcall_saved_ret, (_fn), (_args)) +int tdx_cpu_enable(void); +int tdx_enable(void); +const char *tdx_dump_mce_info(struct mce *m); +#else +static inline void tdx_init(void) { } +static inline int tdx_cpu_enable(void) { return -ENODEV; } +static inline int tdx_enable(void) { return -ENODEV; } +static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; } +#endif /* CONFIG_INTEL_TDX_HOST */ + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_TDX_H */ diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index fb338f00d383..345aafbc1964 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -6,20 +6,6 @@ #include <linux/stddef.h> #include <asm/ptrace.h> -struct paravirt_patch_site; -#ifdef CONFIG_PARAVIRT -void apply_paravirt(struct paravirt_patch_site *start, - struct paravirt_patch_site *end); -#else -static inline void apply_paravirt(struct paravirt_patch_site *start, - struct paravirt_patch_site *end) -{} -#define __parainstructions NULL -#define __parainstructions_end NULL -#endif - -extern void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len); - /* * Currently, the max observed size in the kernel code is * JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5. @@ -29,6 +15,8 @@ extern void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_ extern void text_poke_early(void *addr, const void *opcode, size_t len); +extern void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len); + /* * Clear and restore the kernel write-protection flag on the local CPU. * Allows the kernel to edit read-only pages. 
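The sc_retry() helper and the seamcall*() wrappers added to <asm/tdx.h> in the hunk above implement a bounded retry loop: a TDX_RND_NO_ENTROPY return from the TDX module is treated as transient and the SEAMCALL is reissued until it succeeds or the retry budget runs out. Below is a minimal, self-contained userspace sketch of the same pattern; the fake_seamcall() stub, the leaf number and the RETRY_BUDGET constant are made up for illustration and are not part of the kernel interface.

#include <stdint.h>
#include <stdio.h>

/* Illustrative error codes mirroring the ones defined in the hunk above. */
#define TDX_SUCCESS          0ULL
#define TDX_RND_NO_ENTROPY   0x8000020300000000ULL

#define RETRY_BUDGET 10      /* stands in for RDRAND_RETRY_LOOPS */

struct tdx_module_args { uint64_t rcx, rdx, r8, r9; };

typedef uint64_t (*sc_func_t)(uint64_t fn, struct tdx_module_args *args);

/* Hypothetical stub: pretend the module is short on entropy twice. */
static uint64_t fake_seamcall(uint64_t fn, struct tdx_module_args *args)
{
	static int calls;
	(void)fn; (void)args;
	return (++calls < 3) ? TDX_RND_NO_ENTROPY : TDX_SUCCESS;
}

/* Same shape as sc_retry(): retry only while the error is "no entropy". */
static uint64_t sc_retry(sc_func_t func, uint64_t fn,
			 struct tdx_module_args *args)
{
	int retry = RETRY_BUDGET;
	uint64_t ret;

	do {
		ret = func(fn, args);
	} while (ret == TDX_RND_NO_ENTROPY && --retry);

	return ret;
}

int main(void)
{
	struct tdx_module_args args = { 0 };

	/* Leaf number 0 is purely illustrative. */
	uint64_t ret = sc_retry(fake_seamcall, 0, &args);

	printf("seamcall result: %#llx\n", (unsigned long long)ret);
	return ret == TDX_SUCCESS ? 0 : 1;
}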
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 3235ba1e5b06..5f87f6b9b09e 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -105,17 +105,17 @@ static inline void setup_node_to_cpumask_map(void) { } extern const struct cpumask *cpu_coregroup_mask(int cpu); extern const struct cpumask *cpu_clustergroup_mask(int cpu); -#define topology_logical_package_id(cpu) (cpu_data(cpu).logical_proc_id) -#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id) -#define topology_logical_die_id(cpu) (cpu_data(cpu).logical_die_id) -#define topology_die_id(cpu) (cpu_data(cpu).cpu_die_id) -#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) +#define topology_logical_package_id(cpu) (cpu_data(cpu).topo.logical_pkg_id) +#define topology_physical_package_id(cpu) (cpu_data(cpu).topo.pkg_id) +#define topology_logical_die_id(cpu) (cpu_data(cpu).topo.logical_die_id) +#define topology_die_id(cpu) (cpu_data(cpu).topo.die_id) +#define topology_core_id(cpu) (cpu_data(cpu).topo.core_id) #define topology_ppin(cpu) (cpu_data(cpu).ppin) extern unsigned int __max_die_per_package; #ifdef CONFIG_SMP -#define topology_cluster_id(cpu) (per_cpu(cpu_l2c_id, cpu)) +#define topology_cluster_id(cpu) (cpu_data(cpu).topo.l2c_id) #define topology_die_cpumask(cpu) (per_cpu(cpu_die_map, cpu)) #define topology_cluster_cpumask(cpu) (cpu_clustergroup_mask(cpu)) #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index b1c9cea6ba88..1f1deaecd364 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -14,7 +14,6 @@ asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs); asmlinkage __visible notrace struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs); -void __init trap_init(void); asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *eregs); #endif diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 8bae40a66282..237dc8cdd12b 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -133,7 +133,7 @@ extern int __get_user_bad(void); #ifdef CONFIG_X86_32 #define __put_user_goto_u64(x, addr, label) \ - asm_volatile_goto("\n" \ + asm goto("\n" \ "1: movl %%eax,0(%1)\n" \ "2: movl %%edx,4(%1)\n" \ _ASM_EXTABLE_UA(1b, %l2) \ @@ -295,7 +295,7 @@ do { \ } while (0) #define __get_user_asm(x, addr, itype, ltype, label) \ - asm_volatile_goto("\n" \ + asm_goto_output("\n" \ "1: mov"itype" %[umem],%[output]\n" \ _ASM_EXTABLE_UA(1b, %l2) \ : [output] ltype(x) \ @@ -375,7 +375,7 @@ do { \ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ - asm_volatile_goto("\n" \ + asm_goto_output("\n" \ "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\ _ASM_EXTABLE_UA(1b, %l[label]) \ : CC_OUT(z) (success), \ @@ -394,7 +394,7 @@ do { \ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ - asm_volatile_goto("\n" \ + asm_goto_output("\n" \ "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n" \ _ASM_EXTABLE_UA(1b, %l[label]) \ : CC_OUT(z) (success), \ @@ -477,7 +477,7 @@ struct __large_struct { unsigned long buf[100]; }; * aliasing issues. 
*/ #define __put_user_goto(x, addr, itype, ltype, label) \ - asm_volatile_goto("\n" \ + asm goto("\n" \ "1: mov"itype" %0,%1\n" \ _ASM_EXTABLE_UA(1b, %l2) \ : : ltype(x), "m" (__m(addr)) \ @@ -496,7 +496,7 @@ copy_mc_to_kernel(void *to, const void *from, unsigned len); #define copy_mc_to_kernel copy_mc_to_kernel unsigned long __must_check -copy_mc_to_user(void *to, const void *from, unsigned len); +copy_mc_to_user(void __user *to, const void *from, unsigned len); #endif /* diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 5fa76c2ced51..ea877fd83114 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -653,7 +653,7 @@ static inline int uv_blade_to_node(int blade) return uv_socket_to_node(blade); } -/* Blade number of current cpu. Numnbered 0 .. <#blades -1> */ +/* Blade number of current cpu. Numbered 0 .. <#blades -1> */ static inline int uv_numa_blade_id(void) { return uv_hub_info->numa_blade_id; diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h index c81858d903dc..8e048ca980df 100644 --- a/arch/x86/include/asm/vdso/gettimeofday.h +++ b/arch/x86/include/asm/vdso/gettimeofday.h @@ -321,7 +321,7 @@ static __always_inline u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult) { /* - * Due to the MSB/Sign-bit being used as invald marker (see + * Due to the MSB/Sign-bit being used as invalid marker (see * arch_vdso_cycles_valid() above), the effective mask is S64_MAX. */ u64 delta = (cycles - last) & S64_MAX; @@ -337,8 +337,6 @@ u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult) } #define vdso_calc_delta vdso_calc_delta -int __vdso_clock_gettime64(clockid_t clock, struct __kernel_timespec *ts); - #endif /* !__ASSEMBLY__ */ #endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h deleted file mode 100644 index 3b12e6b99412..000000000000 --- a/arch/x86/include/asm/virtext.h +++ /dev/null @@ -1,154 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* CPU virtualization extensions handling - * - * This should carry the code for handling CPU virtualization extensions - * that needs to live in the kernel core. - * - * Author: Eduardo Habkost <ehabkost@redhat.com> - * - * Copyright (C) 2008, Red Hat Inc. - * - * Contains code from KVM, Copyright (C) 2006 Qumranet, Inc. - */ -#ifndef _ASM_X86_VIRTEX_H -#define _ASM_X86_VIRTEX_H - -#include <asm/processor.h> - -#include <asm/vmx.h> -#include <asm/svm.h> -#include <asm/tlbflush.h> - -/* - * VMX functions: - */ - -static inline int cpu_has_vmx(void) -{ - unsigned long ecx = cpuid_ecx(1); - return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */ -} - - -/** - * cpu_vmxoff() - Disable VMX on the current CPU - * - * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) - * - * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to - * atomically track post-VMXON state, e.g. this may be called in NMI context. - * Eat all faults as all other faults on VMXOFF faults are mode related, i.e. - * faults are guaranteed to be due to the !post-VMXON check unless the CPU is - * magically in RM, VM86, compat mode, or at CPL>0. 
- */ -static inline int cpu_vmxoff(void) -{ - asm_volatile_goto("1: vmxoff\n\t" - _ASM_EXTABLE(1b, %l[fault]) - ::: "cc", "memory" : fault); - - cr4_clear_bits(X86_CR4_VMXE); - return 0; - -fault: - cr4_clear_bits(X86_CR4_VMXE); - return -EIO; -} - -static inline int cpu_vmx_enabled(void) -{ - return __read_cr4() & X86_CR4_VMXE; -} - -/** Disable VMX if it is enabled on the current CPU - * - * You shouldn't call this if cpu_has_vmx() returns 0. - */ -static inline void __cpu_emergency_vmxoff(void) -{ - if (cpu_vmx_enabled()) - cpu_vmxoff(); -} - -/** Disable VMX if it is supported and enabled on the current CPU - */ -static inline void cpu_emergency_vmxoff(void) -{ - if (cpu_has_vmx()) - __cpu_emergency_vmxoff(); -} - - - - -/* - * SVM functions: - */ - -/** Check if the CPU has SVM support - * - * You can use the 'msg' arg to get a message describing the problem, - * if the function returns zero. Simply pass NULL if you are not interested - * on the messages; gcc should take care of not generating code for - * the messages on this case. - */ -static inline int cpu_has_svm(const char **msg) -{ - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && - boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) { - if (msg) - *msg = "not amd or hygon"; - return 0; - } - - if (boot_cpu_data.extended_cpuid_level < SVM_CPUID_FUNC) { - if (msg) - *msg = "can't execute cpuid_8000000a"; - return 0; - } - - if (!boot_cpu_has(X86_FEATURE_SVM)) { - if (msg) - *msg = "svm not available"; - return 0; - } - return 1; -} - - -/** Disable SVM on the current CPU - * - * You should call this only if cpu_has_svm() returned true. - */ -static inline void cpu_svm_disable(void) -{ - uint64_t efer; - - wrmsrl(MSR_VM_HSAVE_PA, 0); - rdmsrl(MSR_EFER, efer); - if (efer & EFER_SVME) { - /* - * Force GIF=1 prior to disabling SVM to ensure INIT and NMI - * aren't blocked, e.g. if a fatal error occurred between CLGI - * and STGI. Note, STGI may #UD if SVM is disabled from NMI - * context between reading EFER and executing STGI. In that - * case, GIF must already be set, otherwise the NMI would have - * been blocked, so just eat the fault. 
- */ - asm_volatile_goto("1: stgi\n\t" - _ASM_EXTABLE(1b, %l[fault]) - ::: "memory" : fault); -fault: - wrmsrl(MSR_EFER, efer & ~EFER_SVME); - } -} - -/** Makes sure SVM is disabled, if it is supported on the CPU - */ -static inline void cpu_emergency_svm_disable(void) -{ - if (cpu_has_svm(NULL)) - cpu_svm_disable(); -} - -#endif /* _ASM_X86_VIRTEX_H */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 0d02c4aafa6f..0e73616b82f3 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -71,7 +71,7 @@ #define SECONDARY_EXEC_RDSEED_EXITING VMCS_CONTROL_BIT(RDSEED_EXITING) #define SECONDARY_EXEC_ENABLE_PML VMCS_CONTROL_BIT(PAGE_MOD_LOGGING) #define SECONDARY_EXEC_PT_CONCEAL_VMX VMCS_CONTROL_BIT(PT_CONCEAL_VMX) -#define SECONDARY_EXEC_XSAVES VMCS_CONTROL_BIT(XSAVES) +#define SECONDARY_EXEC_ENABLE_XSAVES VMCS_CONTROL_BIT(XSAVES) #define SECONDARY_EXEC_MODE_BASED_EPT_EXEC VMCS_CONTROL_BIT(MODE_BASED_EPT_EXEC) #define SECONDARY_EXEC_PT_USE_GPA VMCS_CONTROL_BIT(PT_USE_GPA) #define SECONDARY_EXEC_TSC_SCALING VMCS_CONTROL_BIT(TSC_SCALING) diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 5240d88db52a..c878616a18b8 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -177,7 +177,7 @@ struct x86_init_ops { * struct x86_cpuinit_ops - platform specific cpu hotplug setups * @setup_percpu_clockev: set up the per cpu clock event device * @early_percpu_clock_init: early init of the per cpu clock event device - * @fixup_cpu_id: fixup function for cpuinfo_x86::phys_proc_id + * @fixup_cpu_id: fixup function for cpuinfo_x86::topo.pkg_id * @parallel_bringup: Parallel bringup control */ struct x86_cpuinit_ops { diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 5fc35f889cd1..a9088250770f 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -36,6 +36,7 @@ extern struct shared_info *HYPERVISOR_shared_info; extern struct start_info *xen_start_info; +#include <asm/bug.h> #include <asm/processor.h> #define XEN_SIGNATURE "XenVMMXenVMM" @@ -63,4 +64,49 @@ void __init xen_pvh_init(struct boot_params *boot_params); void __init mem_map_via_hcall(struct boot_params *boot_params_p); #endif +/* Lazy mode for batching updates / context switch */ +enum xen_lazy_mode { + XEN_LAZY_NONE, + XEN_LAZY_MMU, + XEN_LAZY_CPU, +}; + +DECLARE_PER_CPU(enum xen_lazy_mode, xen_lazy_mode); +DECLARE_PER_CPU(unsigned int, xen_lazy_nesting); + +static inline void enter_lazy(enum xen_lazy_mode mode) +{ + enum xen_lazy_mode old_mode = this_cpu_read(xen_lazy_mode); + + if (mode == old_mode) { + this_cpu_inc(xen_lazy_nesting); + return; + } + + BUG_ON(old_mode != XEN_LAZY_NONE); + + this_cpu_write(xen_lazy_mode, mode); +} + +static inline void leave_lazy(enum xen_lazy_mode mode) +{ + BUG_ON(this_cpu_read(xen_lazy_mode) != mode); + + if (this_cpu_read(xen_lazy_nesting) == 0) + this_cpu_write(xen_lazy_mode, XEN_LAZY_NONE); + else + this_cpu_dec(xen_lazy_nesting); +} + +enum xen_lazy_mode xen_get_lazy_mode(void); + +#if defined(CONFIG_XEN_DOM0) && defined(CONFIG_ACPI) +void xen_sanitize_proc_cap_bits(uint32_t *buf); +#else +static inline void xen_sanitize_proc_cap_bits(uint32_t *buf) +{ + BUG(); +} +#endif + #endif /* _ASM_X86_XEN_HYPERVISOR_H */ diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h index c599ec269a25..c10f279aae93 100644 --- a/arch/x86/include/asm/xen/interface_64.h +++ 
b/arch/x86/include/asm/xen/interface_64.h @@ -61,7 +61,7 @@ * RING1 -> RING3 kernel mode. * RING2 -> RING3 kernel mode. * RING3 -> RING3 user mode. - * However RING0 indicates that the guest kernel should return to iteself + * However RING0 indicates that the guest kernel should return to itself * directly with * orb $3,1*8(%rsp) * iretq
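The lazy-mode helpers added to <asm/xen/hypervisor.h> above keep a per-CPU mode plus a nesting counter: re-entering the currently active mode only bumps the counter, switching to a different mode while one is active is treated as a bug, and leave_lazy() unwinds one level at a time. The single-threaded sketch below mimics that discipline with plain globals in place of the per-CPU variables and assert() in place of BUG_ON(); it illustrates the nesting rules only and is not kernel code.

#include <assert.h>
#include <stdio.h>

enum xen_lazy_mode { XEN_LAZY_NONE, XEN_LAZY_MMU, XEN_LAZY_CPU };

/* Plain globals stand in for the per-CPU xen_lazy_mode/xen_lazy_nesting. */
static enum xen_lazy_mode lazy_mode = XEN_LAZY_NONE;
static unsigned int lazy_nesting;

static void enter_lazy(enum xen_lazy_mode mode)
{
	if (mode == lazy_mode) {            /* same mode: just nest deeper */
		lazy_nesting++;
		return;
	}
	assert(lazy_mode == XEN_LAZY_NONE); /* mirrors the BUG_ON() */
	lazy_mode = mode;
}

static void leave_lazy(enum xen_lazy_mode mode)
{
	assert(lazy_mode == mode);          /* mirrors the BUG_ON() */
	if (lazy_nesting == 0)
		lazy_mode = XEN_LAZY_NONE;
	else
		lazy_nesting--;
}

int main(void)
{
	enter_lazy(XEN_LAZY_MMU);   /* outermost MMU batch */
	enter_lazy(XEN_LAZY_MMU);   /* nested: counter goes to 1 */
	leave_lazy(XEN_LAZY_MMU);   /* back to depth 0, mode still MMU */
	leave_lazy(XEN_LAZY_MMU);   /* fully left: mode is NONE again */

	printf("mode=%d nesting=%u\n", lazy_mode, lazy_nesting);
	return 0;
}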