Diffstat (limited to 'arch/x86/kernel')
36 files changed, 673 insertions, 324 deletions
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 8fd39ff74a49..c826cddae157 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -3,6 +3,7 @@ #include <linux/module.h> #include <linux/sched.h> +#include <linux/perf_event.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/stringify.h> @@ -15,6 +16,7 @@ #include <linux/kprobes.h> #include <linux/mmu_context.h> #include <linux/bsearch.h> +#include <linux/sync_core.h> #include <asm/text-patching.h> #include <asm/alternative.h> #include <asm/sections.h> @@ -53,7 +55,7 @@ __setup("noreplace-smp", setup_noreplace_smp); #define DPRINTK(fmt, args...) \ do { \ if (debug_alternative) \ - printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \ + printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \ } while (0) #define DUMP_BYTES(buf, len, fmt, args...) \ @@ -64,7 +66,7 @@ do { \ if (!(len)) \ break; \ \ - printk(KERN_DEBUG fmt, ##args); \ + printk(KERN_DEBUG pr_fmt(fmt), ##args); \ for (j = 0; j < (len) - 1; j++) \ printk(KERN_CONT "%02hhx ", buf[j]); \ printk(KERN_CONT "%02hhx\n", buf[j]); \ @@ -1001,6 +1003,7 @@ struct text_poke_loc { s32 rel32; u8 opcode; const u8 text[POKE_MAX_OPCODE_SIZE]; + u8 old; }; struct bp_patching_desc { @@ -1044,7 +1047,7 @@ static __always_inline int patch_cmp(const void *key, const void *elt) return 0; } -int noinstr poke_int3_handler(struct pt_regs *regs) +noinstr int poke_int3_handler(struct pt_regs *regs) { struct bp_patching_desc *desc; struct text_poke_loc *tp; @@ -1168,8 +1171,10 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries /* * First step: add a int3 trap to the address that will be patched. */ - for (i = 0; i < nr_entries; i++) + for (i = 0; i < nr_entries; i++) { + tp[i].old = *(u8 *)text_poke_addr(&tp[i]); text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE); + } text_poke_sync(); @@ -1177,14 +1182,45 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries * Second step: update all but the first byte of the patched range. */ for (do_sync = 0, i = 0; i < nr_entries; i++) { + u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, }; int len = text_opcode_size(tp[i].opcode); if (len - INT3_INSN_SIZE > 0) { + memcpy(old + INT3_INSN_SIZE, + text_poke_addr(&tp[i]) + INT3_INSN_SIZE, + len - INT3_INSN_SIZE); text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE, (const char *)tp[i].text + INT3_INSN_SIZE, len - INT3_INSN_SIZE); do_sync++; } + + /* + * Emit a perf event to record the text poke, primarily to + * support Intel PT decoding which must walk the executable code + * to reconstruct the trace. The flow up to here is: + * - write INT3 byte + * - IPI-SYNC + * - write instruction tail + * At this point the actual control flow will be through the + * INT3 and handler and not hit the old or new instruction. + * Intel PT outputs FUP/TIP packets for the INT3, so the flow + * can still be decoded. Subsequently: + * - emit RECORD_TEXT_POKE with the new instruction + * - IPI-SYNC + * - write first byte + * - IPI-SYNC + * So before the text poke event timestamp, the decoder will see + * either the old instruction flow or FUP/TIP of INT3. After the + * text poke event timestamp, the decoder will see either the + * new instruction flow or FUP/TIP of INT3. Thus decoders can + * use the timestamp as the point at which to modify the + * executable code. + * The old instruction is recorded so that the event can be + * processed forwards or backwards. 
+ */ + perf_event_text_poke(text_poke_addr(&tp[i]), old, len, + tp[i].text, len); } if (do_sync) { diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 17cb5b933dcf..e89031e9c847 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -6,7 +6,7 @@ * This allows to use PCI devices that only support 32bit addresses on systems * with more than 4GB. * - * See Documentation/DMA-API-HOWTO.txt for the interface specification. + * See Documentation/core-api/dma-api-howto.rst for the interface specification. * * Copyright 2002 Andi Kleen, SuSE Labs. */ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 81ffcfbfaef2..21325a4a78b9 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2335,8 +2335,13 @@ static int mp_irqdomain_create(int ioapic) static void ioapic_destroy_irqdomain(int idx) { + struct ioapic_domain_cfg *cfg = &ioapics[idx].irqdomain_cfg; + struct fwnode_handle *fn = ioapics[idx].irqdomain->fwnode; + if (ioapics[idx].irqdomain) { irq_domain_remove(ioapics[idx].irqdomain); + if (!cfg->dev) + irq_domain_free_fwnode(fn); ioapics[idx].irqdomain = NULL; } } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 7649da2478d8..dae32d948bf2 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -560,6 +560,10 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, * as that can corrupt the affinity move state. */ irqd_set_handle_enforce_irqctx(irqd); + + /* Don't invoke affinity setter on deactivated interrupts */ + irqd_set_affinity_on_activate(irqd); + /* * Legacy vectors are already assigned when the IOAPIC * takes them over. They stay on the same vector. This is diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 69e70ed0f5e6..0b6eea3f54e6 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -24,8 +24,6 @@ #include <asm/uv/uv.h> #include <asm/apic.h> -static DEFINE_PER_CPU(int, x2apic_extra_bits); - static enum uv_system_type uv_system_type; static int uv_hubbed_system; static int uv_hubless_system; @@ -40,7 +38,7 @@ static u8 oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1]; static struct { unsigned int apicid_shift; unsigned int apicid_mask; - unsigned int socketid_shift; /* aka pnode_shift for UV1/2/3 */ + unsigned int socketid_shift; /* aka pnode_shift for UV2/3 */ unsigned int pnode_mask; unsigned int gpa_shift; unsigned int gnode_shift; @@ -48,8 +46,6 @@ static struct { static int uv_min_hub_revision_id; -unsigned int uv_apicid_hibits; - static struct apic apic_x2apic_uv_x; static struct uv_hub_info_s uv_hub_info_node0; @@ -139,12 +135,8 @@ static void __init uv_tsc_check_sync(void) /* Accommodate different UV arch BIOSes */ mmr = uv_early_read_mmr(UVH_TSC_SYNC_MMR); mmr_shift = - is_uv1_hub() ? 0 : is_uv2_hub() ? UVH_TSC_SYNC_SHIFT_UV2K : UVH_TSC_SYNC_SHIFT; - if (mmr_shift) - sync_state = (mmr >> mmr_shift) & UVH_TSC_SYNC_MASK; - else - sync_state = 0; + sync_state = (mmr >> mmr_shift) & UVH_TSC_SYNC_MASK; switch (sync_state) { case UVH_TSC_SYNC_VALID: @@ -223,21 +215,6 @@ static void __init early_get_apic_socketid_shift(void) pr_info("UV: socketid_shift:%d pnode_mask:0x%x\n", uv_cpuid.socketid_shift, uv_cpuid.pnode_mask); } -/* - * Add an extra bit as dictated by bios to the destination apicid of - * interrupts potentially passing through the UV HUB. 
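For reference, the ordering contract documented in the comment above condenses to the following single-entry sketch. This is not the kernel's implementation (that is text_poke_bp_batch() itself); it only restates the sequence using the helpers visible in this hunk:

	/* Sketch: patch one location while recording it for Intel PT decoders. */
	static void text_poke_bp_sketch(struct text_poke_loc *tp)
	{
		u8 int3 = INT3_INSN_OPCODE;
		u8 old[POKE_MAX_OPCODE_SIZE];
		int len = text_opcode_size(tp->opcode);

		memcpy(old, text_poke_addr(tp), len);	/* snapshot for RECORD_TEXT_POKE */

		/* 1) write INT3; all CPUs now trap into poke_int3_handler() */
		text_poke(text_poke_addr(tp), &int3, INT3_INSN_SIZE);
		text_poke_sync();

		/* 2) write the instruction tail behind the INT3 */
		if (len > INT3_INSN_SIZE)
			text_poke(text_poke_addr(tp) + INT3_INSN_SIZE,
				  tp->text + INT3_INSN_SIZE, len - INT3_INSN_SIZE);

		/* decoders flip from the old to the new flow at this timestamp */
		perf_event_text_poke(text_poke_addr(tp), old, len, tp->text, len);
		text_poke_sync();

		/* 3) write the first byte, removing the INT3 */
		text_poke(text_poke_addr(tp), tp->text, INT3_INSN_SIZE);
		text_poke_sync();
	}
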
This prevents - * a deadlock between interrupts and IO port operations. - */ -static void __init uv_set_apicid_hibit(void) -{ - union uv1h_lb_target_physical_apic_id_mask_u apicid_mask; - - if (is_uv1_hub()) { - apicid_mask.v = uv_early_read_mmr(UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK); - uv_apicid_hibits = apicid_mask.s1.bit_enables & UV_APICID_HIBIT_MASK; - } -} - static void __init uv_stringify(int len, char *to, char *from) { /* Relies on 'to' being NULL chars so result will be NULL terminated */ @@ -280,36 +257,25 @@ static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id) /* * Determine UV arch type. - * SGI: UV100/1000 * SGI2: UV2000/3000 * SGI3: UV300 (truncated to 4 chars because of different varieties) * SGI4: UV400 (truncated to 4 chars because of different varieties) */ - uv_hub_info->hub_revision = - !strncmp(oem_id, "SGI4", 4) ? UV4_HUB_REVISION_BASE : - !strncmp(oem_id, "SGI3", 4) ? UV3_HUB_REVISION_BASE : - !strcmp(oem_id, "SGI2") ? UV2_HUB_REVISION_BASE : - !strcmp(oem_id, "SGI") ? UV1_HUB_REVISION_BASE : 0; - - if (uv_hub_info->hub_revision == 0) - goto badbios; - - switch (uv_hub_info->hub_revision) { - case UV4_HUB_REVISION_BASE: + if (!strncmp(oem_id, "SGI4", 4)) { + uv_hub_info->hub_revision = UV4_HUB_REVISION_BASE; uv_hubbed_system = 0x11; - break; - case UV3_HUB_REVISION_BASE: + } else if (!strncmp(oem_id, "SGI3", 4)) { + uv_hub_info->hub_revision = UV3_HUB_REVISION_BASE; uv_hubbed_system = 0x9; - break; - case UV2_HUB_REVISION_BASE: + } else if (!strcmp(oem_id, "SGI2")) { + uv_hub_info->hub_revision = UV2_HUB_REVISION_BASE; uv_hubbed_system = 0x5; - break; - case UV1_HUB_REVISION_BASE: - uv_hubbed_system = 0x3; - break; + } else { + uv_hub_info->hub_revision = 0; + goto badbios; } pnodeid = early_get_pnodeid(); @@ -323,14 +289,6 @@ static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id) uv_system_type = UV_X2APIC; uv_apic = 0; - } else if (!strcmp(oem_table_id, "UVH")) { - /* Only UV1 systems: */ - uv_system_type = UV_NON_UNIQUE_APIC; - x86_platform.legacy.warm_reset = 0; - __this_cpu_write(x2apic_extra_bits, pnodeid << uvh_apicid.s.pnode_shift); - uv_set_apicid_hibit(); - uv_apic = 1; - } else if (!strcmp(oem_table_id, "UVL")) { /* Only used for very small systems: */ uv_system_type = UV_LEGACY_APIC; @@ -347,7 +305,7 @@ static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id) badbios: pr_err("UV: OEM_ID:%s OEM_TABLE_ID:%s\n", oem_id, oem_table_id); - pr_err("Current BIOS not supported, update kernel and/or BIOS\n"); + pr_err("Current UV Type or BIOS not supported\n"); BUG(); } @@ -545,7 +503,6 @@ static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) int pnode; pnode = uv_apicid_to_pnode(phys_apicid); - phys_apicid |= uv_apicid_hibits; val = (1UL << UVH_IPI_INT_SEND_SHFT) | (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | @@ -576,7 +533,7 @@ static void uv_send_IPI_one(int cpu, int vector) dmode = dest_Fixed; val = (1UL << UVH_IPI_INT_SEND_SHFT) | - ((apicid | uv_apicid_hibits) << UVH_IPI_INT_APIC_ID_SHFT) | + (apicid << UVH_IPI_INT_APIC_ID_SHFT) | (dmode << UVH_IPI_INT_DELIVERY_MODE_SHFT) | (vector << UVH_IPI_INT_VECTOR_SHFT); @@ -634,22 +591,16 @@ static void uv_init_apic_ldr(void) static u32 apic_uv_calc_apicid(unsigned int cpu) { - return apic_default_calc_apicid(cpu) | uv_apicid_hibits; + return apic_default_calc_apicid(cpu); } -static unsigned int x2apic_get_apic_id(unsigned long x) +static unsigned int x2apic_get_apic_id(unsigned long id) { - unsigned int id; - - WARN_ON(preemptible() && 
num_online_cpus() > 1); - id = x | __this_cpu_read(x2apic_extra_bits); - return id; } static u32 set_apic_id(unsigned int id) { - /* CHECKME: Do we need to mask out the xapic extra bits? */ return id; } @@ -721,11 +672,6 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, }; -static void set_x2apic_extra_bits(int pnode) -{ - __this_cpu_write(x2apic_extra_bits, pnode << uvh_apicid.s.pnode_shift); -} - #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH 3 #define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT @@ -920,15 +866,7 @@ static __init void map_mmioh_high(int min_pnode, int max_pnode) return; } - if (is_uv1_hub()) { - mmr = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR; - shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; - mmioh.v = uv_read_local_mmr(mmr); - enable = !!mmioh.s1.enable; - base = mmioh.s1.base; - m_io = mmioh.s1.m_io; - n_io = mmioh.s1.n_io; - } else if (is_uv2_hub()) { + if (is_uv2_hub()) { mmr = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR; shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; mmioh.v = uv_read_local_mmr(mmr); @@ -936,16 +874,15 @@ static __init void map_mmioh_high(int min_pnode, int max_pnode) base = mmioh.s2.base; m_io = mmioh.s2.m_io; n_io = mmioh.s2.n_io; - } else { - return; - } - if (enable) { - max_pnode &= (1 << n_io) - 1; - pr_info("UV: base:0x%lx shift:%d N_IO:%d M_IO:%d max_pnode:0x%x\n", base, shift, m_io, n_io, max_pnode); - map_high("MMIOH", base, shift, m_io, max_pnode, map_uc); - } else { - pr_info("UV: MMIOH disabled\n"); + if (enable) { + max_pnode &= (1 << n_io) - 1; + pr_info("UV: base:0x%lx shift:%d N_IO:%d M_IO:%d max_pnode:0x%x\n", + base, shift, m_io, n_io, max_pnode); + map_high("MMIOH", base, shift, m_io, max_pnode, map_uc); + } else { + pr_info("UV: MMIOH disabled\n"); + } } } @@ -1081,9 +1018,6 @@ void uv_cpu_init(void) return; uv_hub_info->nr_online_cpus++; - - if (get_uv_system_type() == UV_NON_UNIQUE_APIC) - set_x2apic_extra_bits(uv_hub_info->pnode); } struct mn { @@ -1114,9 +1048,6 @@ static void get_mn(struct mn *mnp) } else if (is_uv2_hub()) { mnp->m_val = m_n_config.s2.m_skt; mnp->n_lshift = mnp->m_val == 40 ? 40 : 39; - } else if (is_uv1_hub()) { - mnp->m_val = m_n_config.s1.m_skt; - mnp->n_lshift = mnp->m_val; } mnp->m_shift = mnp->m_val ? 64 - mnp->m_val : 0; } @@ -1318,7 +1249,7 @@ static void __init build_socket_tables(void) size_t bytes; if (!gre) { - if (is_uv1_hub() || is_uv2_hub() || is_uv3_hub()) { + if (is_uv2_hub() || is_uv3_hub()) { pr_info("UV: No UVsystab socket table, ignoring\n"); return; } @@ -1500,8 +1431,7 @@ static void __init uv_system_init_hub(void) unsigned short min_pnode = 9999, max_pnode = 0; char *hub = is_uv4_hub() ? "UV400" : is_uv3_hub() ? "UV300" : - is_uv2_hub() ? "UV2000/3000" : - is_uv1_hub() ? "UV100/1000" : NULL; + is_uv2_hub() ? "UV2000/3000" : NULL; if (!hub) { pr_err("UV: Unknown/unsupported UV hub\n"); diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 0b71970d2d3d..f0b743a2fe9c 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -543,14 +543,12 @@ static void __init spectre_v1_select_mitigation(void) * If FSGSBASE is enabled, the user can put a kernel address in * GS, in which case SMAP provides no protection. * - * [ NOTE: Don't check for X86_FEATURE_FSGSBASE until the - * FSGSBASE enablement patches have been merged. ] - * * If FSGSBASE is disabled, the user can only put a user space * address in GS. 
That makes an attack harder, but still * possible if there's no SMAP protection. */ - if (!smap_works_speculatively()) { + if (boot_cpu_has(X86_FEATURE_FSGSBASE) || + !smap_works_speculatively()) { /* * Mitigation can be provided from SWAPGS itself or * PTI as the CR3 write in the Meltdown mitigation @@ -763,10 +761,12 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) } /* - * If enhanced IBRS is enabled or SMT impossible, STIBP is not + * If no STIBP, enhanced IBRS is enabled or SMT impossible, STIBP is not * required. */ - if (!smt_possible || spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) + if (!boot_cpu_has(X86_FEATURE_STIBP) || + !smt_possible || + spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) return; /* @@ -778,12 +778,6 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) mode = SPECTRE_V2_USER_STRICT_PREFERRED; - /* - * If STIBP is not available, clear the STIBP mode. - */ - if (!boot_cpu_has(X86_FEATURE_STIBP)) - mode = SPECTRE_V2_USER_NONE; - spectre_v2_user_stibp = mode; set_mode: @@ -1270,7 +1264,6 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) * Indirect branch speculation is always disabled in strict * mode. It can neither be enabled if it was force-disabled * by a previous prctl call. - */ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 95c090a45b4b..965474d78cef 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -441,6 +441,22 @@ static void __init setup_cr_pinning(void) static_key_enable(&cr_pinning.key); } +static __init int x86_nofsgsbase_setup(char *arg) +{ + /* Require an exact match without trailing characters. */ + if (strlen(arg)) + return 0; + + /* Do not emit a message if the feature is not present. */ + if (!boot_cpu_has(X86_FEATURE_FSGSBASE)) + return 1; + + setup_clear_cpu_cap(X86_FEATURE_FSGSBASE); + pr_info("FSGSBASE disabled via kernel command line\n"); + return 1; +} +__setup("nofsgsbase", x86_nofsgsbase_setup); + /* * Protection Keys are not available in 32-bit mode. */ @@ -1495,6 +1511,12 @@ static void identify_cpu(struct cpuinfo_x86 *c) setup_smap(c); setup_umip(c); + /* Enable FSGSBASE instructions if available. */ + if (cpu_has(c, X86_FEATURE_FSGSBASE)) { + cr4_set_bits(X86_CR4_FSGSBASE); + elf_hwcap2 |= HWCAP2_FSGSBASE; + } + /* * The vendor-specific functions might have changed features. * Now we do "generic changes." 
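With identify_cpu() above now setting CR4.FSGSBASE and advertising HWCAP2_FSGSBASE, user space must check the HWCAP before executing RD/WR{FS,GS}BASE. A minimal probe, assuming glibc's getauxval() and the uapi HWCAP2 bit (build with -mfsgsbase):

	#include <stdio.h>
	#include <sys/auxv.h>		/* getauxval(), AT_HWCAP2 */
	#include <immintrin.h>		/* _readfsbase_u64(), needs -mfsgsbase */

	#ifndef HWCAP2_FSGSBASE
	#define HWCAP2_FSGSBASE	(1 << 1)	/* arch/x86/include/uapi/asm/hwcap2.h */
	#endif

	int main(void)
	{
		if (!(getauxval(AT_HWCAP2) & HWCAP2_FSGSBASE)) {
			puts("FSGSBASE not enabled by the kernel");
			return 1;
		}

		/* Only legal after the HWCAP2 check: #UD -> SIGILL otherwise */
		printf("FS base: %#llx\n", (unsigned long long)_readfsbase_u64());
		return 0;
	}
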
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 0ab48f1cdf84..b6b7b38dff5f 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1156,6 +1156,8 @@ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, 1), X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, 1), X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, 1), + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 1), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, 1), {} }; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 14e4b4d17ee5..f43a78bde670 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -42,6 +42,7 @@ #include <linux/export.h> #include <linux/jump_label.h> #include <linux/set_memory.h> +#include <linux/sync_core.h> #include <linux/task_work.h> #include <linux/hardirq.h> @@ -244,6 +245,8 @@ static void __print_mce(struct mce *m) pr_cont("ADDR %llx ", m->addr); if (m->misc) pr_cont("MISC %llx ", m->misc); + if (m->ppin) + pr_cont("PPIN %llx ", m->ppin); if (mce_flags.smca) { if (m->synd) @@ -1212,7 +1215,7 @@ static void kill_me_maybe(struct callback_head *cb) * backing the user stack, tracing that reads the user stack will cause * potentially infinite recursion. */ -void noinstr do_machine_check(struct pt_regs *regs) +noinstr void do_machine_check(struct pt_regs *regs) { DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); DECLARE_BITMAP(toclear, MAX_NR_BANKS); @@ -1927,11 +1930,11 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) static __always_inline void exc_machine_check_user(struct pt_regs *regs) { - idtentry_enter_user(regs); + irqentry_enter_from_user_mode(regs); instrumentation_begin(); machine_check_vector(regs); instrumentation_end(); - idtentry_exit_user(regs); + irqentry_exit_to_user_mode(regs); } #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c index 43c466020ed5..03e51053592a 100644 --- a/arch/x86/kernel/cpu/mce/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c @@ -345,7 +345,7 @@ static __init int dev_mcelog_init_device(void) int err; mce_log_len = max(MCE_LOG_MIN_LEN, num_online_cpus()); - mcelog = kzalloc(sizeof(*mcelog) + mce_log_len * sizeof(struct mce), GFP_KERNEL); + mcelog = kzalloc(struct_size(mcelog, entry, mce_log_len), GFP_KERNEL); if (!mcelog) return -ENOMEM; diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 0593b192eb8f..7843ab3fde09 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -511,7 +511,7 @@ static void do_inject(void) */ if (inj_type == DFR_INT_INJ) { i_mce.status |= MCI_STATUS_DEFERRED; - i_mce.status |= (i_mce.status & ~MCI_STATUS_UC); + i_mce.status &= ~MCI_STATUS_UC; } /* diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index baec68b7e010..ec6f0415bc6d 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -145,7 +145,6 @@ extern struct builtin_fw __end_builtin_fw[]; bool get_builtin_firmware(struct cpio_data *cd, const char *name) { -#ifdef CONFIG_FW_LOADER struct builtin_fw *b_fw; for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) { @@ -155,7 +154,6 @@ bool get_builtin_firmware(struct cpio_data *cd, const char *name) return true; } } -#endif return false; } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index b037cfa7c0c5..48ce44576947 100644 --- 
a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -71,6 +71,22 @@ static void printk_stack_address(unsigned long address, int reliable, printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); } +static int copy_code(struct pt_regs *regs, u8 *buf, unsigned long src, + unsigned int nbytes) +{ + if (!user_mode(regs)) + return copy_from_kernel_nofault(buf, (u8 *)src, nbytes); + + /* + * Make sure userspace isn't trying to trick us into dumping kernel + * memory by pointing the userspace instruction pointer at it. + */ + if (__chk_range_not_ok(src, nbytes, TASK_SIZE_MAX)) + return -EINVAL; + + return copy_from_user_nmi(buf, (void __user *)src, nbytes); +} + /* * There are a couple of reasons for the 2/3rd prologue, courtesy of Linus: * @@ -97,17 +113,8 @@ void show_opcodes(struct pt_regs *regs, const char *loglvl) #define OPCODE_BUFSIZE (PROLOGUE_SIZE + 1 + EPILOGUE_SIZE) u8 opcodes[OPCODE_BUFSIZE]; unsigned long prologue = regs->ip - PROLOGUE_SIZE; - bool bad_ip; - - /* - * Make sure userspace isn't trying to trick us into dumping kernel - * memory by pointing the userspace instruction pointer at it. - */ - bad_ip = user_mode(regs) && - __chk_range_not_ok(prologue, OPCODE_BUFSIZE, TASK_SIZE_MAX); - if (bad_ip || copy_from_kernel_nofault(opcodes, (u8 *)prologue, - OPCODE_BUFSIZE)) { + if (copy_code(regs, opcodes, prologue, sizeof(opcodes))) { printk("%sCode: Bad RIP value.\n", loglvl); } else { printk("%sCode: %" __stringify(PROLOGUE_SIZE) "ph <%02x> %" @@ -126,15 +133,15 @@ void show_ip(struct pt_regs *regs, const char *loglvl) show_opcodes(regs, loglvl); } -void show_iret_regs(struct pt_regs *regs) +void show_iret_regs(struct pt_regs *regs, const char *log_lvl) { - show_ip(regs, KERN_DEFAULT); - printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss, + show_ip(regs, log_lvl); + printk("%sRSP: %04x:%016lx EFLAGS: %08lx", log_lvl, (int)regs->ss, regs->sp, regs->flags); } static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, - bool partial) + bool partial, const char *log_lvl) { /* * These on_stack() checks aren't strictly necessary: the unwind code @@ -146,7 +153,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, * they can be printed in the right context. */ if (!partial && on_stack(info, regs, sizeof(*regs))) { - __show_regs(regs, SHOW_REGS_SHORT); + __show_regs(regs, SHOW_REGS_SHORT, log_lvl); } else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET, IRET_FRAME_SIZE)) { @@ -155,7 +162,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, * full pt_regs might not have been saved yet. In that case * just print the iret frame. */ - show_iret_regs(regs); + show_iret_regs(regs, log_lvl); } } @@ -210,7 +217,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, printk("%s <%s>\n", log_lvl, stack_name); if (regs) - show_regs_if_on_stack(&stack_info, regs, partial); + show_regs_if_on_stack(&stack_info, regs, partial, log_lvl); /* * Scan the stack, printing any text addresses we find. 
At the @@ -271,7 +278,7 @@ next: /* if the frame has entry regs, print them */ regs = unwind_get_entry_regs(&state, &partial); if (regs) - show_regs_if_on_stack(&stack_info, regs, partial); + show_regs_if_on_stack(&stack_info, regs, partial, log_lvl); } if (stack_name) @@ -345,7 +352,7 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr) oops_exit(); /* Executive summary in case the oops scrolled away */ - __show_regs(&exec_summary_regs, SHOW_REGS_ALL); + __show_regs(&exec_summary_regs, SHOW_REGS_ALL, KERN_DEFAULT); if (!signr) return; @@ -437,9 +444,12 @@ void die_addr(const char *str, struct pt_regs *regs, long err, long gp_addr) void show_regs(struct pt_regs *regs) { + enum show_regs_mode print_kernel_regs; + show_regs_print_info(KERN_DEFAULT); - __show_regs(regs, user_mode(regs) ? SHOW_REGS_USER : SHOW_REGS_ALL); + print_kernel_regs = user_mode(regs) ? SHOW_REGS_USER : SHOW_REGS_ALL; + __show_regs(regs, print_kernel_regs, KERN_DEFAULT); /* * When in-kernel, we also print out the stack at the time of the fault.. diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 15247b96c6ea..eb86a2b831b1 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -82,6 +82,45 @@ bool irq_fpu_usable(void) } EXPORT_SYMBOL(irq_fpu_usable); +/* + * These must be called with preempt disabled. Returns + * 'true' if the FPU state is still intact and we can + * keep registers active. + * + * The legacy FNSAVE instruction cleared all FPU state + * unconditionally, so registers are essentially destroyed. + * Modern FPU state can be kept in registers, if there are + * no pending FP exceptions. + */ +int copy_fpregs_to_fpstate(struct fpu *fpu) +{ + if (likely(use_xsave())) { + copy_xregs_to_kernel(&fpu->state.xsave); + + /* + * AVX512 state is tracked here because its use is + * known to slow the max clock speed of the core. + */ + if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512) + fpu->avx512_timestamp = jiffies; + return 1; + } + + if (likely(use_fxsr())) { + copy_fxregs_to_kernel(fpu); + return 1; + } + + /* + * Legacy FPU register saving, FNSAVE always clears FPU registers, + * so we have to mark them inactive: + */ + asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave)); + + return 0; +} +EXPORT_SYMBOL(copy_fpregs_to_fpstate); + void kernel_fpu_begin(void) { preempt_disable(); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index bda2e5eaca0e..be2a68a09d19 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -233,8 +233,10 @@ void fpu__init_cpu_xstate(void) /* * MSR_IA32_XSS sets supervisor states managed by XSAVES. 
*/ - if (boot_cpu_has(X86_FEATURE_XSAVES)) - wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); + if (boot_cpu_has(X86_FEATURE_XSAVES)) { + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | + xfeatures_mask_dynamic()); + } } static bool xfeature_enabled(enum xfeature xfeature) @@ -486,7 +488,7 @@ static int xfeature_uncompacted_offset(int xfeature_nr) return ebx; } -static int xfeature_size(int xfeature_nr) +int xfeature_size(int xfeature_nr) { u32 eax, ebx, ecx, edx; @@ -598,7 +600,8 @@ static void check_xstate_against_struct(int nr) */ if ((nr < XFEATURE_YMM) || (nr >= XFEATURE_MAX) || - (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR)) { + (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) || + ((nr >= XFEATURE_RSRVD_COMP_10) && (nr <= XFEATURE_LBR))) { WARN_ONCE(1, "no structure for xstate: %d\n", nr); XSTATE_WARN_ON(1); } @@ -847,8 +850,10 @@ void fpu__resume_cpu(void) * Restore IA32_XSS. The same CPUID bit enumerates support * of XSAVES and MSR_IA32_XSS. */ - if (boot_cpu_has(X86_FEATURE_XSAVES)) - wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); + if (boot_cpu_has(X86_FEATURE_XSAVES)) { + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | + xfeatures_mask_dynamic()); + } } /* @@ -1074,7 +1079,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of copy_part(offsetof(struct fxregs_state, st_space), 128, &xsave->i387.st_space, &kbuf, &offset_start, &count); if (header.xfeatures & XFEATURE_MASK_SSE) - copy_part(xstate_offsets[XFEATURE_MASK_SSE], 256, + copy_part(xstate_offsets[XFEATURE_SSE], 256, &xsave->i387.xmm_space, &kbuf, &offset_start, &count); /* * Fill xsave->i387.sw_reserved value for ptrace frame: @@ -1356,6 +1361,78 @@ void copy_supervisor_to_kernel(struct xregs_state *xstate) } } +/** + * copy_dynamic_supervisor_to_kernel() - Save dynamic supervisor states to + * an xsave area + * @xstate: A pointer to an xsave area + * @mask: Represent the dynamic supervisor features saved into the xsave area + * + * Only the dynamic supervisor states sets in the mask are saved into the xsave + * area (See the comment in XFEATURE_MASK_DYNAMIC for the details of dynamic + * supervisor feature). Besides the dynamic supervisor states, the legacy + * region and XSAVE header are also saved into the xsave area. The supervisor + * features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and + * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not saved. + * + * The xsave area must be 64-bytes aligned. + */ +void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask) +{ + u64 dynamic_mask = xfeatures_mask_dynamic() & mask; + u32 lmask, hmask; + int err; + + if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES))) + return; + + if (WARN_ON_FPU(!dynamic_mask)) + return; + + lmask = dynamic_mask; + hmask = dynamic_mask >> 32; + + XSTATE_OP(XSAVES, xstate, lmask, hmask, err); + + /* Should never fault when copying to a kernel buffer */ + WARN_ON_FPU(err); +} + +/** + * copy_kernel_to_dynamic_supervisor() - Restore dynamic supervisor states from + * an xsave area + * @xstate: A pointer to an xsave area + * @mask: Represent the dynamic supervisor features restored from the xsave area + * + * Only the dynamic supervisor states sets in the mask are restored from the + * xsave area (See the comment in XFEATURE_MASK_DYNAMIC for the details of + * dynamic supervisor feature). Besides the dynamic supervisor states, the + * legacy region and XSAVE header are also restored from the xsave area. 
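Taken together with copy_dynamic_supervisor_to_kernel() above, the intended pairing is a save/restore bracket around code that clobbers a dynamic supervisor state. A hypothetical sketch, assuming XFEATURE_MASK_LBR (the arch-LBR mask implied by the XFEATURE_LBR handling above) and a 64-byte-aligned buffer sized from the now-exported xfeature_size():

	/* Hypothetical caller: preserve arch LBR state across a window
	 * where the LBR MSRs may be rewritten. */
	static void lbr_state_bracket(struct xregs_state *buf)
	{
		/* buf: 64-byte aligned, sized for the legacy region +
		 * XSAVE header + xfeature_size(XFEATURE_LBR) */
		copy_dynamic_supervisor_to_kernel(buf, XFEATURE_MASK_LBR);

		/* ... code that clobbers the LBR MSRs runs here ... */

		copy_kernel_to_dynamic_supervisor(buf, XFEATURE_MASK_LBR);
	}
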
The + * supervisor features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and + * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not restored. + * + * The xsave area must be 64-bytes aligned. + */ +void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask) +{ + u64 dynamic_mask = xfeatures_mask_dynamic() & mask; + u32 lmask, hmask; + int err; + + if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES))) + return; + + if (WARN_ON_FPU(!dynamic_mask)) + return; + + lmask = dynamic_mask; + hmask = dynamic_mask >> 32; + + XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); + + /* Should never fault when copying from a kernel buffer */ + WARN_ON_FPU(err); +} + #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * Report the amount of time elapsed in millisecond since last AVX512 diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index f3c76252247d..282b4ee1339f 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -207,7 +207,7 @@ spurious_8259A_irq: * lets ACK and report it. [once per IRQ] */ if (!(spurious_irq_mask & irqmask)) { - printk(KERN_DEBUG + printk_deferred(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); spurious_irq_mask |= irqmask; } diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 0db21206f2f3..7ecf9babf0cb 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -160,7 +160,7 @@ static const __initconst struct idt_data apic_idts[] = { /* Must be page-aligned because the real IDT is used in the cpu entry area */ static gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss; -struct desc_ptr idt_descr __ro_after_init = { +static struct desc_ptr idt_descr __ro_after_init = { .size = IDT_TABLE_SIZE - 1, .address = (unsigned long) idt_table, }; diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index db6578d45157..57c2ecf43134 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -170,15 +170,6 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr, if (!current_ei->efi_memmap_size) return 0; - /* - * If 1:1 mapping is not enabled, second kernel can not setup EFI - * and use EFI run time services. User space will have to pass - * acpi_rsdp=<addr> on kernel command line to make second kernel boot - * without efi. 
- */ - if (efi_have_uv1_memmap()) - return 0; - params->secure_boot = boot_params.secure_boot; ei->efi_loader_signature = current_ei->efi_loader_signature; ei->efi_systab = current_ei->efi_systab; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index ada39ddbc922..fdadc37d72af 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -33,6 +33,7 @@ #include <linux/hardirq.h> #include <linux/preempt.h> #include <linux/sched/debug.h> +#include <linux/perf_event.h> #include <linux/extable.h> #include <linux/kdebug.h> #include <linux/kallsyms.h> @@ -472,6 +473,9 @@ static int arch_copy_kprobe(struct kprobe *p) /* Also, displacement change doesn't affect the first byte */ p->opcode = buf[0]; + p->ainsn.tp_len = len; + perf_event_text_poke(p->ainsn.insn, NULL, 0, buf, len); + /* OK, write back the instruction(s) into ROX insn buffer */ text_poke(p->ainsn.insn, buf, len); @@ -503,12 +507,18 @@ int arch_prepare_kprobe(struct kprobe *p) void arch_arm_kprobe(struct kprobe *p) { - text_poke(p->addr, ((unsigned char []){INT3_INSN_OPCODE}), 1); + u8 int3 = INT3_INSN_OPCODE; + + text_poke(p->addr, &int3, 1); text_poke_sync(); + perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1); } void arch_disarm_kprobe(struct kprobe *p) { + u8 int3 = INT3_INSN_OPCODE; + + perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1); text_poke(p->addr, &p->opcode, 1); text_poke_sync(); } @@ -516,6 +526,9 @@ void arch_disarm_kprobe(struct kprobe *p) void arch_remove_kprobe(struct kprobe *p) { if (p->ainsn.insn) { + /* Record the perf event before freeing the slot */ + perf_event_text_poke(p->ainsn.insn, p->ainsn.insn, + p->ainsn.tp_len, NULL, 0); free_insn_slot(p->ainsn.insn, p->ainsn.boostable); p->ainsn.insn = NULL; } diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 7af4c61dde52..40f380461e6d 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -6,6 +6,7 @@ * Copyright (C) Hitachi Ltd., 2012 */ #include <linux/kprobes.h> +#include <linux/perf_event.h> #include <linux/ptrace.h> #include <linux/string.h> #include <linux/slab.h> @@ -352,8 +353,15 @@ int arch_within_optimized_kprobe(struct optimized_kprobe *op, static void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) { - if (op->optinsn.insn) { - free_optinsn_slot(op->optinsn.insn, dirty); + u8 *slot = op->optinsn.insn; + if (slot) { + int len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE; + + /* Record the perf event before freeing the slot */ + if (dirty) + perf_event_text_poke(slot, slot, len, NULL, 0); + + free_optinsn_slot(slot, dirty); op->optinsn.insn = NULL; op->optinsn.size = 0; } @@ -424,8 +432,15 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, (u8 *)op->kp.addr + op->optinsn.size); len += JMP32_INSN_SIZE; + /* + * Note len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE is also + * used in __arch_remove_optimized_kprobe(). 
+ */ + /* We have to use text_poke() for instruction buffer because it is RO */ + perf_event_text_poke(slot, NULL, 0, buf, len); text_poke(slot, buf, len); + ret = 0; out: kfree(buf); @@ -477,10 +492,23 @@ void arch_optimize_kprobes(struct list_head *oplist) */ void arch_unoptimize_kprobe(struct optimized_kprobe *op) { - arch_arm_kprobe(&op->kp); - text_poke(op->kp.addr + INT3_INSN_SIZE, - op->optinsn.copied_insn, DISP32_SIZE); + u8 new[JMP32_INSN_SIZE] = { INT3_INSN_OPCODE, }; + u8 old[JMP32_INSN_SIZE]; + u8 *addr = op->kp.addr; + + memcpy(old, op->kp.addr, JMP32_INSN_SIZE); + memcpy(new + INT3_INSN_SIZE, + op->optinsn.copied_insn, + JMP32_INSN_SIZE - INT3_INSN_SIZE); + + text_poke(addr, new, INT3_INSN_SIZE); text_poke_sync(); + text_poke(addr + INT3_INSN_SIZE, + new + INT3_INSN_SIZE, + JMP32_INSN_SIZE - INT3_INSN_SIZE); + text_poke_sync(); + + perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE); } /* diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index df63786e7bfa..233c77d056c9 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -233,7 +233,7 @@ EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags); noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) { u32 reason = kvm_read_and_reset_apf_flags(); - bool rcu_exit; + irqentry_state_t state; switch (reason) { case KVM_PV_REASON_PAGE_NOT_PRESENT: @@ -243,7 +243,7 @@ noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) return false; } - rcu_exit = idtentry_enter_cond_rcu(regs); + state = irqentry_enter(regs); instrumentation_begin(); /* @@ -264,7 +264,7 @@ noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) } instrumentation_end(); - idtentry_exit_cond_rcu(regs, rcu_exit); + irqentry_exit(regs, state); return true; } diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 1547be359d7f..49dcfb85e773 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -42,6 +42,14 @@ static struct class *msr_class; static enum cpuhp_state cpuhp_msr_state; +enum allow_write_msrs { + MSR_WRITES_ON, + MSR_WRITES_OFF, + MSR_WRITES_DEFAULT, +}; + +static enum allow_write_msrs allow_writes = MSR_WRITES_DEFAULT; + static ssize_t msr_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -70,6 +78,24 @@ static ssize_t msr_read(struct file *file, char __user *buf, return bytes ? 
bytes : err; } +static int filter_write(u32 reg) +{ + switch (allow_writes) { + case MSR_WRITES_ON: return 0; + case MSR_WRITES_OFF: return -EPERM; + default: break; + } + + if (reg == MSR_IA32_ENERGY_PERF_BIAS) + return 0; + + pr_err_ratelimited("Write to unrecognized MSR 0x%x by %s\n" + "Please report to x86@kernel.org\n", + reg, current->comm); + + return 0; +} + static ssize_t msr_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -84,6 +110,10 @@ static ssize_t msr_write(struct file *file, const char __user *buf, if (err) return err; + err = filter_write(reg); + if (err) + return err; + if (count % 8) return -EINVAL; /* Invalid chunk size */ @@ -92,9 +122,13 @@ static ssize_t msr_write(struct file *file, const char __user *buf, err = -EFAULT; break; } + + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]); if (err) break; + tmp += 2; bytes += 8; } @@ -242,6 +276,41 @@ static void __exit msr_exit(void) } module_exit(msr_exit) +static int set_allow_writes(const char *val, const struct kernel_param *cp) +{ + /* val is NUL-terminated, see kernfs_fop_write() */ + char *s = strstrip((char *)val); + + if (!strcmp(s, "on")) + allow_writes = MSR_WRITES_ON; + else if (!strcmp(s, "off")) + allow_writes = MSR_WRITES_OFF; + else + allow_writes = MSR_WRITES_DEFAULT; + + return 0; +} + +static int get_allow_writes(char *buf, const struct kernel_param *kp) +{ + const char *res; + + switch (allow_writes) { + case MSR_WRITES_ON: res = "on"; break; + case MSR_WRITES_OFF: res = "off"; break; + default: res = "default"; break; + } + + return sprintf(buf, "%s\n", res); +} + +static const struct kernel_param_ops allow_writes_ops = { + .set = set_allow_writes, + .get = get_allow_writes +}; + +module_param_cb(allow_writes, &allow_writes_ops, NULL, 0600); + MODULE_AUTHOR("H. 
Peter Anvin <hpa@zytor.com>"); MODULE_DESCRIPTION("x86 generic MSR driver"); MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index d7c5e44b26f7..4fc9954a9560 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -330,7 +330,6 @@ static noinstr void default_do_nmi(struct pt_regs *regs) __this_cpu_write(last_nmi_rip, regs->ip); instrumentation_begin(); - trace_hardirqs_off_finish(); handled = nmi_handle(NMI_LOCAL, regs); __this_cpu_add(nmi_stats.normal, handled); @@ -417,8 +416,6 @@ static noinstr void default_do_nmi(struct pt_regs *regs) unknown_nmi_error(reason, regs); out: - if (regs->flags & X86_EFLAGS_IF) - trace_hardirqs_on_prepare(); instrumentation_end(); } @@ -478,6 +475,8 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7); DEFINE_IDTENTRY_RAW(exc_nmi) { + bool irq_state; + if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id())) return; @@ -491,14 +490,14 @@ nmi_restart: this_cpu_write(nmi_dr7, local_db_save()); - nmi_enter(); + irq_state = idtentry_enter_nmi(regs); inc_irq_stat(__nmi_count); if (!ignore_nmis) default_do_nmi(regs); - nmi_exit(); + idtentry_exit_nmi(regs, irq_state); local_db_restore(this_cpu_read(nmi_dr7)); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index fe67dbd76e51..994d8393f2f7 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -121,8 +121,8 @@ static int set_new_tls(struct task_struct *p, unsigned long tls) return do_set_thread_area_64(p, ARCH_SET_FS, tls); } -int copy_thread_tls(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct *p, unsigned long tls) +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, + struct task_struct *p, unsigned long tls) { struct inactive_task_frame *frame; struct fork_frame *fork_frame; @@ -140,10 +140,12 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); #ifdef CONFIG_X86_64 - savesegment(gs, p->thread.gsindex); - p->thread.gsbase = p->thread.gsindex ? 0 : current->thread.gsbase; - savesegment(fs, p->thread.fsindex); - p->thread.fsbase = p->thread.fsindex ? 
0 : current->thread.fsbase; + current_save_fsgs(); + p->thread.fsindex = current->thread.fsindex; + p->thread.fsbase = current->thread.fsbase; + p->thread.gsindex = current->thread.gsindex; + p->thread.gsbase = current->thread.gsbase; + savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); #else diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index acfd6d2a0cbf..4f2f54e1281c 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -56,7 +56,8 @@ #include "process.h" -void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) +void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, + const char *log_lvl) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; unsigned long d0, d1, d2, d3, d6, d7; @@ -67,14 +68,14 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) else savesegment(gs, gs); - show_ip(regs, KERN_DEFAULT); + show_ip(regs, log_lvl); - printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", - regs->ax, regs->bx, regs->cx, regs->dx); - printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", - regs->si, regs->di, regs->bp, regs->sp); - printk(KERN_DEFAULT "DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x EFLAGS: %08lx\n", - (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, regs->ss, regs->flags); + printk("%sEAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", + log_lvl, regs->ax, regs->bx, regs->cx, regs->dx); + printk("%sESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", + log_lvl, regs->si, regs->di, regs->bp, regs->sp); + printk("%sDS: %04x ES: %04x FS: %04x GS: %04x SS: %04x EFLAGS: %08lx\n", + log_lvl, (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, regs->ss, regs->flags); if (mode != SHOW_REGS_ALL) return; @@ -83,8 +84,8 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) cr2 = read_cr2(); cr3 = __read_cr3(); cr4 = __read_cr4(); - printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", - cr0, cr2, cr3, cr4); + printk("%sCR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", + log_lvl, cr0, cr2, cr3, cr4); get_debugreg(d0, 0); get_debugreg(d1, 1); @@ -98,10 +99,10 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) (d6 == DR6_RESERVED) && (d7 == 0x400)) return; - printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", - d0, d1, d2, d3); - printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n", - d6, d7); + printk("%sDR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", + log_lvl, d0, d1, d2, d3); + printk("%sDR6: %08lx DR7: %08lx\n", + log_lvl, d6, d7); } void release_thread(struct task_struct *dead_task) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 9a97415b2139..d6f946707270 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -62,30 +62,31 @@ #include "process.h" /* Prints also some state that isn't saved in the pt_regs */ -void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) +void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, + const char *log_lvl) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; unsigned long d0, d1, d2, d3, d6, d7; unsigned int fsindex, gsindex; unsigned int ds, es; - show_iret_regs(regs); + show_iret_regs(regs, log_lvl); if (regs->orig_ax != -1) pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); else pr_cont("\n"); - printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n", - regs->ax, regs->bx, regs->cx); - printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n", - regs->dx, regs->si, regs->di); - 
printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n", - regs->bp, regs->r8, regs->r9); - printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n", - regs->r10, regs->r11, regs->r12); - printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", - regs->r13, regs->r14, regs->r15); + printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n", + log_lvl, regs->ax, regs->bx, regs->cx); + printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n", + log_lvl, regs->dx, regs->si, regs->di); + printk("%sRBP: %016lx R08: %016lx R09: %016lx\n", + log_lvl, regs->bp, regs->r8, regs->r9); + printk("%sR10: %016lx R11: %016lx R12: %016lx\n", + log_lvl, regs->r10, regs->r11, regs->r12); + printk("%sR13: %016lx R14: %016lx R15: %016lx\n", + log_lvl, regs->r13, regs->r14, regs->r15); if (mode == SHOW_REGS_SHORT) return; @@ -93,8 +94,8 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) if (mode == SHOW_REGS_USER) { rdmsrl(MSR_FS_BASE, fs); rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); - printk(KERN_DEFAULT "FS: %016lx GS: %016lx\n", - fs, shadowgs); + printk("%sFS: %016lx GS: %016lx\n", + log_lvl, fs, shadowgs); return; } @@ -112,12 +113,12 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) cr3 = __read_cr3(); cr4 = __read_cr4(); - printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", - fs, fsindex, gs, gsindex, shadowgs); - printk(KERN_DEFAULT "CS: %04lx DS: %04x ES: %04x CR0: %016lx\n", regs->cs, ds, - es, cr0); - printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, - cr4); + printk("%sFS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", + log_lvl, fs, fsindex, gs, gsindex, shadowgs); + printk("%sCS: %04lx DS: %04x ES: %04x CR0: %016lx\n", + log_lvl, regs->cs, ds, es, cr0); + printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n", + log_lvl, cr2, cr3, cr4); get_debugreg(d0, 0); get_debugreg(d1, 1); @@ -129,14 +130,14 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) /* Only print out debug registers if they are in their non-default state. */ if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && (d6 == DR6_RESERVED) && (d7 == 0x400))) { - printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", - d0, d1, d2); - printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", - d3, d6, d7); + printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n", + log_lvl, d0, d1, d2); + printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n", + log_lvl, d3, d6, d7); } if (boot_cpu_has(X86_FEATURE_OSPKE)) - printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru()); + printk("%sPKRU: %08x\n", log_lvl, read_pkru()); } void release_thread(struct task_struct *dead_task) @@ -150,6 +151,56 @@ enum which_selector { }; /* + * Out of line to be protected from kprobes and tracing. If this would be + * traced or probed than any access to a per CPU variable happens with + * the wrong GS. + * + * It is not used on Xen paravirt. When paravirt support is needed, it + * needs to be renamed with native_ prefix. + */ +static noinstr unsigned long __rdgsbase_inactive(void) +{ + unsigned long gsbase; + + lockdep_assert_irqs_disabled(); + + if (!static_cpu_has(X86_FEATURE_XENPV)) { + native_swapgs(); + gsbase = rdgsbase(); + native_swapgs(); + } else { + instrumentation_begin(); + rdmsrl(MSR_KERNEL_GS_BASE, gsbase); + instrumentation_end(); + } + + return gsbase; +} + +/* + * Out of line to be protected from kprobes and tracing. If this would be + * traced or probed than any access to a per CPU variable happens with + * the wrong GS. + * + * It is not used on Xen paravirt. 
When paravirt support is needed, it + * needs to be renamed with native_ prefix. + */ +static noinstr void __wrgsbase_inactive(unsigned long gsbase) +{ + lockdep_assert_irqs_disabled(); + + if (!static_cpu_has(X86_FEATURE_XENPV)) { + native_swapgs(); + wrgsbase(gsbase); + native_swapgs(); + } else { + instrumentation_begin(); + wrmsrl(MSR_KERNEL_GS_BASE, gsbase); + instrumentation_end(); + } +} + +/* * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are * not available. The goal is to be reasonably fast on non-FSGSBASE systems. * It's forcibly inlined because it'll generate better code and this function @@ -198,22 +249,35 @@ static __always_inline void save_fsgs(struct task_struct *task) { savesegment(fs, task->thread.fsindex); savesegment(gs, task->thread.gsindex); - save_base_legacy(task, task->thread.fsindex, FS); - save_base_legacy(task, task->thread.gsindex, GS); + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + /* + * If FSGSBASE is enabled, we can't make any useful guesses + * about the base, and user code expects us to save the current + * value. Fortunately, reading the base directly is efficient. + */ + task->thread.fsbase = rdfsbase(); + task->thread.gsbase = __rdgsbase_inactive(); + } else { + save_base_legacy(task, task->thread.fsindex, FS); + save_base_legacy(task, task->thread.gsindex, GS); + } } -#if IS_ENABLED(CONFIG_KVM) /* * While a process is running,current->thread.fsbase and current->thread.gsbase - * may not match the corresponding CPU registers (see save_base_legacy()). KVM - * wants an efficient way to save and restore FSBASE and GSBASE. - * When FSGSBASE extensions are enabled, this will have to use RD{FS,GS}BASE. + * may not match the corresponding CPU registers (see save_base_legacy()). */ -void save_fsgs_for_kvm(void) +void current_save_fsgs(void) { + unsigned long flags; + + /* Interrupts need to be off for FSGSBASE */ + local_irq_save(flags); save_fsgs(current); + local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(save_fsgs_for_kvm); +#if IS_ENABLED(CONFIG_KVM) +EXPORT_SYMBOL_GPL(current_save_fsgs); #endif static __always_inline void loadseg(enum which_selector which, @@ -278,14 +342,26 @@ static __always_inline void load_seg_legacy(unsigned short prev_index, static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, struct thread_struct *next) { - load_seg_legacy(prev->fsindex, prev->fsbase, - next->fsindex, next->fsbase, FS); - load_seg_legacy(prev->gsindex, prev->gsbase, - next->gsindex, next->gsbase, GS); + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + /* Update the FS and GS selectors if they could have changed. */ + if (unlikely(prev->fsindex || next->fsindex)) + loadseg(FS, next->fsindex); + if (unlikely(prev->gsindex || next->gsindex)) + loadseg(GS, next->gsindex); + + /* Update the bases. 
*/ + wrfsbase(next->fsbase); + __wrgsbase_inactive(next->gsbase); + } else { + load_seg_legacy(prev->fsindex, prev->fsbase, + next->fsindex, next->fsbase, FS); + load_seg_legacy(prev->gsindex, prev->gsbase, + next->gsindex, next->gsbase, GS); + } } -static unsigned long x86_fsgsbase_read_task(struct task_struct *task, - unsigned short selector) +unsigned long x86_fsgsbase_read_task(struct task_struct *task, + unsigned short selector) { unsigned short idx = selector >> 3; unsigned long base; @@ -327,13 +403,44 @@ static unsigned long x86_fsgsbase_read_task(struct task_struct *task, return base; } +unsigned long x86_gsbase_read_cpu_inactive(void) +{ + unsigned long gsbase; + + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + unsigned long flags; + + local_irq_save(flags); + gsbase = __rdgsbase_inactive(); + local_irq_restore(flags); + } else { + rdmsrl(MSR_KERNEL_GS_BASE, gsbase); + } + + return gsbase; +} + +void x86_gsbase_write_cpu_inactive(unsigned long gsbase) +{ + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + unsigned long flags; + + local_irq_save(flags); + __wrgsbase_inactive(gsbase); + local_irq_restore(flags); + } else { + wrmsrl(MSR_KERNEL_GS_BASE, gsbase); + } +} + unsigned long x86_fsbase_read_task(struct task_struct *task) { unsigned long fsbase; if (task == current) fsbase = x86_fsbase_read_cpu(); - else if (task->thread.fsindex == 0) + else if (static_cpu_has(X86_FEATURE_FSGSBASE) || + (task->thread.fsindex == 0)) fsbase = task->thread.fsbase; else fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex); @@ -347,7 +454,8 @@ unsigned long x86_gsbase_read_task(struct task_struct *task) if (task == current) gsbase = x86_gsbase_read_cpu_inactive(); - else if (task->thread.gsindex == 0) + else if (static_cpu_has(X86_FEATURE_FSGSBASE) || + (task->thread.gsindex == 0)) gsbase = task->thread.gsbase; else gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 44130588987f..3f006489087f 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -281,17 +281,9 @@ static int set_segment_reg(struct task_struct *task, return -EIO; /* - * This function has some ABI oddities. - * - * A 32-bit ptracer probably expects that writing FS or GS will change - * FSBASE or GSBASE respectively. In the absence of FSGSBASE support, - * this code indeed has that effect. When FSGSBASE is added, this - * will require a special case. - * - * For existing 64-bit ptracers, writing FS or GS *also* currently - * changes the base if the selector is nonzero the next time the task - * is run. This behavior may not be needed, and trying to preserve it - * when FSGSBASE is added would be complicated at best. + * Writes to FS and GS will change the stored selector. Whether + * this changes the segment base as well depends on whether + * FSGSBASE is enabled. */ switch (offset) { @@ -379,25 +371,12 @@ static int putreg(struct task_struct *child, case offsetof(struct user_regs_struct,fs_base): if (value >= TASK_SIZE_MAX) return -EIO; - /* - * When changing the FS base, use do_arch_prctl_64() - * to set the index to zero and to set the base - * as requested. - * - * NB: This behavior is nonsensical and likely needs to - * change when FSGSBASE support is added. - */ - if (child->thread.fsbase != value) - return do_arch_prctl_64(child, ARCH_SET_FS, value); + x86_fsbase_write_task(child, value); return 0; case offsetof(struct user_regs_struct,gs_base): - /* - * Exactly the same here as the %fs handling above. 
- */ if (value >= TASK_SIZE_MAX) return -EIO; - if (child->thread.gsbase != value) - return do_arch_prctl_64(child, ARCH_SET_GS, value); + x86_gsbase_write_task(child, value); return 0; #endif } @@ -880,14 +859,39 @@ long arch_ptrace(struct task_struct *child, long request, static int putreg32(struct task_struct *child, unsigned regno, u32 value) { struct pt_regs *regs = task_pt_regs(child); + int ret; switch (regno) { SEG32(cs); SEG32(ds); SEG32(es); - SEG32(fs); - SEG32(gs); + + /* + * A 32-bit ptracer on a 64-bit kernel expects that writing + * FS or GS will also update the base. This is needed for + * operations like PTRACE_SETREGS to fully restore a saved + * CPU state. + */ + + case offsetof(struct user32, regs.fs): + ret = set_segment_reg(child, + offsetof(struct user_regs_struct, fs), + value); + if (ret == 0) + child->thread.fsbase = + x86_fsgsbase_read_task(child, value); + return ret; + + case offsetof(struct user32, regs.gs): + ret = set_segment_reg(child, + offsetof(struct user_regs_struct, gs), + value); + if (ret == 0) + child->thread.gsbase = + x86_fsgsbase_read_task(child, value); + return ret; + SEG32(ss); R32(ebx, bx); diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 896d74cb5081..1b10717c9321 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -95,7 +95,7 @@ static void ich_force_hpet_resume(void) static void ich_force_enable_hpet(struct pci_dev *dev) { u32 val; - u32 uninitialized_var(rcba); + u32 rcba; int err = 0; if (hpet_address || force_hpet_address) @@ -185,7 +185,7 @@ static void hpet_print_force_info(void) static void old_ich_force_hpet_resume(void) { u32 val; - u32 uninitialized_var(gen_cntl); + u32 gen_cntl; if (!force_hpet_address || !cached_dev) return; @@ -207,7 +207,7 @@ static void old_ich_force_hpet_resume(void) static void old_ich_force_enable_hpet(struct pci_dev *dev) { u32 val; - u32 uninitialized_var(gen_cntl); + u32 gen_cntl; if (hpet_address || force_hpet_address) return; @@ -298,7 +298,7 @@ static void vt8237_force_hpet_resume(void) static void vt8237_force_enable_hpet(struct pci_dev *dev) { - u32 uninitialized_var(val); + u32 val; if (hpet_address || force_hpet_address) return; @@ -429,7 +429,7 @@ static void nvidia_force_hpet_resume(void) static void nvidia_force_enable_hpet(struct pci_dev *dev) { - u32 uninitialized_var(val); + u32 val; if (hpet_address || force_hpet_address) return; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 399f97abee02..d5fa494c2304 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -25,6 +25,7 @@ #include <linux/user-return-notifier.h> #include <linux/uprobes.h> #include <linux/context_tracking.h> +#include <linux/entry-common.h> #include <linux/syscalls.h> #include <asm/processor.h> @@ -803,7 +804,7 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. 
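Because putreg() above now routes fs_base/gs_base writes through x86_fsbase_write_task() and x86_gsbase_write_task(), a 64-bit tracer can set a tracee's base registers without the old do_arch_prctl_64() detour. A minimal sketch against the established PTRACE_POKEUSER ABI (the tracee is assumed to be attached and stopped):

	#include <stddef.h>		/* offsetof() */
	#include <sys/ptrace.h>
	#include <sys/user.h>		/* struct user_regs_struct */
	#include <sys/types.h>

	/* Write a new FS base into a stopped tracee; it takes effect the
	 * next time the task runs, with or without FSGSBASE. */
	static long set_tracee_fsbase(pid_t pid, unsigned long fsbase)
	{
		return ptrace(PTRACE_POKEUSER, pid,
			      offsetof(struct user_regs_struct, fs_base),
			      fsbase);
	}
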
*/ -void do_signal(struct pt_regs *regs) +void arch_do_signal(struct pt_regs *regs) { struct ksignal ksig; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ffbd9a3d78d8..27aa04a95702 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -51,11 +51,11 @@ #include <linux/err.h> #include <linux/nmi.h> #include <linux/tboot.h> -#include <linux/stackprotector.h> #include <linux/gfp.h> #include <linux/cpuidle.h> #include <linux/numa.h> #include <linux/pgtable.h> +#include <linux/overflow.h> #include <asm/acpi.h> #include <asm/desc.h> @@ -80,6 +80,7 @@ #include <asm/cpu_device_id.h> #include <asm/spec-ctrl.h> #include <asm/hw_irq.h> +#include <asm/stackprotector.h> /* representing HT siblings of each logical CPU */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); @@ -259,21 +260,10 @@ static void notrace start_secondary(void *unused) /* enable local interrupts */ local_irq_enable(); - /* to prevent fake stack check failure in clock setup */ - boot_init_stack_canary(); - x86_cpuinit.setup_percpu_clockev(); wmb(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); - - /* - * Prevent tail call to cpu_startup_entry() because the stack protector - * guard has been changed a couple of function calls up, in - * boot_init_stack_canary() and must not be checked before tail calling - * another function. - */ - prevent_tail_call_optimization(); } /** @@ -1011,6 +1001,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) alternatives_enable_smp(); per_cpu(current_task, cpu) = idle; + cpu_init_stack_canary(cpu, idle); /* Initialize the interrupt stack(s) */ ret = irq_init_percpu_irqstack(cpu); @@ -1777,6 +1768,7 @@ void native_play_dead(void) #endif +#ifdef CONFIG_X86_64 /* * APERF/MPERF frequency ratio computation. * @@ -1975,6 +1967,7 @@ static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) static bool intel_set_max_freq_ratio(void) { u64 base_freq, turbo_freq; + u64 turbo_ratio; if (slv_set_max_freq_ratio(&base_freq, &turbo_freq)) goto out; @@ -2000,15 +1993,23 @@ out: /* * Some hypervisors advertise X86_FEATURE_APERFMPERF * but then fill all MSR's with zeroes. + * Some CPUs have turbo boost but don't declare any turbo ratio + * in MSR_TURBO_RATIO_LIMIT. 
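The ratio that intel_set_max_freq_ratio() publishes is plain 10-bit fixed point: turbo_freq scaled by SCHED_CAPACITY_SCALE (1 << 10) over base_freq. A stand-alone sketch of that arithmetic with the new zero checks, for illustration only (names are local to this example):

#include <stdint.h>

#define SCHED_CAPACITY_SCALE	(1ULL << 10)

/* Returns 0 when the MSR data is unusable, mirroring the bail-outs. */
static uint64_t max_freq_ratio(uint64_t base_freq, uint64_t turbo_freq)
{
	if (!base_freq || !turbo_freq)	/* zero-filled MSRs */
		return 0;

	return turbo_freq * SCHED_CAPACITY_SCALE / base_freq;
}

A 2.0 GHz base with a 3.0 GHz turbo yields 1536, i.e. 1.5 in this fixed-point format.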
*/ - if (!base_freq) { - pr_debug("Couldn't determine cpu base frequency, necessary for scale-invariant accounting.\n"); + if (!base_freq || !turbo_freq) { + pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n"); return false; } - arch_turbo_freq_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, - base_freq); + turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq); + if (!turbo_ratio) { + pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n"); + return false; + } + + arch_turbo_freq_ratio = turbo_ratio; arch_set_max_freq_ratio(turbo_disabled()); + return true; } @@ -2048,11 +2049,19 @@ static void init_freq_invariance(bool secondary) } } +static void disable_freq_invariance_workfn(struct work_struct *work) +{ + static_branch_disable(&arch_scale_freq_key); +} + +static DECLARE_WORK(disable_freq_invariance_work, + disable_freq_invariance_workfn); + DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; void arch_scale_freq_tick(void) { - u64 freq_scale; + u64 freq_scale = SCHED_CAPACITY_SCALE; u64 aperf, mperf; u64 acnt, mcnt; @@ -2064,19 +2073,32 @@ void arch_scale_freq_tick(void) acnt = aperf - this_cpu_read(arch_prev_aperf); mcnt = mperf - this_cpu_read(arch_prev_mperf); - if (!mcnt) - return; this_cpu_write(arch_prev_aperf, aperf); this_cpu_write(arch_prev_mperf, mperf); - acnt <<= 2*SCHED_CAPACITY_SHIFT; - mcnt *= arch_max_freq_ratio; + if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) + goto error; + + if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt) + goto error; freq_scale = div64_u64(acnt, mcnt); + if (!freq_scale) + goto error; if (freq_scale > SCHED_CAPACITY_SCALE) freq_scale = SCHED_CAPACITY_SCALE; this_cpu_write(arch_freq_scale, freq_scale); + return; + +error: + pr_warn("Scheduler frequency invariance went wobbly, disabling!\n"); + schedule_work(&disable_freq_invariance_work); +} +#else +static inline void init_freq_invariance(bool secondary) +{ } +#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 6ad43fc44556..2fd698e28e4d 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -58,7 +58,6 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, * or a page fault), which can make frame pointers * unreliable. */ - if (IS_ENABLED(CONFIG_FRAME_POINTER)) return -EINVAL; } @@ -81,10 +80,6 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, if (unwind_error(&state)) return -EINVAL; - /* Success path for non-user tasks, i.e. 
kthreads and idle tasks */ - if (!(task->flags & (PF_KTHREAD | PF_IDLE))) - return -EINVAL; - return 0; } diff --git a/arch/x86/kernel/sys_ia32.c b/arch/x86/kernel/sys_ia32.c index f8d65c99feb8..720cde885042 100644 --- a/arch/x86/kernel/sys_ia32.c +++ b/arch/x86/kernel/sys_ia32.c @@ -251,9 +251,6 @@ COMPAT_SYSCALL_DEFINE5(ia32_clone, unsigned long, clone_flags, .tls = tls_val, }; - if (!legacy_clone_args_valid(&args)) - return -EINVAL; - return _do_fork(&args); } #endif /* CONFIG_IA32_EMULATION */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b7cb3e0716f7..438fc554d48d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -245,7 +245,7 @@ static noinstr bool handle_bug(struct pt_regs *regs) DEFINE_IDTENTRY_RAW(exc_invalid_op) { - bool rcu_exit; + irqentry_state_t state; /* * We use UD2 as a short encoding for 'CALL __WARN', as such @@ -255,11 +255,11 @@ DEFINE_IDTENTRY_RAW(exc_invalid_op) if (!user_mode(regs) && handle_bug(regs)) return; - rcu_exit = idtentry_enter_cond_rcu(regs); + state = irqentry_enter(regs); instrumentation_begin(); handle_invalid_op(regs); instrumentation_end(); - idtentry_exit_cond_rcu(regs, rcu_exit); + irqentry_exit(regs, state); } DEFINE_IDTENTRY(exc_coproc_segment_overrun) @@ -405,7 +405,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault) } #endif - nmi_enter(); + idtentry_enter_nmi(regs); instrumentation_begin(); notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); @@ -638,28 +638,25 @@ DEFINE_IDTENTRY_RAW(exc_int3) return; /* - * idtentry_enter_user() uses static_branch_{,un}likely() and therefore - * can trigger INT3, hence poke_int3_handler() must be done - * before. If the entry came from kernel mode, then use nmi_enter() - * because the INT3 could have been hit in any context including - * NMI. + * irqentry_enter_from_user_mode() uses static_branch_{,un}likely() + * and therefore can trigger INT3, hence poke_int3_handler() must + * be done before. If the entry came from kernel mode, then use + * nmi_enter() because the INT3 could have been hit in any context + * including NMI. 
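The exc_invalid_op() conversion above is the general template for RAW idtentries after this change: the opaque state returned by irqentry_enter() is threaded through to irqentry_exit(), and anything instrumentable sits between the instrumentation markers. A sketch of the shape, with a hypothetical handler and helper name:

DEFINE_IDTENTRY_RAW(exc_example)	/* hypothetical */
{
	irqentry_state_t state = irqentry_enter(regs);

	instrumentation_begin();
	handle_example(regs);		/* hypothetical helper */
	instrumentation_end();

	irqentry_exit(regs, state);
}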
*/ if (user_mode(regs)) { - idtentry_enter_user(regs); + irqentry_enter_from_user_mode(regs); instrumentation_begin(); do_int3_user(regs); instrumentation_end(); - idtentry_exit_user(regs); + irqentry_exit_to_user_mode(regs); } else { - nmi_enter(); + bool irq_state = idtentry_enter_nmi(regs); instrumentation_begin(); - trace_hardirqs_off_finish(); if (!do_int3(regs)) die("int3", regs, 0); - if (regs->flags & X86_EFLAGS_IF) - trace_hardirqs_on_prepare(); instrumentation_end(); - nmi_exit(); + idtentry_exit_nmi(regs, irq_state); } } @@ -867,9 +864,8 @@ out: static __always_inline void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6) { - nmi_enter(); + bool irq_state = idtentry_enter_nmi(regs); instrumentation_begin(); - trace_hardirqs_off_finish(); /* * If something gets miswired and we end up here for a user mode @@ -886,10 +882,8 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, handle_debug(regs, dr6, false); - if (regs->flags & X86_EFLAGS_IF) - trace_hardirqs_on_prepare(); instrumentation_end(); - nmi_exit(); + idtentry_exit_nmi(regs, irq_state); } static __always_inline void exc_debug_user(struct pt_regs *regs, @@ -901,12 +895,13 @@ static __always_inline void exc_debug_user(struct pt_regs *regs, */ WARN_ON_ONCE(!user_mode(regs)); - idtentry_enter_user(regs); + irqentry_enter_from_user_mode(regs); instrumentation_begin(); handle_debug(regs, dr6, true); + instrumentation_end(); - idtentry_exit_user(regs); + irqentry_exit_to_user_mode(regs); } #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 722a85f3b2dd..d7c44b257f7f 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -269,13 +269,13 @@ bool unwind_next_frame(struct unwind_state *state) /* * kthreads (other than the boot CPU's idle thread) have some * partial regs at the end of their stack which were placed - * there by copy_thread_tls(). But the regs don't have any + * there by copy_thread(). But the regs don't have any * useful information, so we can skip them. * * This user_mode() check is slightly broader than a PF_KTHREAD * check because it also catches the awkward situation where a * newly forked kthread transitions into a user task by calling - * do_execve(), which eventually clears PF_KTHREAD. + * kernel_execve(), which eventually clears PF_KTHREAD. */ if (!user_mode(regs)) goto the_end; diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 7f969b2d240f..ec88bbe08a32 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -440,8 +440,11 @@ bool unwind_next_frame(struct unwind_state *state) /* * Find the orc_entry associated with the text address. * - * Decrement call return addresses by one so they work for sibling - * calls and calls to noreturn functions. + * For a call frame (as opposed to a signal frame), state->ip points to + * the instruction after the call. That instruction's stack layout + * could be different from the call instruction's layout, for example + * if the call was to a noreturn function. So get the ORC data for the + * call instruction itself. */ orc = orc_find(state->signal ? 
state->ip : state->ip - 1); if (!orc) { @@ -662,6 +665,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, state->sp = task->thread.sp; state->bp = READ_ONCE_NOCHECK(frame->bp); state->ip = READ_ONCE_NOCHECK(frame->ret_addr); + state->signal = (void *)state->ip == ret_from_fork; } if (get_stack_info((unsigned long *)state->sp, state->task, diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 3bfc8dd8a43d..9a03e5b23135 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -358,6 +358,7 @@ SECTIONS .bss : AT(ADDR(.bss) - LOAD_OFFSET) { __bss_start = .; *(.bss..page_aligned) + . = ALIGN(PAGE_SIZE); *(BSS_MAIN) BSS_DECRYPTED . = ALIGN(PAGE_SIZE);
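To make the orc_find() rule above concrete: a call frame's ip is the return address, i.e. the first byte after the CALL, so the call instruction's own ORC entry lives at ip - 1, while signal frames (and, with the __unwind_start hunk, ret_from_fork children) record the interrupted instruction itself and are looked up as-is. A hypothetical helper expressing just that:

static struct orc_entry *orc_find_for_frame(struct unwind_state *state)
{
	/* Return addresses point past the CALL; step back into it. */
	return orc_find(state->signal ? state->ip : state->ip - 1);
}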