diff options
Diffstat (limited to 'arch/x86/kernel')
68 files changed, 1471 insertions, 1013 deletions
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 8dcbf6890714..9fc92e4539d8 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -197,7 +197,7 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) } static int __init -acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) +acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_local_x2apic *processor = NULL; #ifdef CONFIG_X86_X2APIC @@ -210,7 +210,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) if (BAD_MADT_ENTRY(processor, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + acpi_table_print_madt_entry(&header->common); #ifdef CONFIG_X86_X2APIC apic_id = processor->local_apic_id; @@ -242,7 +242,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) } static int __init -acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end) +acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_local_apic *processor = NULL; @@ -251,7 +251,7 @@ acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end) if (BAD_MADT_ENTRY(processor, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + acpi_table_print_madt_entry(&header->common); /* Ignore invalid ID */ if (processor->id == 0xff) @@ -272,7 +272,7 @@ acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end) } static int __init -acpi_parse_sapic(struct acpi_subtable_header *header, const unsigned long end) +acpi_parse_sapic(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_local_sapic *processor = NULL; @@ -281,7 +281,7 @@ acpi_parse_sapic(struct acpi_subtable_header *header, const unsigned long end) if (BAD_MADT_ENTRY(processor, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + acpi_table_print_madt_entry(&header->common); acpi_register_lapic((processor->id << 8) | processor->eid,/* APIC ID */ processor->processor_id, /* ACPI ID */ @@ -291,7 +291,7 @@ acpi_parse_sapic(struct acpi_subtable_header *header, const unsigned long end) } static int __init -acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, +acpi_parse_lapic_addr_ovr(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_local_apic_override *lapic_addr_ovr = NULL; @@ -301,7 +301,7 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, if (BAD_MADT_ENTRY(lapic_addr_ovr, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + acpi_table_print_madt_entry(&header->common); acpi_lapic_addr = lapic_addr_ovr->address; @@ -309,7 +309,7 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, } static int __init -acpi_parse_x2apic_nmi(struct acpi_subtable_header *header, +acpi_parse_x2apic_nmi(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_local_x2apic_nmi *x2apic_nmi = NULL; @@ -319,7 +319,7 @@ acpi_parse_x2apic_nmi(struct acpi_subtable_header *header, if (BAD_MADT_ENTRY(x2apic_nmi, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + acpi_table_print_madt_entry(&header->common); if (x2apic_nmi->lint != 1) printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n"); @@ -328,7 +328,7 @@ acpi_parse_x2apic_nmi(struct acpi_subtable_header *header, } static int __init -acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long end) +acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_local_apic_nmi *lapic_nmi = NULL; @@ -337,7 +337,7 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e if (BAD_MADT_ENTRY(lapic_nmi, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + acpi_table_print_madt_entry(&header->common); if (lapic_nmi->lint != 1) printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n"); @@ -449,7 +449,7 @@ static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity, } static int __init -acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end) +acpi_parse_ioapic(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_io_apic *ioapic = NULL; struct ioapic_domain_cfg cfg = { @@ -462,7 +462,7 @@ acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end) if (BAD_MADT_ENTRY(ioapic, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + acpi_table_print_madt_entry(&header->common); /* Statically assign IRQ numbers for IOAPICs hosting legacy IRQs */ if (ioapic->global_irq_base < nr_legacy_irqs()) @@ -508,7 +508,7 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, } static int __init -acpi_parse_int_src_ovr(struct acpi_subtable_header * header, +acpi_parse_int_src_ovr(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_interrupt_override *intsrc = NULL; @@ -518,7 +518,7 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header, if (BAD_MADT_ENTRY(intsrc, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + acpi_table_print_madt_entry(&header->common); if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) { acpi_sci_ioapic_setup(intsrc->source_irq, @@ -550,7 +550,7 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header, } static int __init -acpi_parse_nmi_src(struct acpi_subtable_header * header, const unsigned long end) +acpi_parse_nmi_src(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_nmi_source *nmi_src = NULL; @@ -559,7 +559,7 @@ acpi_parse_nmi_src(struct acpi_subtable_header * header, const unsigned long end if (BAD_MADT_ENTRY(nmi_src, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + acpi_table_print_madt_entry(&header->common); /* TBD: Support nimsrc entries? */ diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 158ad1483c43..cb6e076a6d39 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -51,6 +51,18 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, if (c->x86_vendor == X86_VENDOR_INTEL && (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f))) flags->bm_control = 0; + /* + * For all recent Centaur CPUs, the ucode will make sure that each + * core can keep cache coherence with each other while entering C3 + * type state. So, set bm_check to 1 to indicate that the kernel + * doesn't need to execute a cache flush operation (WBINVD) when + * entering C3 type state. + */ + if (c->x86_vendor == X86_VENDOR_CENTAUR) { + if (c->x86 > 6 || (c->x86 == 6 && c->x86_model == 0x0f && + c->x86_stepping >= 0x0e)) + flags->bm_check = 1; + } } EXPORT_SYMBOL(acpi_processor_power_init_bm_check); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 9a79c7808f9c..7b9b49dfc05a 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -12,6 +12,7 @@ #include <linux/slab.h> #include <linux/kdebug.h> #include <linux/kprobes.h> +#include <linux/mmu_context.h> #include <asm/text-patching.h> #include <asm/alternative.h> #include <asm/sections.h> @@ -264,7 +265,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; -void *text_poke_early(void *addr, const void *opcode, size_t len); +void text_poke_early(void *addr, const void *opcode, size_t len); /* * Are we looking at a near JMP with a 1 or 4-byte displacement. @@ -666,16 +667,136 @@ void __init alternative_instructions(void) * instructions. And on the local CPU you need to be protected again NMI or MCE * handlers seeing an inconsistent instruction while you patch. */ -void *__init_or_module text_poke_early(void *addr, const void *opcode, - size_t len) +void __init_or_module text_poke_early(void *addr, const void *opcode, + size_t len) { unsigned long flags; + + if (boot_cpu_has(X86_FEATURE_NX) && + is_module_text_address((unsigned long)addr)) { + /* + * Modules text is marked initially as non-executable, so the + * code cannot be running and speculative code-fetches are + * prevented. Just change the code. + */ + memcpy(addr, opcode, len); + } else { + local_irq_save(flags); + memcpy(addr, opcode, len); + local_irq_restore(flags); + sync_core(); + + /* + * Could also do a CLFLUSH here to speed up CPU recovery; but + * that causes hangs on some VIA CPUs. + */ + } +} + +__ro_after_init struct mm_struct *poking_mm; +__ro_after_init unsigned long poking_addr; + +static void *__text_poke(void *addr, const void *opcode, size_t len) +{ + bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; + struct page *pages[2] = {NULL}; + temp_mm_state_t prev; + unsigned long flags; + pte_t pte, *ptep; + spinlock_t *ptl; + pgprot_t pgprot; + + /* + * While boot memory allocator is running we cannot use struct pages as + * they are not yet initialized. There is no way to recover. + */ + BUG_ON(!after_bootmem); + + if (!core_kernel_text((unsigned long)addr)) { + pages[0] = vmalloc_to_page(addr); + if (cross_page_boundary) + pages[1] = vmalloc_to_page(addr + PAGE_SIZE); + } else { + pages[0] = virt_to_page(addr); + WARN_ON(!PageReserved(pages[0])); + if (cross_page_boundary) + pages[1] = virt_to_page(addr + PAGE_SIZE); + } + /* + * If something went wrong, crash and burn since recovery paths are not + * implemented. + */ + BUG_ON(!pages[0] || (cross_page_boundary && !pages[1])); + local_irq_save(flags); - memcpy(addr, opcode, len); + + /* + * Map the page without the global bit, as TLB flushing is done with + * flush_tlb_mm_range(), which is intended for non-global PTEs. + */ + pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL); + + /* + * The lock is not really needed, but this allows to avoid open-coding. + */ + ptep = get_locked_pte(poking_mm, poking_addr, &ptl); + + /* + * This must not fail; preallocated in poking_init(). + */ + VM_BUG_ON(!ptep); + + pte = mk_pte(pages[0], pgprot); + set_pte_at(poking_mm, poking_addr, ptep, pte); + + if (cross_page_boundary) { + pte = mk_pte(pages[1], pgprot); + set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte); + } + + /* + * Loading the temporary mm behaves as a compiler barrier, which + * guarantees that the PTE will be set at the time memcpy() is done. + */ + prev = use_temporary_mm(poking_mm); + + kasan_disable_current(); + memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len); + kasan_enable_current(); + + /* + * Ensure that the PTE is only cleared after the instructions of memcpy + * were issued by using a compiler barrier. + */ + barrier(); + + pte_clear(poking_mm, poking_addr, ptep); + if (cross_page_boundary) + pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1); + + /* + * Loading the previous page-table hierarchy requires a serializing + * instruction that already allows the core to see the updated version. + * Xen-PV is assumed to serialize execution in a similar manner. + */ + unuse_temporary_mm(prev); + + /* + * Flushing the TLB might involve IPIs, which would require enabled + * IRQs, but not if the mm is not used, as it is in this point. + */ + flush_tlb_mm_range(poking_mm, poking_addr, poking_addr + + (cross_page_boundary ? 2 : 1) * PAGE_SIZE, + PAGE_SHIFT, false); + + /* + * If the text does not match what we just wrote then something is + * fundamentally screwy; there's nothing we can really do about that. + */ + BUG_ON(memcmp(addr, opcode, len)); + + pte_unmap_unlock(ptep, ptl); local_irq_restore(flags); - sync_core(); - /* Could also do a CLFLUSH here to speed up CPU recovery; but - that causes hangs on some VIA CPUs. */ return addr; } @@ -689,48 +810,36 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode, * It means the size must be writable atomically and the address must be aligned * in a way that permits an atomic write. It also makes sure we fit on a single * page. + * + * Note that the caller must ensure that if the modified code is part of a + * module, the module would not be removed during poking. This can be achieved + * by registering a module notifier, and ordering module removal and patching + * trough a mutex. */ void *text_poke(void *addr, const void *opcode, size_t len) { - unsigned long flags; - char *vaddr; - struct page *pages[2]; - int i; - - /* - * While boot memory allocator is runnig we cannot use struct - * pages as they are not yet initialized. - */ - BUG_ON(!after_bootmem); - lockdep_assert_held(&text_mutex); - if (!core_kernel_text((unsigned long)addr)) { - pages[0] = vmalloc_to_page(addr); - pages[1] = vmalloc_to_page(addr + PAGE_SIZE); - } else { - pages[0] = virt_to_page(addr); - WARN_ON(!PageReserved(pages[0])); - pages[1] = virt_to_page(addr + PAGE_SIZE); - } - BUG_ON(!pages[0]); - local_irq_save(flags); - set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0])); - if (pages[1]) - set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1])); - vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0); - memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len); - clear_fixmap(FIX_TEXT_POKE0); - if (pages[1]) - clear_fixmap(FIX_TEXT_POKE1); - local_flush_tlb(); - sync_core(); - /* Could also do a CLFLUSH here to speed up CPU recovery; but - that causes hangs on some VIA CPUs. */ - for (i = 0; i < len; i++) - BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]); - local_irq_restore(flags); - return addr; + return __text_poke(addr, opcode, len); +} + +/** + * text_poke_kgdb - Update instructions on a live kernel by kgdb + * @addr: address to modify + * @opcode: source of the copy + * @len: length to copy + * + * Only atomic text poke/set should be allowed when not doing early patching. + * It means the size must be writable atomically and the address must be aligned + * in a way that permits an atomic write. It also makes sure we fit on a single + * page. + * + * Context: should only be used by kgdb, which ensures no other core is running, + * despite the fact it does not hold the text_mutex. + */ +void *text_poke_kgdb(void *addr, const void *opcode, size_t len) +{ + return __text_poke(addr, opcode, len); } static void do_sync_core(void *info) @@ -788,7 +897,7 @@ NOKPROBE_SYMBOL(poke_int3_handler); * replacing opcode * - sync cores */ -void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) +void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) { unsigned char int3 = 0xcc; @@ -830,7 +939,5 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) * the writing of the new instruction. */ bp_patching_in_progress = false; - - return addr; } diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 58176b56354e..294ed4392a0e 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -14,6 +14,7 @@ #define pr_fmt(fmt) "AGP: " fmt #include <linux/kernel.h> +#include <linux/kcore.h> #include <linux/types.h> #include <linux/init.h> #include <linux/memblock.h> @@ -57,7 +58,7 @@ int fallback_aper_force __initdata; int fix_aperture __initdata = 1; -#ifdef CONFIG_PROC_VMCORE +#if defined(CONFIG_PROC_VMCORE) || defined(CONFIG_PROC_KCORE) /* * If the first kernel maps the aperture over e820 RAM, the kdump kernel will * use the same range because it will remain configured in the northbridge. @@ -66,20 +67,25 @@ int fix_aperture __initdata = 1; */ static unsigned long aperture_pfn_start, aperture_page_count; -static int gart_oldmem_pfn_is_ram(unsigned long pfn) +static int gart_mem_pfn_is_ram(unsigned long pfn) { return likely((pfn < aperture_pfn_start) || (pfn >= aperture_pfn_start + aperture_page_count)); } -static void exclude_from_vmcore(u64 aper_base, u32 aper_order) +static void __init exclude_from_core(u64 aper_base, u32 aper_order) { aperture_pfn_start = aper_base >> PAGE_SHIFT; aperture_page_count = (32 * 1024 * 1024) << aper_order >> PAGE_SHIFT; - WARN_ON(register_oldmem_pfn_is_ram(&gart_oldmem_pfn_is_ram)); +#ifdef CONFIG_PROC_VMCORE + WARN_ON(register_oldmem_pfn_is_ram(&gart_mem_pfn_is_ram)); +#endif +#ifdef CONFIG_PROC_KCORE + WARN_ON(register_mem_pfn_is_ram(&gart_mem_pfn_is_ram)); +#endif } #else -static void exclude_from_vmcore(u64 aper_base, u32 aper_order) +static void exclude_from_core(u64 aper_base, u32 aper_order) { } #endif @@ -474,7 +480,7 @@ out: * may have allocated the range over its e820 RAM * and fixed up the northbridge */ - exclude_from_vmcore(last_aper_base, last_aper_order); + exclude_from_core(last_aper_base, last_aper_order); return 1; } @@ -520,7 +526,7 @@ out: * overlap with the first kernel's memory. We can't access the * range through vmcore even though it should be part of the dump. */ - exclude_from_vmcore(aper_alloc, aper_order); + exclude_from_core(aper_alloc, aper_order); /* Fix up the north bridges */ for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) { diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b7bcdd781651..ab6af775f06c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -802,6 +802,24 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) return 0; } +static int __init lapic_init_clockevent(void) +{ + if (!lapic_timer_frequency) + return -1; + + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR, + TICK_NSEC, lapic_clockevent.shift); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent); + lapic_clockevent.max_delta_ticks = 0x7FFFFFFF; + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + lapic_clockevent.min_delta_ticks = 0xF; + + return 0; +} + static int __init calibrate_APIC_clock(void) { struct clock_event_device *levt = this_cpu_ptr(&lapic_events); @@ -810,25 +828,21 @@ static int __init calibrate_APIC_clock(void) long delta, deltatsc; int pm_referenced = 0; - /** - * check if lapic timer has already been calibrated by platform - * specific routine, such as tsc calibration code. if so, we just fill + if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + return 0; + + /* + * Check if lapic timer has already been calibrated by platform + * specific routine, such as tsc calibration code. If so just fill * in the clockevent structure and return. */ - - if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) { - return 0; - } else if (lapic_timer_frequency) { + if (!lapic_init_clockevent()) { apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n", - lapic_timer_frequency); - lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR, - TICK_NSEC, lapic_clockevent.shift); - lapic_clockevent.max_delta_ns = - clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); - lapic_clockevent.max_delta_ticks = 0x7FFFFF; - lapic_clockevent.min_delta_ns = - clockevent_delta2ns(0xF, &lapic_clockevent); - lapic_clockevent.min_delta_ticks = 0xF; + lapic_timer_frequency); + /* + * Direct calibration methods must have an always running + * local APIC timer, no need for broadcast timer. + */ lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; return 0; } @@ -869,17 +883,8 @@ static int __init calibrate_APIC_clock(void) pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1, &delta, &deltatsc); - /* Calculate the scaled math multiplication factor */ - lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, - lapic_clockevent.shift); - lapic_clockevent.max_delta_ns = - clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent); - lapic_clockevent.max_delta_ticks = 0x7FFFFFFF; - lapic_clockevent.min_delta_ns = - clockevent_delta2ns(0xF, &lapic_clockevent); - lapic_clockevent.min_delta_ticks = 0xF; - lapic_timer_frequency = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; + lapic_init_clockevent(); apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult); diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 78778b54f904..a5464b8b6c46 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -175,7 +175,7 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) this_cpu_write(cpu_llc_id, node); /* Account for nodes per socket in multi-core-module processors */ - if (static_cpu_has(X86_FEATURE_NODEID_MSR)) { + if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { rdmsrl(MSR_FAM10H_NODE_ID, val); nodes = ((val >> 3) & 7) + 1; } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index ddced33184b5..d3d075226c0a 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -68,10 +68,12 @@ int main(void) #undef ENTRY OFFSET(TSS_ist, tss_struct, x86_tss.ist); + DEFINE(DB_STACK_OFFSET, offsetof(struct cea_exception_stacks, DB_stack) - + offsetof(struct cea_exception_stacks, DB1_stack)); BLANK(); #ifdef CONFIG_STACKPROTECTOR - DEFINE(stack_canary_offset, offsetof(union irq_stack_union, stack_canary)); + DEFINE(stack_canary_offset, offsetof(struct fixed_percpu_data, stack_canary)); BLANK(); #endif diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index cfd24f9f7614..1796d2bdcaaa 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -28,7 +28,7 @@ obj-y += cpuid-deps.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel.o intel_pconfig.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel.o intel_pconfig.o intel_epb.o obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_HYGON) += hygon.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 01004bfb1a1b..fb6a64bd765f 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -82,11 +82,14 @@ static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) * performance at the same time.. */ +#ifdef CONFIG_X86_32 extern __visible void vide(void); -__asm__(".globl vide\n" +__asm__(".text\n" + ".globl vide\n" ".type vide, @function\n" ".align 4\n" "vide: ret\n"); +#endif static void init_amd_k5(struct cpuinfo_x86 *c) { diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 804c49493938..64d5aec24203 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -83,7 +83,7 @@ unsigned int aperfmperf_get_khz(int cpu) if (!cpu_khz) return 0; - if (!static_cpu_has(X86_FEATURE_APERFMPERF)) + if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) return 0; aperfmperf_snapshot_cpu(cpu, ktime_get(), true); @@ -99,7 +99,7 @@ void arch_freq_prepare_all(void) if (!cpu_khz) return; - if (!static_cpu_has(X86_FEATURE_APERFMPERF)) + if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) return; for_each_online_cpu(cpu) @@ -115,7 +115,7 @@ unsigned int arch_freq_get_on_cpu(int cpu) if (!cpu_khz) return 0; - if (!static_cpu_has(X86_FEATURE_APERFMPERF)) + if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) return 0; if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true)) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 2da82eff0eb4..29630393f300 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -275,7 +275,7 @@ static const struct { const char *option; enum spectre_v2_user_cmd cmd; bool secure; -} v2_user_options[] __initdata = { +} v2_user_options[] __initconst = { { "auto", SPECTRE_V2_USER_CMD_AUTO, false }, { "off", SPECTRE_V2_USER_CMD_NONE, false }, { "on", SPECTRE_V2_USER_CMD_FORCE, true }, @@ -419,7 +419,7 @@ static const struct { const char *option; enum spectre_v2_mitigation_cmd cmd; bool secure; -} mitigation_options[] __initdata = { +} mitigation_options[] __initconst = { { "off", SPECTRE_V2_CMD_NONE, false }, { "on", SPECTRE_V2_CMD_FORCE, true }, { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, @@ -440,7 +440,8 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) char arg[20]; int ret, i; - if (cmdline_find_option_bool(boot_command_line, "nospectre_v2")) + if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") || + cpu_mitigations_off()) return SPECTRE_V2_CMD_NONE; ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); @@ -658,7 +659,7 @@ static const char * const ssb_strings[] = { static const struct { const char *option; enum ssb_mitigation_cmd cmd; -} ssb_mitigation_options[] __initdata = { +} ssb_mitigation_options[] __initconst = { { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */ { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */ { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */ @@ -672,7 +673,8 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) char arg[20]; int ret, i; - if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable")) { + if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") || + cpu_mitigations_off()) { return SPEC_STORE_BYPASS_CMD_NONE; } else { ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable", @@ -1008,6 +1010,11 @@ static void __init l1tf_select_mitigation(void) if (!boot_cpu_has_bug(X86_BUG_L1TF)) return; + if (cpu_mitigations_off()) + l1tf_mitigation = L1TF_MITIGATION_OFF; + else if (cpu_mitigations_auto_nosmt()) + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; + override_cache_bits(&boot_cpu_data); switch (l1tf_mitigation) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cb28e98a0659..8739bdfe9bdf 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -372,6 +372,8 @@ static bool pku_disabled; static __always_inline void setup_pku(struct cpuinfo_x86 *c) { + struct pkru_state *pk; + /* check the boot processor, plus compile options for PKU: */ if (!cpu_feature_enabled(X86_FEATURE_PKU)) return; @@ -382,6 +384,9 @@ static __always_inline void setup_pku(struct cpuinfo_x86 *c) return; cr4_set_bits(X86_CR4_PKE); + pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU); + if (pk) + pk->pkru = init_pkru_value; /* * Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE * cpuid bit to be set. We need to ensure that we @@ -507,19 +512,6 @@ void load_percpu_segment(int cpu) DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); #endif -#ifdef CONFIG_X86_64 -/* - * Special IST stacks which the CPU switches to when it calls - * an IST-marked descriptor entry. Up to 7 stacks (hardware - * limit), all of them are 4K, except the debug stack which - * is 8K. - */ -static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, - [DEBUG_STACK - 1] = DEBUG_STKSZ -}; -#endif - /* Load the original GDT from the per-cpu structure */ void load_direct_gdt(int cpu) { @@ -1511,9 +1503,9 @@ static __init int setup_clearcpuid(char *arg) __setup("clearcpuid=", setup_clearcpuid); #ifdef CONFIG_X86_64 -DEFINE_PER_CPU_FIRST(union irq_stack_union, - irq_stack_union) __aligned(PAGE_SIZE) __visible; -EXPORT_PER_CPU_SYMBOL_GPL(irq_stack_union); +DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, + fixed_percpu_data) __aligned(PAGE_SIZE) __visible; +EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data); /* * The following percpu variables are hot. Align current_task to @@ -1523,9 +1515,7 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); -DEFINE_PER_CPU(char *, irq_stack_ptr) = - init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE; - +DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; @@ -1562,23 +1552,7 @@ void syscall_init(void) X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT); } -/* - * Copies of the original ist values from the tss are only accessed during - * debugging, no special alignment required. - */ -DEFINE_PER_CPU(struct orig_ist, orig_ist); - -static DEFINE_PER_CPU(unsigned long, debug_stack_addr); DEFINE_PER_CPU(int, debug_stack_usage); - -int is_debug_stack(unsigned long addr) -{ - return __this_cpu_read(debug_stack_usage) || - (addr <= __this_cpu_read(debug_stack_addr) && - addr > (__this_cpu_read(debug_stack_addr) - DEBUG_STKSZ)); -} -NOKPROBE_SYMBOL(is_debug_stack); - DEFINE_PER_CPU(u32, debug_idt_ctr); void debug_stack_set_zero(void) @@ -1668,7 +1642,7 @@ static void setup_getcpu(int cpu) unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu)); struct desc_struct d = { }; - if (static_cpu_has(X86_FEATURE_RDTSCP)) + if (boot_cpu_has(X86_FEATURE_RDTSCP)) write_rdtscp_aux(cpudata); /* Store CPU and node number in limit. */ @@ -1690,17 +1664,14 @@ static void setup_getcpu(int cpu) * initialized (naturally) in the bootstrap process, such as the GDT * and IDT. We reload them nevertheless, this function acts as a * 'CPU state barrier', nothing should get across. - * A lot of state is already set up in PDA init for 64 bit */ #ifdef CONFIG_X86_64 void cpu_init(void) { - struct orig_ist *oist; + int cpu = raw_smp_processor_id(); struct task_struct *me; struct tss_struct *t; - unsigned long v; - int cpu = raw_smp_processor_id(); int i; wait_for_master_cpu(cpu); @@ -1715,7 +1686,6 @@ void cpu_init(void) load_ucode_ap(); t = &per_cpu(cpu_tss_rw, cpu); - oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA if (this_cpu_read(numa_node) == 0 && @@ -1753,16 +1723,11 @@ void cpu_init(void) /* * set up and load the per-CPU TSS */ - if (!oist->ist[0]) { - char *estacks = get_cpu_entry_area(cpu)->exception_stacks; - - for (v = 0; v < N_EXCEPTION_STACKS; v++) { - estacks += exception_stack_sizes[v]; - oist->ist[v] = t->x86_tss.ist[v] = - (unsigned long)estacks; - if (v == DEBUG_STACK-1) - per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks; - } + if (!t->x86_tss.ist[0]) { + t->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF); + t->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI); + t->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB); + t->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE); } t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; @@ -1864,23 +1829,6 @@ void cpu_init(void) } #endif -static void bsp_resume(void) -{ - if (this_cpu->c_bsp_resume) - this_cpu->c_bsp_resume(&boot_cpu_data); -} - -static struct syscore_ops cpu_syscore_ops = { - .resume = bsp_resume, -}; - -static int __init init_cpu_syscore(void) -{ - register_syscore_ops(&cpu_syscore_ops); - return 0; -} -core_initcall(init_cpu_syscore); - /* * The microcode loader calls this upon late microcode load to recheck features, * only when microcode has been updated. Caller holds microcode_mutex and CPU diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 5eb946b9a9f3..c0e2407abdd6 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -14,7 +14,6 @@ struct cpu_dev { void (*c_init)(struct cpuinfo_x86 *); void (*c_identify)(struct cpuinfo_x86 *); void (*c_detect_tlb)(struct cpuinfo_x86 *); - void (*c_bsp_resume)(struct cpuinfo_x86 *); int c_x86_vendor; #ifdef CONFIG_X86_32 /* Optional vendor specific routine to obtain the cache size. */ diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index d12226f60168..1d9b8aaea06c 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -124,7 +124,7 @@ static void set_cx86_reorder(void) setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ /* Load/Store Serialize to mem access disable (=reorder it) */ - setCx86_old(CX86_PCR0, getCx86_old(CX86_PCR0) & ~0x80); + setCx86(CX86_PCR0, getCx86(CX86_PCR0) & ~0x80); /* set load/store serialize from 1GB to 4GB */ ccr3 |= 0xe0; setCx86(CX86_CCR3, ccr3); @@ -135,11 +135,11 @@ static void set_cx86_memwb(void) pr_info("Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); /* CCR2 bit 2: unlock NW bit */ - setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04); + setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04); /* set 'Not Write-through' */ write_cr0(read_cr0() | X86_CR0_NW); /* CCR2 bit 2: lock NW bit and set WT1 */ - setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x14); + setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14); } /* @@ -153,14 +153,14 @@ static void geode_configure(void) local_irq_save(flags); /* Suspend on halt power saving and enable #SUSP pin */ - setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x88); + setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ /* FPU fast, DTE cache, Mem bypass */ - setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x38); + setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38); setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ set_cx86_memwb(); @@ -296,7 +296,7 @@ static void init_cyrix(struct cpuinfo_x86 *c) /* GXm supports extended cpuid levels 'ala' AMD */ if (c->cpuid_level == 2) { /* Enable cxMMX extensions (GX1 Datasheet 54) */ - setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7) | 1); + setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1); /* * GXm : 0x30 ... 0x5f GXm datasheet 51 @@ -319,7 +319,7 @@ static void init_cyrix(struct cpuinfo_x86 *c) if (dir1 > 7) { dir0_msn++; /* M II */ /* Enable MMX extensions (App note 108) */ - setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7)|1); + setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1); } else { /* A 6x86MX - it has the bug. */ set_cpu_bug(c, X86_BUG_COMA); diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index cf25405444ab..415621ddb8a2 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -19,6 +19,8 @@ #include "cpu.h" +#define APICID_SOCKET_ID_BIT 6 + /* * nodes_per_socket: Stores the number of nodes per socket. * Refer to CPUID Fn8000_001E_ECX Node Identifiers[10:8] @@ -87,6 +89,9 @@ static void hygon_get_topology(struct cpuinfo_x86 *c) if (!err) c->x86_coreid_bits = get_count_order(c->x86_max_cores); + /* Socket ID is ApicId[6] for these processors. */ + c->phys_proc_id = c->apicid >> APICID_SOCKET_ID_BIT; + cacheinfo_hygon_init_llc_id(c, cpu, node_id); } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index fc3c07fe7df5..f17c1a714779 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -596,36 +596,6 @@ detect_keyid_bits: c->x86_phys_bits -= keyid_bits; } -static void init_intel_energy_perf(struct cpuinfo_x86 *c) -{ - u64 epb; - - /* - * Initialize MSR_IA32_ENERGY_PERF_BIAS if not already initialized. - * (x86_energy_perf_policy(8) is available to change it at run-time.) - */ - if (!cpu_has(c, X86_FEATURE_EPB)) - return; - - rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); - if ((epb & 0xF) != ENERGY_PERF_BIAS_PERFORMANCE) - return; - - pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); - pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n"); - epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL; - wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); -} - -static void intel_bsp_resume(struct cpuinfo_x86 *c) -{ - /* - * MSR_IA32_ENERGY_PERF_BIAS is lost across suspend/resume, - * so reinitialize it properly like during bootup: - */ - init_intel_energy_perf(c); -} - static void init_cpuid_fault(struct cpuinfo_x86 *c) { u64 msr; @@ -763,8 +733,6 @@ static void init_intel(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_TME)) detect_tme(c); - init_intel_energy_perf(c); - init_intel_misc_features(c); } @@ -1023,9 +991,7 @@ static const struct cpu_dev intel_cpu_dev = { .c_detect_tlb = intel_detect_tlb, .c_early_init = early_init_intel, .c_init = init_intel, - .c_bsp_resume = intel_bsp_resume, .c_x86_vendor = X86_VENDOR_INTEL, }; cpu_dev_register(intel_cpu_dev); - diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c new file mode 100644 index 000000000000..f4dd73396f28 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Performance and Energy Bias Hint support. + * + * Copyright (C) 2019 Intel Corporation + * + * Author: + * Rafael J. Wysocki <rafael.j.wysocki@intel.com> + */ + +#include <linux/cpuhotplug.h> +#include <linux/cpu.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/syscore_ops.h> +#include <linux/pm.h> + +#include <asm/cpufeature.h> +#include <asm/msr.h> + +/** + * DOC: overview + * + * The Performance and Energy Bias Hint (EPB) allows software to specify its + * preference with respect to the power-performance tradeoffs present in the + * processor. Generally, the EPB is expected to be set by user space (directly + * via sysfs or with the help of the x86_energy_perf_policy tool), but there are + * two reasons for the kernel to update it. + * + * First, there are systems where the platform firmware resets the EPB during + * system-wide transitions from sleep states back into the working state + * effectively causing the previous EPB updates by user space to be lost. + * Thus the kernel needs to save the current EPB values for all CPUs during + * system-wide transitions to sleep states and restore them on the way back to + * the working state. That can be achieved by saving EPB for secondary CPUs + * when they are taken offline during transitions into system sleep states and + * for the boot CPU in a syscore suspend operation, so that it can be restored + * for the boot CPU in a syscore resume operation and for the other CPUs when + * they are brought back online. However, CPUs that are already offline when + * a system-wide PM transition is started are not taken offline again, but their + * EPB values may still be reset by the platform firmware during the transition, + * so in fact it is necessary to save the EPB of any CPU taken offline and to + * restore it when the given CPU goes back online at all times. + * + * Second, on many systems the initial EPB value coming from the platform + * firmware is 0 ('performance') and at least on some of them that is because + * the platform firmware does not initialize EPB at all with the assumption that + * the OS will do that anyway. That sometimes is problematic, as it may cause + * the system battery to drain too fast, for example, so it is better to adjust + * it on CPU bring-up and if the initial EPB value for a given CPU is 0, the + * kernel changes it to 6 ('normal'). + */ + +static DEFINE_PER_CPU(u8, saved_epb); + +#define EPB_MASK 0x0fULL +#define EPB_SAVED 0x10ULL +#define MAX_EPB EPB_MASK + +static int intel_epb_save(void) +{ + u64 epb; + + rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + /* + * Ensure that saved_epb will always be nonzero after this write even if + * the EPB value read from the MSR is 0. + */ + this_cpu_write(saved_epb, (epb & EPB_MASK) | EPB_SAVED); + + return 0; +} + +static void intel_epb_restore(void) +{ + u64 val = this_cpu_read(saved_epb); + u64 epb; + + rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + if (val) { + val &= EPB_MASK; + } else { + /* + * Because intel_epb_save() has not run for the current CPU yet, + * it is going online for the first time, so if its EPB value is + * 0 ('performance') at this point, assume that it has not been + * initialized by the platform firmware and set it to 6 + * ('normal'). + */ + val = epb & EPB_MASK; + if (val == ENERGY_PERF_BIAS_PERFORMANCE) { + val = ENERGY_PERF_BIAS_NORMAL; + pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); + } + } + wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val); +} + +static struct syscore_ops intel_epb_syscore_ops = { + .suspend = intel_epb_save, + .resume = intel_epb_restore, +}; + +static const char * const energy_perf_strings[] = { + "performance", + "balance-performance", + "normal", + "balance-power", + "power" +}; +static const u8 energ_perf_values[] = { + ENERGY_PERF_BIAS_PERFORMANCE, + ENERGY_PERF_BIAS_BALANCE_PERFORMANCE, + ENERGY_PERF_BIAS_NORMAL, + ENERGY_PERF_BIAS_BALANCE_POWERSAVE, + ENERGY_PERF_BIAS_POWERSAVE +}; + +static ssize_t energy_perf_bias_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + unsigned int cpu = dev->id; + u64 epb; + int ret; + + ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + if (ret < 0) + return ret; + + return sprintf(buf, "%llu\n", epb); +} + +static ssize_t energy_perf_bias_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned int cpu = dev->id; + u64 epb, val; + int ret; + + ret = __sysfs_match_string(energy_perf_strings, + ARRAY_SIZE(energy_perf_strings), buf); + if (ret >= 0) + val = energ_perf_values[ret]; + else if (kstrtou64(buf, 0, &val) || val > MAX_EPB) + return -EINVAL; + + ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + if (ret < 0) + return ret; + + ret = wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, + (epb & ~EPB_MASK) | val); + if (ret < 0) + return ret; + + return count; +} + +static DEVICE_ATTR_RW(energy_perf_bias); + +static struct attribute *intel_epb_attrs[] = { + &dev_attr_energy_perf_bias.attr, + NULL +}; + +static const struct attribute_group intel_epb_attr_group = { + .name = power_group_name, + .attrs = intel_epb_attrs +}; + +static int intel_epb_online(unsigned int cpu) +{ + struct device *cpu_dev = get_cpu_device(cpu); + + intel_epb_restore(); + if (!cpuhp_tasks_frozen) + sysfs_merge_group(&cpu_dev->kobj, &intel_epb_attr_group); + + return 0; +} + +static int intel_epb_offline(unsigned int cpu) +{ + struct device *cpu_dev = get_cpu_device(cpu); + + if (!cpuhp_tasks_frozen) + sysfs_unmerge_group(&cpu_dev->kobj, &intel_epb_attr_group); + + intel_epb_save(); + return 0; +} + +static __init int intel_epb_init(void) +{ + int ret; + + if (!boot_cpu_has(X86_FEATURE_EPB)) + return -ENODEV; + + ret = cpuhp_setup_state(CPUHP_AP_X86_INTEL_EPB_ONLINE, + "x86/intel/epb:online", intel_epb_online, + intel_epb_offline); + if (ret < 0) + goto err_out_online; + + register_syscore_ops(&intel_epb_syscore_ops); + return 0; + +err_out_online: + cpuhp_remove_state(CPUHP_AP_X86_INTEL_EPB_ONLINE); + return ret; +} +subsys_initcall(intel_epb_init); diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index e64de5149e50..d904aafe6409 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -563,33 +563,59 @@ out: return offset; } +bool amd_filter_mce(struct mce *m) +{ + enum smca_bank_types bank_type = smca_get_bank_type(m->bank); + struct cpuinfo_x86 *c = &boot_cpu_data; + u8 xec = (m->status >> 16) & 0x3F; + + /* See Family 17h Models 10h-2Fh Erratum #1114. */ + if (c->x86 == 0x17 && + c->x86_model >= 0x10 && c->x86_model <= 0x2F && + bank_type == SMCA_IF && xec == 10) + return true; + + return false; +} + /* - * Turn off MC4_MISC thresholding banks on all family 0x15 models since - * they're not supported there. + * Turn off thresholding banks for the following conditions: + * - MC4_MISC thresholding is not supported on Family 0x15. + * - Prevent possible spurious interrupts from the IF bank on Family 0x17 + * Models 0x10-0x2F due to Erratum #1114. */ -void disable_err_thresholding(struct cpuinfo_x86 *c) +void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) { - int i; + int i, num_msrs; u64 hwcr; bool need_toggle; - u32 msrs[] = { - 0x00000413, /* MC4_MISC0 */ - 0xc0000408, /* MC4_MISC1 */ - }; + u32 msrs[NR_BLOCKS]; + + if (c->x86 == 0x15 && bank == 4) { + msrs[0] = 0x00000413; /* MC4_MISC0 */ + msrs[1] = 0xc0000408; /* MC4_MISC1 */ + num_msrs = 2; + } else if (c->x86 == 0x17 && + (c->x86_model >= 0x10 && c->x86_model <= 0x2F)) { - if (c->x86 != 0x15) + if (smca_get_bank_type(bank) != SMCA_IF) + return; + + msrs[0] = MSR_AMD64_SMCA_MCx_MISC(bank); + num_msrs = 1; + } else { return; + } rdmsrl(MSR_K7_HWCR, hwcr); /* McStatusWrEn has to be set */ need_toggle = !(hwcr & BIT(18)); - if (need_toggle) wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); /* Clear CntP bit safely */ - for (i = 0; i < ARRAY_SIZE(msrs); i++) + for (i = 0; i < num_msrs; i++) msr_clear_bit(msrs[i], 62); /* restore old settings */ @@ -604,12 +630,12 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) unsigned int bank, block, cpu = smp_processor_id(); int offset = -1; - disable_err_thresholding(c); - for (bank = 0; bank < mca_cfg.banks; ++bank) { if (mce_flags.smca) smca_configure(bank, cpu); + disable_err_thresholding(c, bank); + for (block = 0; block < NR_BLOCKS; ++block) { address = get_block_address(address, low, high, bank, block); if (!address) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index b7fb541a4873..5112a50e6486 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -460,23 +460,6 @@ static void mce_irq_work_cb(struct irq_work *entry) mce_schedule_work(); } -static void mce_report_event(struct pt_regs *regs) -{ - if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { - mce_notify_irq(); - /* - * Triggering the work queue here is just an insurance - * policy in case the syscall exit notify handler - * doesn't run soon enough or ends up running on the - * wrong CPU (can happen when audit sleeps) - */ - mce_schedule_work(); - return; - } - - irq_work_queue(&mce_irq_work); -} - /* * Check if the address reported by the CPU is in a format we can parse. * It would be possible to add code for most other cases, but all would @@ -712,19 +695,49 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) barrier(); m.status = mce_rdmsrl(msr_ops.status(i)); + + /* If this entry is not valid, ignore it */ if (!(m.status & MCI_STATUS_VAL)) continue; /* - * Uncorrected or signalled events are handled by the exception - * handler when it is enabled, so don't process those here. - * - * TBD do the same check for MCI_STATUS_EN here? + * If we are logging everything (at CPU online) or this + * is a corrected error, then we must log it. */ - if (!(flags & MCP_UC) && - (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC))) - continue; + if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC)) + goto log_it; + + /* + * Newer Intel systems that support software error + * recovery need to make additional checks. Other + * CPUs should skip over uncorrected errors, but log + * everything else. + */ + if (!mca_cfg.ser) { + if (m.status & MCI_STATUS_UC) + continue; + goto log_it; + } + /* Log "not enabled" (speculative) errors */ + if (!(m.status & MCI_STATUS_EN)) + goto log_it; + + /* + * Log UCNA (SDM: 15.6.3 "UCR Error Classification") + * UC == 1 && PCC == 0 && S == 0 + */ + if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S)) + goto log_it; + + /* + * Skip anything else. Presumption is that our read of this + * bank is racing with a machine check. Leave the log alone + * for do_machine_check() to deal with it. + */ + continue; + +log_it: error_seen = true; mce_read_aux(&m, i); @@ -1301,7 +1314,8 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_panic("Fatal machine check on current CPU", &m, msg); if (worst > 0) - mce_report_event(regs); + irq_work_queue(&mce_irq_work); + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); sync_core(); @@ -1451,13 +1465,12 @@ EXPORT_SYMBOL_GPL(mce_notify_irq); static int __mcheck_cpu_mce_banks_init(void) { int i; - u8 num_banks = mca_cfg.banks; - mce_banks = kcalloc(num_banks, sizeof(struct mce_bank), GFP_KERNEL); + mce_banks = kcalloc(MAX_NR_BANKS, sizeof(struct mce_bank), GFP_KERNEL); if (!mce_banks) return -ENOMEM; - for (i = 0; i < num_banks; i++) { + for (i = 0; i < MAX_NR_BANKS; i++) { struct mce_bank *b = &mce_banks[i]; b->ctl = -1ULL; @@ -1471,28 +1484,19 @@ static int __mcheck_cpu_mce_banks_init(void) */ static int __mcheck_cpu_cap_init(void) { - unsigned b; u64 cap; + u8 b; rdmsrl(MSR_IA32_MCG_CAP, cap); b = cap & MCG_BANKCNT_MASK; - if (!mca_cfg.banks) - pr_info("CPU supports %d MCE banks\n", b); - - if (b > MAX_NR_BANKS) { - pr_warn("Using only %u machine check banks out of %u\n", - MAX_NR_BANKS, b); + if (WARN_ON_ONCE(b > MAX_NR_BANKS)) b = MAX_NR_BANKS; - } - /* Don't support asymmetric configurations today */ - WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks); - mca_cfg.banks = b; + mca_cfg.banks = max(mca_cfg.banks, b); if (!mce_banks) { int err = __mcheck_cpu_mce_banks_init(); - if (err) return err; } @@ -1771,6 +1775,14 @@ static void __mcheck_cpu_init_timer(void) mce_start_timer(t); } +bool filter_mce(struct mce *m) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return amd_filter_mce(m); + + return false; +} + /* Handle unconfigured int18 (should never happen) */ static void unexpected_machine_check(struct pt_regs *regs, long error_code) { @@ -2425,8 +2437,8 @@ static int fake_panic_set(void *data, u64 val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, - fake_panic_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set, + "%llu\n"); static int __init mcheck_debugfs_init(void) { @@ -2435,8 +2447,8 @@ static int __init mcheck_debugfs_init(void) dmce = mce_get_debugfs_dir(); if (!dmce) return -ENOMEM; - ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL, - &fake_panic_fops); + ffake_panic = debugfs_create_file_unsafe("fake_panic", 0444, dmce, + NULL, &fake_panic_fops); if (!ffake_panic) return -ENOMEM; @@ -2451,6 +2463,8 @@ EXPORT_SYMBOL_GPL(mcsafe_key); static int __init mcheck_late_init(void) { + pr_info("Using %d MCE banks\n", mca_cfg.banks); + if (mca_cfg.recovery) static_branch_inc(&mcsafe_key); diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c index 3395549c51d3..64d1d5a00f39 100644 --- a/arch/x86/kernel/cpu/mce/genpool.c +++ b/arch/x86/kernel/cpu/mce/genpool.c @@ -99,6 +99,9 @@ int mce_gen_pool_add(struct mce *mce) { struct mce_evt_llist *node; + if (filter_mce(mce)) + return -EINVAL; + if (!mce_evt_pool) return -EINVAL; diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 8492ef7d9015..a6026170af92 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -46,8 +46,6 @@ static struct mce i_mce; static struct dentry *dfs_inj; -static u8 n_banks; - #define MAX_FLAG_OPT_SIZE 4 #define NBCFG 0x44 @@ -528,7 +526,7 @@ static void do_inject(void) * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for * Fam10h and later BKDGs. */ - if (static_cpu_has(X86_FEATURE_AMD_DCM) && + if (boot_cpu_has(X86_FEATURE_AMD_DCM) && b == 4 && boot_cpu_data.x86 < 0x17) { toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu)); @@ -570,9 +568,15 @@ err: static int inj_bank_set(void *data, u64 val) { struct mce *m = (struct mce *)data; + u8 n_banks; + u64 cap; + + /* Get bank count on target CPU so we can handle non-uniform values. */ + rdmsrl_on_cpu(m->extcpu, MSR_IA32_MCG_CAP, &cap); + n_banks = cap & MCG_BANKCNT_MASK; if (val >= n_banks) { - pr_err("Non-existent MCE bank: %llu\n", val); + pr_err("MCA bank %llu non-existent on CPU%d\n", val, m->extcpu); return -EINVAL; } @@ -665,10 +669,6 @@ static struct dfs_node { static int __init debugfs_init(void) { unsigned int i; - u64 cap; - - rdmsrl(MSR_IA32_MCG_CAP, cap); - n_banks = cap & MCG_BANKCNT_MASK; dfs_inj = debugfs_create_dir("mce-inject", NULL); if (!dfs_inj) diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index af5eab1e65e2..a34b55baa7aa 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -173,4 +173,13 @@ struct mca_msr_regs { extern struct mca_msr_regs msr_ops; +/* Decide whether to add MCE record to MCE event pool or filter it out. */ +extern bool filter_mce(struct mce *m); + +#ifdef CONFIG_X86_MCE_AMD +extern bool amd_filter_mce(struct mce *m); +#else +static inline bool amd_filter_mce(struct mce *m) { return false; }; +#endif + #endif /* __X86_MCE_INTERNAL_H__ */ diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 97f9ada9ceda..c321f4f513f9 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -418,8 +418,9 @@ static int do_microcode_update(const void __user *buf, size_t size) if (ustate == UCODE_ERROR) { error = -1; break; - } else if (ustate == UCODE_OK) + } else if (ustate == UCODE_NEW) { apply_microcode_on_target(cpu); + } } return error; @@ -427,7 +428,7 @@ static int do_microcode_update(const void __user *buf, size_t size) static int microcode_open(struct inode *inode, struct file *file) { - return capable(CAP_SYS_RAWIO) ? nonseekable_open(inode, file) : -EPERM; + return capable(CAP_SYS_RAWIO) ? stream_open(inode, file) : -EPERM; } static ssize_t microcode_write(struct file *file, const char __user *buf, @@ -608,6 +609,8 @@ static int microcode_reload_late(void) if (ret > 0) microcode_check(); + pr_info("Reload completed, microcode revision: 0x%x\n", boot_cpu_data.microcode); + return ret; } diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 16936a24795c..a44bdbe7c55e 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -31,6 +31,7 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/cpu.h> +#include <linux/uio.h> #include <linux/mm.h> #include <asm/microcode_intel.h> @@ -861,32 +862,33 @@ out: return ret; } -static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, - int (*get_ucode_data)(void *, const void *, size_t)) +static enum ucode_state generic_load_microcode(int cpu, struct iov_iter *iter) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL; - int new_rev = uci->cpu_sig.rev; - unsigned int leftover = size; unsigned int curr_mc_size = 0, new_mc_size = 0; - unsigned int csig, cpf; enum ucode_state ret = UCODE_OK; + int new_rev = uci->cpu_sig.rev; + u8 *new_mc = NULL, *mc = NULL; + unsigned int csig, cpf; - while (leftover) { + while (iov_iter_count(iter)) { struct microcode_header_intel mc_header; - unsigned int mc_size; + unsigned int mc_size, data_size; + u8 *data; - if (leftover < sizeof(mc_header)) { - pr_err("error! Truncated header in microcode data file\n"); + if (!copy_from_iter_full(&mc_header, sizeof(mc_header), iter)) { + pr_err("error! Truncated or inaccessible header in microcode data file\n"); break; } - if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header))) - break; - mc_size = get_totalsize(&mc_header); - if (!mc_size || mc_size > leftover) { - pr_err("error! Bad data in microcode data file\n"); + if (mc_size < sizeof(mc_header)) { + pr_err("error! Bad data in microcode data file (totalsize too small)\n"); + break; + } + data_size = mc_size - sizeof(mc_header); + if (data_size > iov_iter_count(iter)) { + pr_err("error! Bad data in microcode data file (truncated file?)\n"); break; } @@ -899,7 +901,9 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, curr_mc_size = mc_size; } - if (get_ucode_data(mc, ucode_ptr, mc_size) || + memcpy(mc, &mc_header, sizeof(mc_header)); + data = mc + sizeof(mc_header); + if (!copy_from_iter_full(data, data_size, iter) || microcode_sanity_check(mc, 1) < 0) { break; } @@ -914,14 +918,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, mc = NULL; /* trigger new vmalloc */ ret = UCODE_NEW; } - - ucode_ptr += mc_size; - leftover -= mc_size; } vfree(mc); - if (leftover) { + if (iov_iter_count(iter)) { vfree(new_mc); return UCODE_ERROR; } @@ -945,12 +946,6 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, return ret; } -static int get_ucode_fw(void *to, const void *from, size_t n) -{ - memcpy(to, from, n); - return 0; -} - static bool is_blacklisted(unsigned int cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -977,10 +972,12 @@ static bool is_blacklisted(unsigned int cpu) static enum ucode_state request_microcode_fw(int cpu, struct device *device, bool refresh_fw) { - char name[30]; struct cpuinfo_x86 *c = &cpu_data(cpu); const struct firmware *firmware; + struct iov_iter iter; enum ucode_state ret; + struct kvec kvec; + char name[30]; if (is_blacklisted(cpu)) return UCODE_NFOUND; @@ -993,26 +990,30 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device, return UCODE_NFOUND; } - ret = generic_load_microcode(cpu, (void *)firmware->data, - firmware->size, &get_ucode_fw); + kvec.iov_base = (void *)firmware->data; + kvec.iov_len = firmware->size; + iov_iter_kvec(&iter, WRITE, &kvec, 1, firmware->size); + ret = generic_load_microcode(cpu, &iter); release_firmware(firmware); return ret; } -static int get_ucode_user(void *to, const void *from, size_t n) -{ - return copy_from_user(to, from, n); -} - static enum ucode_state request_microcode_user(int cpu, const void __user *buf, size_t size) { + struct iov_iter iter; + struct iovec iov; + if (is_blacklisted(cpu)) return UCODE_NFOUND; - return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); + iov.iov_base = (void __user *)buf; + iov.iov_len = size; + iov_iter_init(&iter, WRITE, &iov, 1, size); + + return generic_load_microcode(cpu, &iter); } static struct microcode_ops microcode_intel_ops = { diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 2c8522a39ed5..cb2e49810d68 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -35,11 +35,11 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) "fpu_exception\t: %s\n" "cpuid level\t: %d\n" "wp\t\t: yes\n", - static_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no", - static_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no", - static_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no", - static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", - static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", + boot_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no", + boot_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no", + boot_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no", + boot_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", + boot_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", c->cpuid_level); } #else diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 2dbd990a2eb7..89320c0396b1 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -342,10 +342,10 @@ int update_domains(struct rdt_resource *r, int closid) if (cpumask_empty(cpu_mask) || mba_sc) goto done; cpu = get_cpu(); - /* Update CBM on this cpu if it's in cpu_mask. */ + /* Update resource control msr on this CPU if it's in cpu_mask. */ if (cpumask_test_cpu(cpu, cpu_mask)) rdt_ctrl_update(&msr_param); - /* Update CBM on other cpus. */ + /* Update resource control msr on other CPUs. */ smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1); put_cpu(); diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index f33f11f69078..1573a0a6b525 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -501,11 +501,8 @@ out_unlock: void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms) { unsigned long delay = msecs_to_jiffies(delay_ms); - struct rdt_resource *r; int cpu; - r = &rdt_resources_all[RDT_RESOURCE_L3]; - cpu = cpumask_any(&dom->cpu_mask); dom->cqm_work_cpu = cpu; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 399601eda8e4..333c177a2471 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2039,14 +2039,14 @@ out: enum rdt_param { Opt_cdp, Opt_cdpl2, - Opt_mba_mpbs, + Opt_mba_mbps, nr__rdt_params }; static const struct fs_parameter_spec rdt_param_specs[] = { fsparam_flag("cdp", Opt_cdp), fsparam_flag("cdpl2", Opt_cdpl2), - fsparam_flag("mba_mpbs", Opt_mba_mpbs), + fsparam_flag("mba_MBps", Opt_mba_mbps), {} }; @@ -2072,7 +2072,7 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_cdpl2: ctx->enable_cdpl2 = true; return 0; - case Opt_mba_mpbs: + case Opt_mba_mbps: if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return -EINVAL; ctx->enable_mba_mbps = true; @@ -2516,103 +2516,131 @@ static void cbm_ensure_valid(u32 *_val, struct rdt_resource *r) bitmap_clear(val, zero_bit, cbm_len - zero_bit); } -/** - * rdtgroup_init_alloc - Initialize the new RDT group's allocations - * - * A new RDT group is being created on an allocation capable (CAT) - * supporting system. Set this group up to start off with all usable - * allocations. That is, all shareable and unused bits. +/* + * Initialize cache resources per RDT domain * - * All-zero CBM is invalid. If there are no more shareable bits available - * on any domain then the entire allocation will fail. + * Set the RDT domain up to start off with all usable allocations. That is, + * all shareable and unused bits. All-zero CBM is invalid. */ -static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) +static int __init_one_rdt_domain(struct rdt_domain *d, struct rdt_resource *r, + u32 closid) { struct rdt_resource *r_cdp = NULL; struct rdt_domain *d_cdp = NULL; u32 used_b = 0, unused_b = 0; - u32 closid = rdtgrp->closid; - struct rdt_resource *r; unsigned long tmp_cbm; enum rdtgrp_mode mode; - struct rdt_domain *d; u32 peer_ctl, *ctrl; - int i, ret; + int i; - for_each_alloc_enabled_rdt_resource(r) { - /* - * Only initialize default allocations for CBM cache - * resources - */ - if (r->rid == RDT_RESOURCE_MBA) - continue; - list_for_each_entry(d, &r->domains, list) { - rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp); - d->have_new_ctrl = false; - d->new_ctrl = r->cache.shareable_bits; - used_b = r->cache.shareable_bits; - ctrl = d->ctrl_val; - for (i = 0; i < closids_supported(); i++, ctrl++) { - if (closid_allocated(i) && i != closid) { - mode = rdtgroup_mode_by_closid(i); - if (mode == RDT_MODE_PSEUDO_LOCKSETUP) - break; - /* - * If CDP is active include peer - * domain's usage to ensure there - * is no overlap with an exclusive - * group. - */ - if (d_cdp) - peer_ctl = d_cdp->ctrl_val[i]; - else - peer_ctl = 0; - used_b |= *ctrl | peer_ctl; - if (mode == RDT_MODE_SHAREABLE) - d->new_ctrl |= *ctrl | peer_ctl; - } - } - if (d->plr && d->plr->cbm > 0) - used_b |= d->plr->cbm; - unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); - unused_b &= BIT_MASK(r->cache.cbm_len) - 1; - d->new_ctrl |= unused_b; - /* - * Force the initial CBM to be valid, user can - * modify the CBM based on system availability. - */ - cbm_ensure_valid(&d->new_ctrl, r); + rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp); + d->have_new_ctrl = false; + d->new_ctrl = r->cache.shareable_bits; + used_b = r->cache.shareable_bits; + ctrl = d->ctrl_val; + for (i = 0; i < closids_supported(); i++, ctrl++) { + if (closid_allocated(i) && i != closid) { + mode = rdtgroup_mode_by_closid(i); + if (mode == RDT_MODE_PSEUDO_LOCKSETUP) + break; /* - * Assign the u32 CBM to an unsigned long to ensure - * that bitmap_weight() does not access out-of-bound - * memory. + * If CDP is active include peer domain's + * usage to ensure there is no overlap + * with an exclusive group. */ - tmp_cbm = d->new_ctrl; - if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < - r->cache.min_cbm_bits) { - rdt_last_cmd_printf("No space on %s:%d\n", - r->name, d->id); - return -ENOSPC; - } - d->have_new_ctrl = true; + if (d_cdp) + peer_ctl = d_cdp->ctrl_val[i]; + else + peer_ctl = 0; + used_b |= *ctrl | peer_ctl; + if (mode == RDT_MODE_SHAREABLE) + d->new_ctrl |= *ctrl | peer_ctl; } } + if (d->plr && d->plr->cbm > 0) + used_b |= d->plr->cbm; + unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); + unused_b &= BIT_MASK(r->cache.cbm_len) - 1; + d->new_ctrl |= unused_b; + /* + * Force the initial CBM to be valid, user can + * modify the CBM based on system availability. + */ + cbm_ensure_valid(&d->new_ctrl, r); + /* + * Assign the u32 CBM to an unsigned long to ensure that + * bitmap_weight() does not access out-of-bound memory. + */ + tmp_cbm = d->new_ctrl; + if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) { + rdt_last_cmd_printf("No space on %s:%d\n", r->name, d->id); + return -ENOSPC; + } + d->have_new_ctrl = true; + + return 0; +} + +/* + * Initialize cache resources with default values. + * + * A new RDT group is being created on an allocation capable (CAT) + * supporting system. Set this group up to start off with all usable + * allocations. + * + * If there are no more shareable bits available on any domain then + * the entire allocation will fail. + */ +static int rdtgroup_init_cat(struct rdt_resource *r, u32 closid) +{ + struct rdt_domain *d; + int ret; + + list_for_each_entry(d, &r->domains, list) { + ret = __init_one_rdt_domain(d, r, closid); + if (ret < 0) + return ret; + } + + return 0; +} + +/* Initialize MBA resource with default values. */ +static void rdtgroup_init_mba(struct rdt_resource *r) +{ + struct rdt_domain *d; + + list_for_each_entry(d, &r->domains, list) { + d->new_ctrl = is_mba_sc(r) ? MBA_MAX_MBPS : r->default_ctrl; + d->have_new_ctrl = true; + } +} + +/* Initialize the RDT group's allocations. */ +static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) +{ + struct rdt_resource *r; + int ret; for_each_alloc_enabled_rdt_resource(r) { - /* - * Only initialize default allocations for CBM cache - * resources - */ - if (r->rid == RDT_RESOURCE_MBA) - continue; + if (r->rid == RDT_RESOURCE_MBA) { + rdtgroup_init_mba(r); + } else { + ret = rdtgroup_init_cat(r, rdtgrp->closid); + if (ret < 0) + return ret; + } + ret = update_domains(r, rdtgrp->closid); if (ret < 0) { rdt_last_cmd_puts("Failed to initialize allocations\n"); return ret; } - rdtgrp->mode = RDT_MODE_SHAREABLE; + } + rdtgrp->mode = RDT_MODE_SHAREABLE; + return 0; } diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 17ffc869cab8..a96ca8584803 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -204,8 +204,7 @@ static struct crash_mem *fill_up_crash_elf_data(void) * another range split. So add extra two slots here. */ nr_ranges += 2; - cmem = vzalloc(sizeof(struct crash_mem) + - sizeof(struct crash_mem_range) * nr_ranges); + cmem = vzalloc(struct_size(cmem, ranges, nr_ranges)); if (!cmem) return NULL; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index cd53f3030e40..64a59d726639 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -34,14 +34,14 @@ const char *stack_type_name(enum stack_type type) static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack); + unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack_ptr); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); /* * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack <= begin || stack > end) + if (stack < begin || stack > end) return false; info->type = STACK_TYPE_IRQ; @@ -59,14 +59,14 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack); + unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack_ptr); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); /* * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack <= begin || stack > end) + if (stack < begin || stack > end) return false; info->type = STACK_TYPE_SOFTIRQ; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 5cdb9e84da57..753b8cfe8b8a 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -16,23 +16,21 @@ #include <linux/bug.h> #include <linux/nmi.h> +#include <asm/cpu_entry_area.h> #include <asm/stacktrace.h> -static char *exception_stack_names[N_EXCEPTION_STACKS] = { - [ DOUBLEFAULT_STACK-1 ] = "#DF", - [ NMI_STACK-1 ] = "NMI", - [ DEBUG_STACK-1 ] = "#DB", - [ MCE_STACK-1 ] = "#MC", -}; - -static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, - [DEBUG_STACK - 1] = DEBUG_STKSZ +static const char * const exception_stack_names[] = { + [ ESTACK_DF ] = "#DF", + [ ESTACK_NMI ] = "NMI", + [ ESTACK_DB2 ] = "#DB2", + [ ESTACK_DB1 ] = "#DB1", + [ ESTACK_DB ] = "#DB", + [ ESTACK_MCE ] = "#MC", }; const char *stack_type_name(enum stack_type type) { - BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); + BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); if (type == STACK_TYPE_IRQ) return "IRQ"; @@ -52,43 +50,84 @@ const char *stack_type_name(enum stack_type type) return NULL; } +/** + * struct estack_pages - Page descriptor for exception stacks + * @offs: Offset from the start of the exception stack area + * @size: Size of the exception stack + * @type: Type to store in the stack_info struct + */ +struct estack_pages { + u32 offs; + u16 size; + u16 type; +}; + +#define EPAGERANGE(st) \ + [PFN_DOWN(CEA_ESTACK_OFFS(st)) ... \ + PFN_DOWN(CEA_ESTACK_OFFS(st) + CEA_ESTACK_SIZE(st) - 1)] = { \ + .offs = CEA_ESTACK_OFFS(st), \ + .size = CEA_ESTACK_SIZE(st), \ + .type = STACK_TYPE_EXCEPTION + ESTACK_ ##st, } + +/* + * Array of exception stack page descriptors. If the stack is larger than + * PAGE_SIZE, all pages covering a particular stack will have the same + * info. The guard pages including the not mapped DB2 stack are zeroed + * out. + */ +static const +struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = { + EPAGERANGE(DF), + EPAGERANGE(NMI), + EPAGERANGE(DB1), + EPAGERANGE(DB), + EPAGERANGE(MCE), +}; + static bool in_exception_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin, *end; + unsigned long begin, end, stk = (unsigned long)stack; + const struct estack_pages *ep; struct pt_regs *regs; - unsigned k; + unsigned int k; - BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); + BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); - for (k = 0; k < N_EXCEPTION_STACKS; k++) { - end = (unsigned long *)raw_cpu_ptr(&orig_ist)->ist[k]; - begin = end - (exception_stack_sizes[k] / sizeof(long)); - regs = (struct pt_regs *)end - 1; - - if (stack <= begin || stack >= end) - continue; + begin = (unsigned long)__this_cpu_read(cea_exception_stacks); + end = begin + sizeof(struct cea_exception_stacks); + /* Bail if @stack is outside the exception stack area. */ + if (stk < begin || stk >= end) + return false; - info->type = STACK_TYPE_EXCEPTION + k; - info->begin = begin; - info->end = end; - info->next_sp = (unsigned long *)regs->sp; + /* Calc page offset from start of exception stacks */ + k = (stk - begin) >> PAGE_SHIFT; + /* Lookup the page descriptor */ + ep = &estack_pages[k]; + /* Guard page? */ + if (!ep->size) + return false; - return true; - } + begin += (unsigned long)ep->offs; + end = begin + (unsigned long)ep->size; + regs = (struct pt_regs *)end - 1; - return false; + info->type = ep->type; + info->begin = (unsigned long *)begin; + info->end = (unsigned long *)end; + info->next_sp = (unsigned long *)regs->sp; + return true; } static bool in_irq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *end = (unsigned long *)this_cpu_read(irq_stack_ptr); + unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr); unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long)); /* * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack <= begin || stack > end) + if (stack < begin || stack >= end) return false; info->type = STACK_TYPE_IRQ; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 2e5003fef51a..ce243f76bdb7 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -101,24 +101,21 @@ static void __kernel_fpu_begin(void) kernel_fpu_disable(); - if (fpu->initialized) { - /* - * Ignore return value -- we don't care if reg state - * is clobbered. - */ - copy_fpregs_to_fpstate(fpu); - } else { - __cpu_invalidate_fpregs_state(); + if (current->mm) { + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { + set_thread_flag(TIF_NEED_FPU_LOAD); + /* + * Ignore return value -- we don't care if reg state + * is clobbered. + */ + copy_fpregs_to_fpstate(fpu); + } } + __cpu_invalidate_fpregs_state(); } static void __kernel_fpu_end(void) { - struct fpu *fpu = ¤t->thread.fpu; - - if (fpu->initialized) - copy_kernel_to_fpregs(&fpu->state); - kernel_fpu_enable(); } @@ -145,15 +142,17 @@ void fpu__save(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); - preempt_disable(); + fpregs_lock(); trace_x86_fpu_before_save(fpu); - if (fpu->initialized) { + + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { if (!copy_fpregs_to_fpstate(fpu)) { copy_kernel_to_fpregs(&fpu->state); } } + trace_x86_fpu_after_save(fpu); - preempt_enable(); + fpregs_unlock(); } EXPORT_SYMBOL_GPL(fpu__save); @@ -186,11 +185,14 @@ void fpstate_init(union fpregs_state *state) } EXPORT_SYMBOL_GPL(fpstate_init); -int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) +int fpu__copy(struct task_struct *dst, struct task_struct *src) { + struct fpu *dst_fpu = &dst->thread.fpu; + struct fpu *src_fpu = &src->thread.fpu; + dst_fpu->last_cpu = -1; - if (!src_fpu->initialized || !static_cpu_has(X86_FEATURE_FPU)) + if (!static_cpu_has(X86_FEATURE_FPU)) return 0; WARN_ON_FPU(src_fpu != ¤t->thread.fpu); @@ -202,16 +204,23 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); /* - * Save current FPU registers directly into the child - * FPU context, without any memory-to-memory copying. + * If the FPU registers are not current just memcpy() the state. + * Otherwise save current FPU registers directly into the child's FPU + * context, without any memory-to-memory copying. * * ( The function 'fails' in the FNSAVE case, which destroys - * register contents so we have to copy them back. ) + * register contents so we have to load them back. ) */ - if (!copy_fpregs_to_fpstate(dst_fpu)) { - memcpy(&src_fpu->state, &dst_fpu->state, fpu_kernel_xstate_size); - copy_kernel_to_fpregs(&src_fpu->state); - } + fpregs_lock(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + memcpy(&dst_fpu->state, &src_fpu->state, fpu_kernel_xstate_size); + + else if (!copy_fpregs_to_fpstate(dst_fpu)) + copy_kernel_to_fpregs(&dst_fpu->state); + + fpregs_unlock(); + + set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); @@ -223,20 +232,14 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) * Activate the current task's in-memory FPU context, * if it has not been used before: */ -void fpu__initialize(struct fpu *fpu) +static void fpu__initialize(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); - if (!fpu->initialized) { - fpstate_init(&fpu->state); - trace_x86_fpu_init_state(fpu); - - trace_x86_fpu_activate_state(fpu); - /* Safe to do for the current task: */ - fpu->initialized = 1; - } + set_thread_flag(TIF_NEED_FPU_LOAD); + fpstate_init(&fpu->state); + trace_x86_fpu_init_state(fpu); } -EXPORT_SYMBOL_GPL(fpu__initialize); /* * This function must be called before we read a task's fpstate. @@ -248,32 +251,20 @@ EXPORT_SYMBOL_GPL(fpu__initialize); * * - or it's called for stopped tasks (ptrace), in which case the * registers were already saved by the context-switch code when - * the task scheduled out - we only have to initialize the registers - * if they've never been initialized. + * the task scheduled out. * * If the task has used the FPU before then save it. */ void fpu__prepare_read(struct fpu *fpu) { - if (fpu == ¤t->thread.fpu) { + if (fpu == ¤t->thread.fpu) fpu__save(fpu); - } else { - if (!fpu->initialized) { - fpstate_init(&fpu->state); - trace_x86_fpu_init_state(fpu); - - trace_x86_fpu_activate_state(fpu); - /* Safe to do for current and for stopped child tasks: */ - fpu->initialized = 1; - } - } } /* * This function must be called before we write a task's fpstate. * - * If the task has used the FPU before then invalidate any cached FPU registers. - * If the task has not used the FPU before then initialize its fpstate. + * Invalidate any cached FPU registers. * * After this function call, after registers in the fpstate are * modified and the child task has woken up, the child task will @@ -290,44 +281,11 @@ void fpu__prepare_write(struct fpu *fpu) */ WARN_ON_FPU(fpu == ¤t->thread.fpu); - if (fpu->initialized) { - /* Invalidate any cached state: */ - __fpu_invalidate_fpregs_state(fpu); - } else { - fpstate_init(&fpu->state); - trace_x86_fpu_init_state(fpu); - - trace_x86_fpu_activate_state(fpu); - /* Safe to do for stopped child tasks: */ - fpu->initialized = 1; - } + /* Invalidate any cached state: */ + __fpu_invalidate_fpregs_state(fpu); } /* - * 'fpu__restore()' is called to copy FPU registers from - * the FPU fpstate to the live hw registers and to activate - * access to the hardware registers, so that FPU instructions - * can be used afterwards. - * - * Must be called with kernel preemption disabled (for example - * with local interrupts disabled, as it is in the case of - * do_device_not_available()). - */ -void fpu__restore(struct fpu *fpu) -{ - fpu__initialize(fpu); - - /* Avoid __kernel_fpu_begin() right after fpregs_activate() */ - kernel_fpu_disable(); - trace_x86_fpu_before_restore(fpu); - fpregs_activate(fpu); - copy_kernel_to_fpregs(&fpu->state); - trace_x86_fpu_after_restore(fpu); - kernel_fpu_enable(); -} -EXPORT_SYMBOL_GPL(fpu__restore); - -/* * Drops current FPU state: deactivates the fpregs and * the fpstate. NOTE: it still leaves previous contents * in the fpregs in the eager-FPU case. @@ -341,17 +299,13 @@ void fpu__drop(struct fpu *fpu) preempt_disable(); if (fpu == ¤t->thread.fpu) { - if (fpu->initialized) { - /* Ignore delayed exceptions from user space */ - asm volatile("1: fwait\n" - "2:\n" - _ASM_EXTABLE(1b, 2b)); - fpregs_deactivate(fpu); - } + /* Ignore delayed exceptions from user space */ + asm volatile("1: fwait\n" + "2:\n" + _ASM_EXTABLE(1b, 2b)); + fpregs_deactivate(fpu); } - fpu->initialized = 0; - trace_x86_fpu_dropped(fpu); preempt_enable(); @@ -363,6 +317,8 @@ void fpu__drop(struct fpu *fpu) */ static inline void copy_init_fpstate_to_fpregs(void) { + fpregs_lock(); + if (use_xsave()) copy_kernel_to_xregs(&init_fpstate.xsave, -1); else if (static_cpu_has(X86_FEATURE_FXSR)) @@ -372,6 +328,9 @@ static inline void copy_init_fpstate_to_fpregs(void) if (boot_cpu_has(X86_FEATURE_OSPKE)) copy_init_pkru_to_fpregs(); + + fpregs_mark_activate(); + fpregs_unlock(); } /* @@ -389,16 +348,52 @@ void fpu__clear(struct fpu *fpu) /* * Make sure fpstate is cleared and initialized. */ - if (static_cpu_has(X86_FEATURE_FPU)) { - preempt_disable(); - fpu__initialize(fpu); - user_fpu_begin(); + fpu__initialize(fpu); + if (static_cpu_has(X86_FEATURE_FPU)) copy_init_fpstate_to_fpregs(); - preempt_enable(); - } } /* + * Load FPU context before returning to userspace. + */ +void switch_fpu_return(void) +{ + if (!static_cpu_has(X86_FEATURE_FPU)) + return; + + __fpregs_load_activate(); +} +EXPORT_SYMBOL_GPL(switch_fpu_return); + +#ifdef CONFIG_X86_DEBUG_FPU +/* + * If current FPU state according to its tracking (loaded FPU context on this + * CPU) is not valid then we must have TIF_NEED_FPU_LOAD set so the context is + * loaded on return to userland. + */ +void fpregs_assert_state_consistent(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + return; + + WARN_ON_FPU(!fpregs_state_valid(fpu, smp_processor_id())); +} +EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent); +#endif + +void fpregs_mark_activate(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + + fpregs_activate(fpu); + fpu->last_cpu = smp_processor_id(); + clear_thread_flag(TIF_NEED_FPU_LOAD); +} +EXPORT_SYMBOL_GPL(fpregs_mark_activate); + +/* * x87 math exception handling: */ diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 6abd83572b01..20d8fa7124c7 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -239,8 +239,6 @@ static void __init fpu__init_system_ctx_switch(void) WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; - - WARN_ON_FPU(current->thread.fpu.initialized); } /* diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index bc02f5144b95..d652b939ccfb 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -15,16 +15,12 @@ */ int regset_fpregs_active(struct task_struct *target, const struct user_regset *regset) { - struct fpu *target_fpu = &target->thread.fpu; - - return target_fpu->initialized ? regset->n : 0; + return regset->n; } int regset_xregset_fpregs_active(struct task_struct *target, const struct user_regset *regset) { - struct fpu *target_fpu = &target->thread.fpu; - - if (boot_cpu_has(X86_FEATURE_FXSR) && target_fpu->initialized) + if (boot_cpu_has(X86_FEATURE_FXSR)) return regset->n; else return 0; @@ -269,11 +265,10 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) memcpy(&to[i], &from[i], sizeof(to[0])); } -void convert_to_fxsr(struct task_struct *tsk, +void convert_to_fxsr(struct fxregs_state *fxsave, const struct user_i387_ia32_struct *env) { - struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave; struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; int i; @@ -350,7 +345,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1); if (!ret) - convert_to_fxsr(target, &env); + convert_to_fxsr(&target->thread.fpu.state.fxsave, &env); /* * update the header bit in the xsave header, indicating the @@ -371,16 +366,9 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, int dump_fpu(struct pt_regs *regs, struct user_i387_struct *ufpu) { struct task_struct *tsk = current; - struct fpu *fpu = &tsk->thread.fpu; - int fpvalid; - - fpvalid = fpu->initialized; - if (fpvalid) - fpvalid = !fpregs_get(tsk, NULL, - 0, sizeof(struct user_i387_ia32_struct), - ufpu, NULL); - return fpvalid; + return !fpregs_get(tsk, NULL, 0, sizeof(struct user_i387_ia32_struct), + ufpu, NULL); } EXPORT_SYMBOL(dump_fpu); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index f6a1d299627c..5a8d118bc423 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -92,13 +92,13 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame) return err; err |= __put_user(FP_XSTATE_MAGIC2, - (__u32 *)(buf + fpu_user_xstate_size)); + (__u32 __user *)(buf + fpu_user_xstate_size)); /* * Read the xfeatures which we copied (directly from the cpu or * from the state in task struct) to the user buffers. */ - err |= __get_user(xfeatures, (__u32 *)&x->header.xfeatures); + err |= __get_user(xfeatures, (__u32 __user *)&x->header.xfeatures); /* * For legacy compatible, we always set FP/SSE bits in the bit @@ -113,7 +113,7 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame) */ xfeatures |= XFEATURE_MASK_FPSSE; - err |= __put_user(xfeatures, (__u32 *)&x->header.xfeatures); + err |= __put_user(xfeatures, (__u32 __user *)&x->header.xfeatures); return err; } @@ -144,9 +144,10 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) * buf == buf_fx for 64-bit frames and 32-bit fsave frame. * buf != buf_fx for 32-bit frames with fxstate. * - * If the fpu, extended register state is live, save the state directly - * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise, - * copy the thread's fpu state to the user frame starting at 'buf_fx'. + * Try to save it directly to the user frame with disabled page fault handler. + * If this fails then do the slow path where the FPU state is first saved to + * task's fpu->state and then copy it to the user frame pointed to by the + * aligned pointer 'buf_fx'. * * If this is a 32-bit frame with fxstate, put a fsave header before * the aligned state at 'buf_fx'. @@ -156,10 +157,9 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) */ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) { - struct fpu *fpu = ¤t->thread.fpu; - struct xregs_state *xsave = &fpu->state.xsave; struct task_struct *tsk = current; int ia32_fxstate = (buf != buf_fx); + int ret; ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || IS_ENABLED(CONFIG_IA32_EMULATION)); @@ -172,28 +172,34 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_32 __user *) buf) ? -1 : 1; - if (fpu->initialized || using_compacted_format()) { - /* Save the live register state to the user directly. */ - if (copy_fpregs_to_sigframe(buf_fx)) - return -1; - /* Update the thread's fxstate to save the fsave header. */ - if (ia32_fxstate) - copy_fxregs_to_kernel(fpu); - } else { - /* - * It is a *bug* if kernel uses compacted-format for xsave - * area and we copy it out directly to a signal frame. It - * should have been handled above by saving the registers - * directly. - */ - if (boot_cpu_has(X86_FEATURE_XSAVES)) { - WARN_ONCE(1, "x86/fpu: saving compacted-format xsave area to a signal frame!\n"); - return -1; - } - - fpstate_sanitize_xstate(fpu); - if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size)) - return -1; +retry: + /* + * Load the FPU registers if they are not valid for the current task. + * With a valid FPU state we can attempt to save the state directly to + * userland's stack frame which will likely succeed. If it does not, + * resolve the fault in the user memory and try again. + */ + fpregs_lock(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + __fpregs_load_activate(); + + pagefault_disable(); + ret = copy_fpregs_to_sigframe(buf_fx); + pagefault_enable(); + fpregs_unlock(); + + if (ret) { + int aligned_size; + int nr_pages; + + aligned_size = offset_in_page(buf_fx) + fpu_user_xstate_size; + nr_pages = DIV_ROUND_UP(aligned_size, PAGE_SIZE); + + ret = get_user_pages_unlocked((unsigned long)buf_fx, nr_pages, + NULL, FOLL_WRITE); + if (ret == nr_pages) + goto retry; + return -EFAULT; } /* Save the fsave header for the 32-bit frames. */ @@ -207,11 +213,11 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) } static inline void -sanitize_restored_xstate(struct task_struct *tsk, +sanitize_restored_xstate(union fpregs_state *state, struct user_i387_ia32_struct *ia32_env, u64 xfeatures, int fx_only) { - struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; + struct xregs_state *xsave = &state->xsave; struct xstate_header *header = &xsave->header; if (use_xsave()) { @@ -238,17 +244,18 @@ sanitize_restored_xstate(struct task_struct *tsk, */ xsave->i387.mxcsr &= mxcsr_feature_mask; - convert_to_fxsr(tsk, ia32_env); + if (ia32_env) + convert_to_fxsr(&state->fxsave, ia32_env); } } /* * Restore the extended state if present. Otherwise, restore the FP/SSE state. */ -static inline int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) +static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) { if (use_xsave()) { - if ((unsigned long)buf % 64 || fx_only) { + if (fx_only) { u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); return copy_user_to_fxregs(buf); @@ -266,12 +273,15 @@ static inline int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) { + struct user_i387_ia32_struct *envp = NULL; + int state_size = fpu_kernel_xstate_size; int ia32_fxstate = (buf != buf_fx); struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; - int state_size = fpu_kernel_xstate_size; + struct user_i387_ia32_struct env; u64 xfeatures = 0; int fx_only = 0; + int ret = 0; ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || IS_ENABLED(CONFIG_IA32_EMULATION)); @@ -284,8 +294,6 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) if (!access_ok(buf, size)) return -EACCES; - fpu__initialize(fpu); - if (!static_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_set(current, NULL, 0, sizeof(struct user_i387_ia32_struct), @@ -308,61 +316,101 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) } } + /* + * The current state of the FPU registers does not matter. By setting + * TIF_NEED_FPU_LOAD unconditionally it is ensured that the our xstate + * is not modified on context switch and that the xstate is considered + * to be loaded again on return to userland (overriding last_cpu avoids + * the optimisation). + */ + set_thread_flag(TIF_NEED_FPU_LOAD); + __fpu_invalidate_fpregs_state(fpu); + + if ((unsigned long)buf_fx % 64) + fx_only = 1; + /* + * For 32-bit frames with fxstate, copy the fxstate so it can be + * reconstructed later. + */ if (ia32_fxstate) { + ret = __copy_from_user(&env, buf, sizeof(env)); + if (ret) + goto err_out; + envp = &env; + } else { /* - * For 32-bit frames with fxstate, copy the user state to the - * thread's fpu state, reconstruct fxstate from the fsave - * header. Validate and sanitize the copied state. + * Attempt to restore the FPU registers directly from user + * memory. For that to succeed, the user access cannot cause + * page faults. If it does, fall back to the slow path below, + * going through the kernel buffer with the enabled pagefault + * handler. */ - struct user_i387_ia32_struct env; - int err = 0; + fpregs_lock(); + pagefault_disable(); + ret = copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only); + pagefault_enable(); + if (!ret) { + fpregs_mark_activate(); + fpregs_unlock(); + return 0; + } + fpregs_unlock(); + } - /* - * Drop the current fpu which clears fpu->initialized. This ensures - * that any context-switch during the copy of the new state, - * avoids the intermediate state from getting restored/saved. - * Thus avoiding the new restored state from getting corrupted. - * We will be ready to restore/save the state only after - * fpu->initialized is again set. - */ - fpu__drop(fpu); + + if (use_xsave() && !fx_only) { + u64 init_bv = xfeatures_mask & ~xfeatures; if (using_compacted_format()) { - err = copy_user_to_xstate(&fpu->state.xsave, buf_fx); + ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx); } else { - err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); + ret = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); - if (!err && state_size > offsetof(struct xregs_state, header)) - err = validate_xstate_header(&fpu->state.xsave.header); + if (!ret && state_size > offsetof(struct xregs_state, header)) + ret = validate_xstate_header(&fpu->state.xsave.header); } + if (ret) + goto err_out; - if (err || __copy_from_user(&env, buf, sizeof(env))) { - fpstate_init(&fpu->state); - trace_x86_fpu_init_state(fpu); - err = -1; - } else { - sanitize_restored_xstate(tsk, &env, xfeatures, fx_only); + sanitize_restored_xstate(&fpu->state, envp, xfeatures, fx_only); + + fpregs_lock(); + if (unlikely(init_bv)) + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + ret = copy_kernel_to_xregs_err(&fpu->state.xsave, xfeatures); + + } else if (use_fxsr()) { + ret = __copy_from_user(&fpu->state.fxsave, buf_fx, state_size); + if (ret) { + ret = -EFAULT; + goto err_out; } - local_bh_disable(); - fpu->initialized = 1; - fpu__restore(fpu); - local_bh_enable(); + sanitize_restored_xstate(&fpu->state, envp, xfeatures, fx_only); - return err; - } else { - /* - * For 64-bit frames and 32-bit fsave frames, restore the user - * state to the registers directly (with exceptions handled). - */ - user_fpu_begin(); - if (copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only)) { - fpu__clear(fpu); - return -1; + fpregs_lock(); + if (use_xsave()) { + u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); } + + ret = copy_kernel_to_fxregs_err(&fpu->state.fxsave); + } else { + ret = __copy_from_user(&fpu->state.fsave, buf_fx, state_size); + if (ret) + goto err_out; + + fpregs_lock(); + ret = copy_kernel_to_fregs_err(&fpu->state.fsave); } + if (!ret) + fpregs_mark_activate(); + fpregs_unlock(); - return 0; +err_out: + if (ret) + fpu__clear(fpu); + return ret; } static inline int xstate_sigframe_size(void) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index d7432c2b1051..9c459fd1d38e 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -805,20 +805,18 @@ void fpu__resume_cpu(void) } /* - * Given an xstate feature mask, calculate where in the xsave + * Given an xstate feature nr, calculate where in the xsave * buffer the state is. Callers should ensure that the buffer * is valid. */ -static void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask) +static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr) { - int feature_nr = fls64(xstate_feature_mask) - 1; - - if (!xfeature_enabled(feature_nr)) { + if (!xfeature_enabled(xfeature_nr)) { WARN_ON_FPU(1); return NULL; } - return (void *)xsave + xstate_comp_offsets[feature_nr]; + return (void *)xsave + xstate_comp_offsets[xfeature_nr]; } /* * Given the xsave area and a state inside, this function returns the @@ -832,13 +830,13 @@ static void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask * * Inputs: * xstate: the thread's storage area for all FPU data - * xstate_feature: state which is defined in xsave.h (e.g. - * XFEATURE_MASK_FP, XFEATURE_MASK_SSE, etc...) + * xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP, + * XFEATURE_SSE, etc...) * Output: * address of the state in the xsave area, or NULL if the * field is not present in the xsave buffer. */ -void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) +void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) { /* * Do we even *have* xsave state? @@ -851,11 +849,11 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) * have not enabled. Remember that pcntxt_mask is * what we write to the XCR0 register. */ - WARN_ONCE(!(xfeatures_mask & xstate_feature), + WARN_ONCE(!(xfeatures_mask & BIT_ULL(xfeature_nr)), "get of unsupported state"); /* * This assumes the last 'xsave*' instruction to - * have requested that 'xstate_feature' be saved. + * have requested that 'xfeature_nr' be saved. * If it did not, we might be seeing and old value * of the field in the buffer. * @@ -864,10 +862,10 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) * or because the "init optimization" caused it * to not be saved. */ - if (!(xsave->header.xfeatures & xstate_feature)) + if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr))) return NULL; - return __raw_xsave_addr(xsave, xstate_feature); + return __raw_xsave_addr(xsave, xfeature_nr); } EXPORT_SYMBOL_GPL(get_xsave_addr); @@ -882,25 +880,23 @@ EXPORT_SYMBOL_GPL(get_xsave_addr); * Note that this only works on the current task. * * Inputs: - * @xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP, - * XFEATURE_MASK_SSE, etc...) + * @xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP, + * XFEATURE_SSE, etc...) * Output: * address of the state in the xsave area or NULL if the state * is not present or is in its 'init state'. */ -const void *get_xsave_field_ptr(int xsave_state) +const void *get_xsave_field_ptr(int xfeature_nr) { struct fpu *fpu = ¤t->thread.fpu; - if (!fpu->initialized) - return NULL; /* * fpu__save() takes the CPU's xstate registers * and saves them off to the 'fpu memory buffer. */ fpu__save(fpu); - return get_xsave_addr(&fpu->state.xsave, xsave_state); + return get_xsave_addr(&fpu->state.xsave, xfeature_nr); } #ifdef CONFIG_ARCH_HAS_PKEYS @@ -1016,7 +1012,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of * Copy only in-use xstates: */ if ((header.xfeatures >> i) & 1) { - void *src = __raw_xsave_addr(xsave, 1 << i); + void *src = __raw_xsave_addr(xsave, i); offset = xstate_offsets[i]; size = xstate_sizes[i]; @@ -1102,7 +1098,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i * Copy only in-use xstates: */ if ((header.xfeatures >> i) & 1) { - void *src = __raw_xsave_addr(xsave, 1 << i); + void *src = __raw_xsave_addr(xsave, i); offset = xstate_offsets[i]; size = xstate_sizes[i]; @@ -1159,7 +1155,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) u64 mask = ((u64)1 << i); if (hdr.xfeatures & mask) { - void *dst = __raw_xsave_addr(xsave, 1 << i); + void *dst = __raw_xsave_addr(xsave, i); offset = xstate_offsets[i]; size = xstate_sizes[i]; @@ -1213,7 +1209,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) u64 mask = ((u64)1 << i); if (hdr.xfeatures & mask) { - void *dst = __raw_xsave_addr(xsave, 1 << i); + void *dst = __raw_xsave_addr(xsave, i); offset = xstate_offsets[i]; size = xstate_sizes[i]; diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index ef49517f6bb2..0caf8122d680 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -678,12 +678,8 @@ static inline void *alloc_tramp(unsigned long size) { return module_alloc(size); } -static inline void tramp_free(void *tramp, int size) +static inline void tramp_free(void *tramp) { - int npages = PAGE_ALIGN(size) >> PAGE_SHIFT; - - set_memory_nx((unsigned long)tramp, npages); - set_memory_rw((unsigned long)tramp, npages); module_memfree(tramp); } #else @@ -692,7 +688,7 @@ static inline void *alloc_tramp(unsigned long size) { return NULL; } -static inline void tramp_free(void *tramp, int size) { } +static inline void tramp_free(void *tramp) { } #endif /* Defined as markers to the end of the ftrace default trampolines */ @@ -730,6 +726,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) unsigned long end_offset; unsigned long op_offset; unsigned long offset; + unsigned long npages; unsigned long size; unsigned long retq; unsigned long *ptr; @@ -762,6 +759,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) return 0; *tramp_size = size + RET_SIZE + sizeof(void *); + npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE); /* Copy ftrace_caller onto the trampoline memory */ ret = probe_kernel_read(trampoline, (void *)start_offset, size); @@ -806,9 +804,17 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) /* ALLOC_TRAMP flags lets us know we created it */ ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP; + set_vm_flush_reset_perms(trampoline); + + /* + * Module allocation needs to be completed by making the page + * executable. The page is still writable, which is a security hazard, + * but anyhow ftrace breaks W^X completely. + */ + set_memory_x((unsigned long)trampoline, npages); return (unsigned long)trampoline; fail: - tramp_free(trampoline, *tramp_size); + tramp_free(trampoline); return 0; } @@ -939,7 +945,7 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops) if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) return; - tramp_free((void *)ops->trampoline, ops->trampoline_size); + tramp_free((void *)ops->trampoline); ops->trampoline = 0; } diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index d1dbe8e4eb82..bcd206c8ac90 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -265,7 +265,7 @@ ENDPROC(start_cpu0) GLOBAL(initial_code) .quad x86_64_start_kernel GLOBAL(initial_gs) - .quad INIT_PER_CPU_VAR(irq_stack_union) + .quad INIT_PER_CPU_VAR(fixed_percpu_data) GLOBAL(initial_stack) /* * The SIZEOF_PTREGS gap is a convention which helps the in-kernel diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index dfd3aca82c61..fb32925a2e62 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -905,6 +905,8 @@ int __init hpet_enable(void) return 0; hpet_set_mapping(); + if (!hpet_virt_address) + return 0; /* * Read the period and check for a sane value: diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index ff9bfd40429e..d73083021002 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -354,6 +354,7 @@ int hw_breakpoint_arch_parse(struct perf_event *bp, #endif default: WARN_ON_ONCE(1); + return -EINVAL; } /* diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 01adea278a71..6d8917875f44 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -41,13 +41,12 @@ struct idt_data { #define SYSG(_vector, _addr) \ G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS) -/* Interrupt gate with interrupt stack */ +/* + * Interrupt gate with interrupt stack. The _ist index is the index in + * the tss.ist[] array, but for the descriptor it needs to start at 1. + */ #define ISTG(_vector, _addr, _ist) \ - G(_vector, _addr, _ist, GATE_INTERRUPT, DPL0, __KERNEL_CS) - -/* System interrupt gate with interrupt stack */ -#define SISTG(_vector, _addr, _ist) \ - G(_vector, _addr, _ist, GATE_INTERRUPT, DPL3, __KERNEL_CS) + G(_vector, _addr, _ist + 1, GATE_INTERRUPT, DPL0, __KERNEL_CS) /* Task gate */ #define TSKG(_vector, _gdt) \ @@ -184,11 +183,11 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; * cpu_init() when the TSS has been initialized. */ static const __initconst struct idt_data ist_idts[] = { - ISTG(X86_TRAP_DB, debug, DEBUG_STACK), - ISTG(X86_TRAP_NMI, nmi, NMI_STACK), - ISTG(X86_TRAP_DF, double_fault, DOUBLEFAULT_STACK), + ISTG(X86_TRAP_DB, debug, IST_INDEX_DB), + ISTG(X86_TRAP_NMI, nmi, IST_INDEX_NMI), + ISTG(X86_TRAP_DF, double_fault, IST_INDEX_DF), #ifdef CONFIG_X86_MCE - ISTG(X86_TRAP_MC, &machine_check, MCE_STACK), + ISTG(X86_TRAP_MC, &machine_check, IST_INDEX_MCE), #endif }; diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 95600a99ae93..fc34816c6f04 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -51,8 +51,8 @@ static inline int check_stack_overflow(void) { return 0; } static inline void print_stack_overflow(void) { } #endif -DEFINE_PER_CPU(struct irq_stack *, hardirq_stack); -DEFINE_PER_CPU(struct irq_stack *, softirq_stack); +DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); +DEFINE_PER_CPU(struct irq_stack *, softirq_stack_ptr); static void call_on_stack(void *func, void *stack) { @@ -76,7 +76,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) u32 *isp, *prev_esp, arg1; curstk = (struct irq_stack *) current_stack(); - irqstk = __this_cpu_read(hardirq_stack); + irqstk = __this_cpu_read(hardirq_stack_ptr); /* * this is where we switch to the IRQ stack. However, if we are @@ -107,27 +107,28 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) } /* - * allocate per-cpu stacks for hardirq and for softirq processing + * Allocate per-cpu stacks for hardirq and softirq processing */ -void irq_ctx_init(int cpu) +int irq_init_percpu_irqstack(unsigned int cpu) { - struct irq_stack *irqstk; - - if (per_cpu(hardirq_stack, cpu)) - return; + int node = cpu_to_node(cpu); + struct page *ph, *ps; - irqstk = page_address(alloc_pages_node(cpu_to_node(cpu), - THREADINFO_GFP, - THREAD_SIZE_ORDER)); - per_cpu(hardirq_stack, cpu) = irqstk; + if (per_cpu(hardirq_stack_ptr, cpu)) + return 0; - irqstk = page_address(alloc_pages_node(cpu_to_node(cpu), - THREADINFO_GFP, - THREAD_SIZE_ORDER)); - per_cpu(softirq_stack, cpu) = irqstk; + ph = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); + if (!ph) + return -ENOMEM; + ps = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); + if (!ps) { + __free_pages(ph, THREAD_SIZE_ORDER); + return -ENOMEM; + } - printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", - cpu, per_cpu(hardirq_stack, cpu), per_cpu(softirq_stack, cpu)); + per_cpu(hardirq_stack_ptr, cpu) = page_address(ph); + per_cpu(softirq_stack_ptr, cpu) = page_address(ps); + return 0; } void do_softirq_own_stack(void) @@ -135,7 +136,7 @@ void do_softirq_own_stack(void) struct irq_stack *irqstk; u32 *isp, *prev_esp; - irqstk = __this_cpu_read(softirq_stack); + irqstk = __this_cpu_read(softirq_stack_ptr); /* build the stack frame on the softirq stack */ isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 0469cd078db1..6bf6517a05bb 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -18,63 +18,64 @@ #include <linux/uaccess.h> #include <linux/smp.h> #include <linux/sched/task_stack.h> + +#include <asm/cpu_entry_area.h> #include <asm/io_apic.h> #include <asm/apic.h> -int sysctl_panic_on_stackoverflow; +DEFINE_PER_CPU_PAGE_ALIGNED(struct irq_stack, irq_stack_backing_store) __visible; +DECLARE_INIT_PER_CPU(irq_stack_backing_store); -/* - * Probabilistic stack overflow check: - * - * Only check the stack in process context, because everything else - * runs on the big interrupt stacks. Checking reliably is too expensive, - * so we just check from interrupts. - */ -static inline void stack_overflow_check(struct pt_regs *regs) +bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) { -#ifdef CONFIG_DEBUG_STACKOVERFLOW -#define STACK_TOP_MARGIN 128 - struct orig_ist *oist; - u64 irq_stack_top, irq_stack_bottom; - u64 estack_top, estack_bottom; - u64 curbase = (u64)task_stack_page(current); + if (IS_ERR_OR_NULL(desc)) + return false; - if (user_mode(regs)) - return; + generic_handle_irq_desc(desc); + return true; +} - if (regs->sp >= curbase + sizeof(struct pt_regs) + STACK_TOP_MARGIN && - regs->sp <= curbase + THREAD_SIZE) - return; +#ifdef CONFIG_VMAP_STACK +/* + * VMAP the backing store with guard pages + */ +static int map_irq_stack(unsigned int cpu) +{ + char *stack = (char *)per_cpu_ptr(&irq_stack_backing_store, cpu); + struct page *pages[IRQ_STACK_SIZE / PAGE_SIZE]; + void *va; + int i; - irq_stack_top = (u64)this_cpu_ptr(irq_stack_union.irq_stack) + - STACK_TOP_MARGIN; - irq_stack_bottom = (u64)__this_cpu_read(irq_stack_ptr); - if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom) - return; + for (i = 0; i < IRQ_STACK_SIZE / PAGE_SIZE; i++) { + phys_addr_t pa = per_cpu_ptr_to_phys(stack + (i << PAGE_SHIFT)); - oist = this_cpu_ptr(&orig_ist); - estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN; - estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1]; - if (regs->sp >= estack_top && regs->sp <= estack_bottom) - return; + pages[i] = pfn_to_page(pa >> PAGE_SHIFT); + } - WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n", - current->comm, curbase, regs->sp, - irq_stack_top, irq_stack_bottom, - estack_top, estack_bottom, (void *)regs->ip); + va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL); + if (!va) + return -ENOMEM; - if (sysctl_panic_on_stackoverflow) - panic("low stack detected by irq handler - check messages\n"); -#endif + per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE; + return 0; } - -bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) +#else +/* + * If VMAP stacks are disabled due to KASAN, just use the per cpu + * backing store without guard pages. + */ +static int map_irq_stack(unsigned int cpu) { - stack_overflow_check(regs); + void *va = per_cpu_ptr(&irq_stack_backing_store, cpu); - if (IS_ERR_OR_NULL(desc)) - return false; + per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE; + return 0; +} +#endif - generic_handle_irq_desc(desc); - return true; +int irq_init_percpu_irqstack(unsigned int cpu) +{ + if (per_cpu(hardirq_stack_ptr, cpu)) + return 0; + return map_irq_stack(cpu); } diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index a0693b71cfc1..16919a9671fa 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -91,6 +91,8 @@ void __init init_IRQ(void) for (i = 0; i < nr_legacy_irqs(); i++) per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = irq_to_desc(i); + BUG_ON(irq_init_percpu_irqstack(smp_processor_id())); + x86_init.irqs.intr_init(); } @@ -104,6 +106,4 @@ void __init native_init_IRQ(void) if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) setup_irq(2, &irq2); - - irq_ctx_init(smp_processor_id()); } diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index f99bd26bd3f1..e631c358f7f4 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -37,7 +37,6 @@ static void bug_at(unsigned char *ip, int line) static void __ref __jump_label_transform(struct jump_entry *entry, enum jump_label_type type, - void *(*poker)(void *, const void *, size_t), int init) { union jump_code_union jmp; @@ -50,9 +49,6 @@ static void __ref __jump_label_transform(struct jump_entry *entry, jmp.offset = jump_entry_target(entry) - (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); - if (early_boot_irqs_disabled) - poker = text_poke_early; - if (type == JUMP_LABEL_JMP) { if (init) { expect = default_nop; line = __LINE__; @@ -75,16 +71,19 @@ static void __ref __jump_label_transform(struct jump_entry *entry, bug_at((void *)jump_entry_code(entry), line); /* - * Make text_poke_bp() a default fallback poker. + * As long as only a single processor is running and the code is still + * not marked as RO, text_poke_early() can be used; Checking that + * system_state is SYSTEM_BOOTING guarantees it. It will be set to + * SYSTEM_SCHEDULING before other cores are awaken and before the + * code is write-protected. * * At the time the change is being done, just ignore whether we * are doing nop -> jump or jump -> nop transition, and assume * always nop being the 'currently valid' instruction - * */ - if (poker) { - (*poker)((void *)jump_entry_code(entry), code, - JUMP_LABEL_NOP_SIZE); + if (init || system_state == SYSTEM_BOOTING) { + text_poke_early((void *)jump_entry_code(entry), code, + JUMP_LABEL_NOP_SIZE); return; } @@ -96,7 +95,7 @@ void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) { mutex_lock(&text_mutex); - __jump_label_transform(entry, type, NULL, 0); + __jump_label_transform(entry, type, 0); mutex_unlock(&text_mutex); } @@ -126,5 +125,5 @@ __init_or_module void arch_jump_label_transform_static(struct jump_entry *entry, jlstate = JL_STATE_NO_UPDATE; } if (jlstate == JL_STATE_UPDATE) - __jump_label_transform(entry, type, text_poke_early, 1); + __jump_label_transform(entry, type, 1); } diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 4ff6b4cdb941..13b13311b792 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -747,7 +747,6 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip) int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) { int err; - char opc[BREAK_INSTR_SIZE]; bpt->type = BP_BREAKPOINT; err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, @@ -759,18 +758,13 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) if (!err) return err; /* - * It is safe to call text_poke() because normal kernel execution + * It is safe to call text_poke_kgdb() because normal kernel execution * is stopped on all cores, so long as the text_mutex is not locked. */ if (mutex_is_locked(&text_mutex)) return -EBUSY; - text_poke((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr, - BREAK_INSTR_SIZE); - err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE); - if (err) - return err; - if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE)) - return -EINVAL; + text_poke_kgdb((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr, + BREAK_INSTR_SIZE); bpt->type = BP_POKE_BREAKPOINT; return err; @@ -778,22 +772,17 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) { - int err; - char opc[BREAK_INSTR_SIZE]; - if (bpt->type != BP_POKE_BREAKPOINT) goto knl_write; /* - * It is safe to call text_poke() because normal kernel execution + * It is safe to call text_poke_kgdb() because normal kernel execution * is stopped on all cores, so long as the text_mutex is not locked. */ if (mutex_is_locked(&text_mutex)) goto knl_write; - text_poke((void *)bpt->bpt_addr, bpt->saved_instr, BREAK_INSTR_SIZE); - err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE); - if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE)) - goto knl_write; - return err; + text_poke_kgdb((void *)bpt->bpt_addr, bpt->saved_instr, + BREAK_INSTR_SIZE); + return 0; knl_write: return probe_kernel_write((char *)bpt->bpt_addr, diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index a034cb808e7e..cf52ee0d8711 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -431,8 +431,21 @@ void *alloc_insn_page(void) void *page; page = module_alloc(PAGE_SIZE); - if (page) - set_memory_ro((unsigned long)page & PAGE_MASK, 1); + if (!page) + return NULL; + + set_vm_flush_reset_perms(page); + /* + * First make the page read-only, and only then make it executable to + * prevent it from being W+X in between. + */ + set_memory_ro((unsigned long)page, 1); + + /* + * TODO: Once additional kernel code protection mechanisms are set, ensure + * that the page was not maliciously altered and it is still zeroed. + */ + set_memory_x((unsigned long)page, 1); return page; } @@ -440,8 +453,6 @@ void *alloc_insn_page(void) /* Recover page to RW mode before releasing it */ void free_insn_page(void *page) { - set_memory_nx((unsigned long)page & PAGE_MASK, 1); - set_memory_rw((unsigned long)page & PAGE_MASK, 1); module_memfree(page); } @@ -569,6 +580,7 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) unsigned long *sara = stack_addr(regs); ri->ret_addr = (kprobe_opcode_t *) *sara; + ri->fp = sara; /* Replace the return addr with trampoline addr */ *sara = (unsigned long) &kretprobe_trampoline; @@ -715,6 +727,7 @@ NOKPROBE_SYMBOL(kprobe_int3_handler); * calls trampoline_handler() runs, which calls the kretprobe's handler. */ asm( + ".text\n" ".global kretprobe_trampoline\n" ".type kretprobe_trampoline, @function\n" "kretprobe_trampoline:\n" @@ -748,26 +761,48 @@ asm( NOKPROBE_SYMBOL(kretprobe_trampoline); STACK_FRAME_NON_STANDARD(kretprobe_trampoline); +static struct kprobe kretprobe_kprobe = { + .addr = (void *)kretprobe_trampoline, +}; + /* * Called from kretprobe_trampoline */ static __used void *trampoline_handler(struct pt_regs *regs) { + struct kprobe_ctlblk *kcb; struct kretprobe_instance *ri = NULL; struct hlist_head *head, empty_rp; struct hlist_node *tmp; unsigned long flags, orig_ret_address = 0; unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; kprobe_opcode_t *correct_ret_addr = NULL; + void *frame_pointer; + bool skipped = false; + + preempt_disable(); + + /* + * Set a dummy kprobe for avoiding kretprobe recursion. + * Since kretprobe never run in kprobe handler, kprobe must not + * be running at this point. + */ + kcb = get_kprobe_ctlblk(); + __this_cpu_write(current_kprobe, &kretprobe_kprobe); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; INIT_HLIST_HEAD(&empty_rp); kretprobe_hash_lock(current, &head, &flags); /* fixup registers */ #ifdef CONFIG_X86_64 regs->cs = __KERNEL_CS; + /* On x86-64, we use pt_regs->sp for return address holder. */ + frame_pointer = ®s->sp; #else regs->cs = __KERNEL_CS | get_kernel_rpl(); regs->gs = 0; + /* On x86-32, we use pt_regs->flags for return address holder. */ + frame_pointer = ®s->flags; #endif regs->ip = trampoline_address; regs->orig_ax = ~0UL; @@ -789,8 +824,25 @@ static __used void *trampoline_handler(struct pt_regs *regs) if (ri->task != current) /* another task is sharing our hash bucket */ continue; + /* + * Return probes must be pushed on this hash list correct + * order (same as return order) so that it can be poped + * correctly. However, if we find it is pushed it incorrect + * order, this means we find a function which should not be + * probed, because the wrong order entry is pushed on the + * path of processing other kretprobe itself. + */ + if (ri->fp != frame_pointer) { + if (!skipped) + pr_warn("kretprobe is stacked incorrectly. Trying to fixup.\n"); + skipped = true; + continue; + } orig_ret_address = (unsigned long)ri->ret_addr; + if (skipped) + pr_warn("%ps must be blacklisted because of incorrect kretprobe order\n", + ri->rp->kp.addr); if (orig_ret_address != trampoline_address) /* @@ -808,14 +860,15 @@ static __used void *trampoline_handler(struct pt_regs *regs) if (ri->task != current) /* another task is sharing our hash bucket */ continue; + if (ri->fp != frame_pointer) + continue; orig_ret_address = (unsigned long)ri->ret_addr; if (ri->rp && ri->rp->handler) { __this_cpu_write(current_kprobe, &ri->rp->kp); - get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; ri->ret_addr = correct_ret_addr; ri->rp->handler(ri, regs); - __this_cpu_write(current_kprobe, NULL); + __this_cpu_write(current_kprobe, &kretprobe_kprobe); } recycle_rp_inst(ri, &empty_rp); @@ -831,6 +884,9 @@ static __used void *trampoline_handler(struct pt_regs *regs) kretprobe_hash_unlock(current, &flags); + __this_cpu_write(current_kprobe, NULL); + preempt_enable(); + hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); kfree(ri); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5c93a65ee1e5..3f0cc828cc36 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -67,7 +67,7 @@ static int __init parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); -static DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64); +DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible; static int has_steal_clock = 0; /* diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 6135ae8ce036..b2463fcb20a8 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -113,7 +113,7 @@ static void do_sanity_check(struct mm_struct *mm, * tables. */ WARN_ON(!had_kernel_mapping); - if (static_cpu_has(X86_FEATURE_PTI)) + if (boot_cpu_has(X86_FEATURE_PTI)) WARN_ON(!had_user_mapping); } else { /* @@ -121,7 +121,7 @@ static void do_sanity_check(struct mm_struct *mm, * Sync the pgd to the usermode tables. */ WARN_ON(had_kernel_mapping); - if (static_cpu_has(X86_FEATURE_PTI)) + if (boot_cpu_has(X86_FEATURE_PTI)) WARN_ON(had_user_mapping); } } @@ -156,7 +156,7 @@ static void map_ldt_struct_to_user(struct mm_struct *mm) k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR); u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR); - if (static_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) + if (boot_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) set_pmd(u_pmd, *k_pmd); } @@ -181,7 +181,7 @@ static void map_ldt_struct_to_user(struct mm_struct *mm) { pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR); - if (static_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) + if (boot_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) set_pgd(kernel_to_user_pgdp(pgd), *pgd); } @@ -208,7 +208,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) spinlock_t *ptl; int i, nr_pages; - if (!static_cpu_has(X86_FEATURE_PTI)) + if (!boot_cpu_has(X86_FEATURE_PTI)) return 0; /* @@ -271,7 +271,7 @@ static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt) return; /* LDT map/unmap is only required for PTI */ - if (!static_cpu_has(X86_FEATURE_PTI)) + if (!boot_cpu_has(X86_FEATURE_PTI)) return; nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE); @@ -311,7 +311,7 @@ static void free_ldt_pgtables(struct mm_struct *mm) unsigned long start = LDT_BASE_ADDR; unsigned long end = LDT_END_ADDR; - if (!static_cpu_has(X86_FEATURE_PTI)) + if (!boot_cpu_has(X86_FEATURE_PTI)) return; tlb_gather_mmu(&tlb, mm, start, end); diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index b052e883dd8c..cfa3106faee4 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -87,7 +87,7 @@ void *module_alloc(unsigned long size) p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR + get_module_load_offset(), MODULES_END, GFP_KERNEL, - PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, + PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); if (p && (kasan_module_alloc(p, size) < 0)) { vfree(p); diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 3482460d984d..1bfe5c6e6cfe 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -598,8 +598,8 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) mpf_base = base; mpf_found = true; - pr_info("found SMP MP-table at [mem %#010lx-%#010lx] mapped at [%p]\n", - base, base + sizeof(*mpf) - 1, mpf); + pr_info("found SMP MP-table at [mem %#010lx-%#010lx]\n", + base, base + sizeof(*mpf) - 1); memblock_reserve(base, sizeof(*mpf)); if (mpf->physptr) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 18bc9b51ac9b..3755d0310026 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -21,13 +21,14 @@ #include <linux/ratelimit.h> #include <linux/slab.h> #include <linux/export.h> +#include <linux/atomic.h> #include <linux/sched/clock.h> #if defined(CONFIG_EDAC) #include <linux/edac.h> #endif -#include <linux/atomic.h> +#include <asm/cpu_entry_area.h> #include <asm/traps.h> #include <asm/mach_traps.h> #include <asm/nmi.h> @@ -487,6 +488,23 @@ static DEFINE_PER_CPU(unsigned long, nmi_cr2); * switch back to the original IDT. */ static DEFINE_PER_CPU(int, update_debug_stack); + +static bool notrace is_debug_stack(unsigned long addr) +{ + struct cea_exception_stacks *cs = __this_cpu_read(cea_exception_stacks); + unsigned long top = CEA_ESTACK_TOP(cs, DB); + unsigned long bot = CEA_ESTACK_BOT(cs, DB1); + + if (__this_cpu_read(debug_stack_usage)) + return true; + /* + * Note, this covers the guard page between DB and DB1 as well to + * avoid two checks. But by all means @addr can never point into + * the guard page. + */ + return addr >= bot && addr < top; +} +NOKPROBE_SYMBOL(is_debug_stack); #endif dotraplinkage notrace void diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c0e0101133f3..7bbaa6baf37f 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -121,7 +121,7 @@ DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key); void __init native_pv_lock_init(void) { - if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) + if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) static_branch_disable(&virt_spin_lock_key); } diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index c06c4c16c6b6..07c30ee17425 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -59,18 +59,34 @@ static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = { u64 perf_reg_value(struct pt_regs *regs, int idx) { + struct x86_perf_regs *perf_regs; + + if (idx >= PERF_REG_X86_XMM0 && idx < PERF_REG_X86_XMM_MAX) { + perf_regs = container_of(regs, struct x86_perf_regs, regs); + if (!perf_regs->xmm_regs) + return 0; + return perf_regs->xmm_regs[idx - PERF_REG_X86_XMM0]; + } + if (WARN_ON_ONCE(idx >= ARRAY_SIZE(pt_regs_offset))) return 0; return regs_get_register(regs, pt_regs_offset[idx]); } -#define REG_RESERVED (~((1ULL << PERF_REG_X86_MAX) - 1ULL)) - #ifdef CONFIG_X86_32 +#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_R8) | \ + (1ULL << PERF_REG_X86_R9) | \ + (1ULL << PERF_REG_X86_R10) | \ + (1ULL << PERF_REG_X86_R11) | \ + (1ULL << PERF_REG_X86_R12) | \ + (1ULL << PERF_REG_X86_R13) | \ + (1ULL << PERF_REG_X86_R14) | \ + (1ULL << PERF_REG_X86_R15)) + int perf_reg_validate(u64 mask) { - if (!mask || mask & REG_RESERVED) + if (!mask || (mask & REG_NOSUPPORT)) return -EINVAL; return 0; @@ -96,10 +112,7 @@ void perf_get_regs_user(struct perf_regs *regs_user, int perf_reg_validate(u64 mask) { - if (!mask || mask & REG_RESERVED) - return -EINVAL; - - if (mask & REG_NOSUPPORT) + if (!mask || (mask & REG_NOSUPPORT)) return -EINVAL; return 0; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 58ac7be52c7a..75fea0d48c0e 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -101,7 +101,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) dst->thread.vm86 = NULL; #endif - return fpu__copy(&dst->thread.fpu, &src->thread.fpu); + return fpu__copy(dst, src); } /* @@ -236,7 +236,7 @@ static int get_cpuid_mode(void) static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled) { - if (!static_cpu_has(X86_FEATURE_CPUID_FAULT)) + if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT)) return -ENODEV; if (cpuid_enabled) @@ -426,6 +426,8 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp, u64 msr = x86_spec_ctrl_base; bool updmsr = false; + lockdep_assert_irqs_disabled(); + /* * If TIF_SSBD is different, select the proper mitigation * method. Note that if SSBD mitigation is disabled or permanentely @@ -477,10 +479,12 @@ static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) void speculation_ctrl_update(unsigned long tif) { + unsigned long flags; + /* Forced update. Make sure all relevant TIF flags are different */ - preempt_disable(); + local_irq_save(flags); __speculation_ctrl_update(~tif, tif); - preempt_enable(); + local_irq_restore(flags); } /* Called from seccomp/prctl update */ @@ -666,7 +670,7 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) if (c->x86_vendor != X86_VENDOR_INTEL) return 0; - if (!cpu_has(c, X86_FEATURE_MWAIT) || static_cpu_has_bug(X86_BUG_MONITOR)) + if (!cpu_has(c, X86_FEATURE_MWAIT) || boot_cpu_has_bug(X86_BUG_MONITOR)) return 0; return 1; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index e471d8e6f0b2..2399e910d109 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -127,6 +127,13 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, struct task_struct *tsk; int err; + /* + * For a new task use the RESET flags value since there is no before. + * All the status flags are zero; DF and all the system flags must also + * be 0, specifically IF must be 0 because we context switch to the new + * task with interrupts disabled. + */ + frame->flags = X86_EFLAGS_FIXED; frame->bp = 0; frame->ret_addr = (unsigned long) ret_from_fork; p->thread.sp = (unsigned long) fork_frame; @@ -234,7 +241,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - switch_fpu_prepare(prev_fpu, cpu); + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_prepare(prev_fpu, cpu); /* * Save away %gs. No need to save %fs, as it was saved on the @@ -267,9 +275,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so - * the GDT and LDT are properly updated, and must be - * done before fpu__restore(), so the TS bit is up - * to date. + * the GDT and LDT are properly updated. */ arch_end_context_switch(next_p); @@ -290,10 +296,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (prev->gs | next->gs) lazy_load_gs(next->gs); - switch_fpu_finish(next_fpu, cpu); - this_cpu_write(current_task, next_p); + switch_fpu_finish(next_fpu); + /* Load the Intel cache allocation PQR MSR. */ resctrl_sched_in(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 6a62f4af9fcf..f8e1af380cdf 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -392,6 +392,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, childregs = task_pt_regs(p); fork_frame = container_of(childregs, struct fork_frame, regs); frame = &fork_frame->frame; + frame->bp = 0; frame->ret_addr = (unsigned long) ret_from_fork; p->thread.sp = (unsigned long) fork_frame; @@ -520,7 +521,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && this_cpu_read(irq_count) != -1); - switch_fpu_prepare(prev_fpu, cpu); + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_prepare(prev_fpu, cpu); /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). @@ -538,9 +540,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Leave lazy mode, flushing any hypercalls made here. This * must be done after loading TLS entries in the GDT but before - * loading segments that might reference them, and and it must - * be done before fpu__restore(), so the TS bit is up to - * date. + * loading segments that might reference them. */ arch_end_context_switch(next_p); @@ -568,14 +568,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) x86_fsgsbase_load(prev, next); - switch_fpu_finish(next_fpu, cpu); - /* * Switch the PDA and FPU contexts. */ this_cpu_write(current_task, next_p); this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); + switch_fpu_finish(next_fpu); + /* Reload sp0. */ update_task_stack(next_p); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 725624b6c0c0..09d6bded3c1e 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -81,6 +81,19 @@ static int __init set_bios_reboot(const struct dmi_system_id *d) return 0; } +/* + * Some machines don't handle the default ACPI reboot method and + * require the EFI reboot method: + */ +static int __init set_efi_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_EFI && !efi_runtime_disabled()) { + reboot_type = BOOT_EFI; + pr_info("%s series board detected. Selecting EFI-method for reboot.\n", d->ident); + } + return 0; +} + void __noreturn machine_real_restart(unsigned int type) { local_irq_disable(); @@ -108,7 +121,7 @@ void __noreturn machine_real_restart(unsigned int type) write_cr3(real_mode_header->trampoline_pgd); /* Exiting long mode will fail if CR4.PCIDE is set. */ - if (static_cpu_has(X86_FEATURE_PCID)) + if (boot_cpu_has(X86_FEATURE_PCID)) cr4_clear_bits(X86_CR4_PCIDE); #endif @@ -166,6 +179,14 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = { DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"), }, }, + { /* Handle reboot issue on Acer TravelMate X514-51T */ + .callback = set_efi_reboot, + .ident = "Acer TravelMate X514-51T", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate X514-51T"), + }, + }, /* Apple */ { /* Handle problems with rebooting on Apple MacBook5 */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3d872a527cd9..905dae880563 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -71,6 +71,7 @@ #include <linux/tboot.h> #include <linux/jiffies.h> #include <linux/mem_encrypt.h> +#include <linux/sizes.h> #include <linux/usb/xhci-dbgp.h> #include <video/edid.h> @@ -448,18 +449,17 @@ static void __init memblock_x86_reserve_range_setup_data(void) #ifdef CONFIG_KEXEC_CORE /* 16M alignment for crash kernel regions */ -#define CRASH_ALIGN (16 << 20) +#define CRASH_ALIGN SZ_16M /* * Keep the crash kernel below this limit. On 32 bits earlier kernels * would limit the kernel to the low 512 MiB due to mapping restrictions. - * On 64bit, old kexec-tools need to under 896MiB. */ #ifdef CONFIG_X86_32 -# define CRASH_ADDR_LOW_MAX (512 << 20) -# define CRASH_ADDR_HIGH_MAX (512 << 20) +# define CRASH_ADDR_LOW_MAX SZ_512M +# define CRASH_ADDR_HIGH_MAX SZ_512M #else -# define CRASH_ADDR_LOW_MAX (896UL << 20) +# define CRASH_ADDR_LOW_MAX SZ_4G # define CRASH_ADDR_HIGH_MAX MAXMEM #endif @@ -541,21 +541,27 @@ static void __init reserve_crashkernel(void) } /* 0 means: find the address automatically */ - if (crash_base <= 0) { + if (!crash_base) { /* * Set CRASH_ADDR_LOW_MAX upper bound for crash memory, - * as old kexec-tools loads bzImage below that, unless - * "crashkernel=size[KMG],high" is specified. + * crashkernel=x,high reserves memory over 4G, also allocates + * 256M extra low memory for DMA buffers and swiotlb. + * But the extra memory is not required for all machines. + * So try low memory first and fall back to high memory + * unless "crashkernel=size[KMG],high" is specified. */ - crash_base = memblock_find_in_range(CRASH_ALIGN, - high ? CRASH_ADDR_HIGH_MAX - : CRASH_ADDR_LOW_MAX, - crash_size, CRASH_ALIGN); + if (!high) + crash_base = memblock_find_in_range(CRASH_ALIGN, + CRASH_ADDR_LOW_MAX, + crash_size, CRASH_ALIGN); + if (!crash_base) + crash_base = memblock_find_in_range(CRASH_ALIGN, + CRASH_ADDR_HIGH_MAX, + crash_size, CRASH_ALIGN); if (!crash_base) { pr_info("crashkernel reservation failed - No suitable area found.\n"); return; } - } else { unsigned long long start; @@ -1005,13 +1011,11 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled(EFI_BOOT)) efi_init(); - dmi_scan_machine(); - dmi_memdev_walk(); - dmi_set_dump_stack_arch_desc(); + dmi_setup(); /* * VMware detection requires dmi to be available, so this - * needs to be done after dmi_scan_machine(), for the boot CPU. + * needs to be done after dmi_setup(), for the boot CPU. */ init_hypervisor_platform(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 4bf46575568a..86663874ef04 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -244,11 +244,6 @@ void __init setup_per_cpu_areas(void) per_cpu(x86_cpu_to_logical_apicid, cpu) = early_per_cpu_map(x86_cpu_to_logical_apicid, cpu); #endif -#ifdef CONFIG_X86_64 - per_cpu(irq_stack_ptr, cpu) = - per_cpu(irq_stack_union.irq_stack, cpu) + - IRQ_STACK_SIZE; -#endif #ifdef CONFIG_NUMA per_cpu(x86_cpu_to_node_map, cpu) = early_per_cpu_map(x86_cpu_to_node_map, cpu); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 08dfd4c1a4f9..364813cea647 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -132,16 +132,6 @@ static int restore_sigcontext(struct pt_regs *regs, COPY_SEG_CPL3(cs); COPY_SEG_CPL3(ss); -#ifdef CONFIG_X86_64 - /* - * Fix up SS if needed for the benefit of old DOSEMU and - * CRIU. - */ - if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && - user_64bit_mode(regs))) - force_valid_ss(regs); -#endif - get_user_ex(tmpflags, &sc->flags); regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); regs->orig_ax = -1; /* disable syscall checks */ @@ -150,6 +140,15 @@ static int restore_sigcontext(struct pt_regs *regs, buf = (void __user *)buf_val; } get_user_catch(err); +#ifdef CONFIG_X86_64 + /* + * Fix up SS if needed for the benefit of old DOSEMU and + * CRIU. + */ + if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && user_64bit_mode(regs))) + force_valid_ss(regs); +#endif + err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32)); force_iret(); @@ -206,7 +205,7 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, put_user_ex(regs->ss, &sc->ss); #endif /* CONFIG_X86_32 */ - put_user_ex(fpstate, &sc->fpstate); + put_user_ex(fpstate, (unsigned long __user *)&sc->fpstate); /* non-iBCS2 extensions.. */ put_user_ex(mask, &sc->oldmask); @@ -246,7 +245,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, unsigned long sp = regs->sp; unsigned long buf_fx = 0; int onsigstack = on_sig_stack(sp); - struct fpu *fpu = ¤t->thread.fpu; + int ret; /* redzone */ if (IS_ENABLED(CONFIG_X86_64)) @@ -265,11 +264,9 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, sp = (unsigned long) ka->sa.sa_restorer; } - if (fpu->initialized) { - sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32), - &buf_fx, &math_size); - *fpstate = (void __user *)sp; - } + sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32), + &buf_fx, &math_size); + *fpstate = (void __user *)sp; sp = align_sigframe(sp - frame_size); @@ -281,8 +278,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, return (void __user *)-1L; /* save i387 and extended state */ - if (fpu->initialized && - copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size) < 0) + ret = copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size); + if (ret < 0) return (void __user *)-1L; return (void __user *)sp; @@ -461,6 +458,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, { struct rt_sigframe __user *frame; void __user *fp = NULL; + unsigned long uc_flags; int err = 0; frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp); @@ -473,9 +471,11 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, return -EFAULT; } + uc_flags = frame_uc_flags(regs); + put_user_try { /* Create the ucontext. */ - put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags); + put_user_ex(uc_flags, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); save_altstack_ex(&frame->uc.uc_stack, regs->sp); @@ -541,6 +541,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, { #ifdef CONFIG_X86_X32_ABI struct rt_sigframe_x32 __user *frame; + unsigned long uc_flags; void __user *restorer; int err = 0; void __user *fpstate = NULL; @@ -555,9 +556,11 @@ static int x32_setup_rt_frame(struct ksignal *ksig, return -EFAULT; } + uc_flags = frame_uc_flags(regs); + put_user_try { /* Create the ucontext. */ - put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags); + put_user_ex(uc_flags, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); put_user_ex(0, &frame->uc.uc__pad0); @@ -569,7 +572,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, restorer = NULL; err |= -EFAULT; } - put_user_ex(restorer, &frame->pretcode); + put_user_ex(restorer, (unsigned long __user *)&frame->pretcode); } put_user_catch(err); err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, @@ -688,10 +691,7 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) sigset_t *set = sigmask_to_save(); compat_sigset_t *cset = (compat_sigset_t *) set; - /* - * Increment event counter and perform fixup for the pre-signal - * frame. - */ + /* Perform fixup for the pre-signal frame. */ rseq_signal_deliver(ksig, regs); /* Set up the stack frame */ @@ -763,8 +763,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) /* * Ensure the signal handler starts with the new fpu state. */ - if (fpu->initialized) - fpu__clear(fpu); + fpu__clear(fpu); } signal_setup_done(failed, ksig, stepping); } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ce1a67b70168..73e69aaaa117 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -455,7 +455,7 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) * multicore group inside a NUMA node. If this happens, we will * discard the MC level of the topology later. */ -static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { if (c->phys_proc_id == o->phys_proc_id) return true; @@ -546,7 +546,7 @@ void set_cpu_sibling_map(int cpu) for_each_cpu(i, cpu_sibling_setup_mask) { o = &cpu_data(i); - if ((i == cpu) || (has_mp && match_die(c, o))) { + if ((i == cpu) || (has_mp && match_pkg(c, o))) { link_mask(topology_core_cpumask, cpu, i); /* @@ -570,7 +570,7 @@ void set_cpu_sibling_map(int cpu) } else if (i != cpu && !c->booted_cores) c->booted_cores = cpu_data(i).booted_cores; } - if (match_die(c, o) && !topology_same_node(c, o)) + if (match_pkg(c, o) && !topology_same_node(c, o)) x86_has_numa_in_package = true; } @@ -935,20 +935,27 @@ out: return boot_error; } -void common_cpu_up(unsigned int cpu, struct task_struct *idle) +int common_cpu_up(unsigned int cpu, struct task_struct *idle) { + int ret; + /* Just in case we booted with a single CPU. */ alternatives_enable_smp(); per_cpu(current_task, cpu) = idle; + /* Initialize the interrupt stack(s) */ + ret = irq_init_percpu_irqstack(cpu); + if (ret) + return ret; + #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ - irq_ctx_init(cpu); per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle); #else initial_gs = per_cpu_offset(cpu); #endif + return 0; } /* @@ -1106,7 +1113,9 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) /* the FPU context is blank, nobody can own it */ per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; - common_cpu_up(cpu, tidle); + err = common_cpu_up(cpu, tidle); + if (err) + return err; err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered); if (err) { diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 5c2d71a1dc06..2abf27d7df6b 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -12,78 +12,31 @@ #include <asm/stacktrace.h> #include <asm/unwind.h> -static int save_stack_address(struct stack_trace *trace, unsigned long addr, - bool nosched) -{ - if (nosched && in_sched_functions(addr)) - return 0; - - if (trace->skip > 0) { - trace->skip--; - return 0; - } - - if (trace->nr_entries >= trace->max_entries) - return -1; - - trace->entries[trace->nr_entries++] = addr; - return 0; -} - -static void noinline __save_stack_trace(struct stack_trace *trace, - struct task_struct *task, struct pt_regs *regs, - bool nosched) +void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, + struct task_struct *task, struct pt_regs *regs) { struct unwind_state state; unsigned long addr; - if (regs) - save_stack_address(trace, regs->ip, nosched); + if (regs && !consume_entry(cookie, regs->ip, false)) + return; for (unwind_start(&state, task, regs, NULL); !unwind_done(&state); unwind_next_frame(&state)) { addr = unwind_get_return_address(&state); - if (!addr || save_stack_address(trace, addr, nosched)) + if (!addr || !consume_entry(cookie, addr, false)) break; } - - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } /* - * Save stack-backtrace addresses into a stack_trace buffer. + * This function returns an error if it detects any unreliable features of the + * stack. Otherwise it guarantees that the stack trace is reliable. + * + * If the task is not 'current', the caller *must* ensure the task is inactive. */ -void save_stack_trace(struct stack_trace *trace) -{ - trace->skip++; - __save_stack_trace(trace, current, NULL, false); -} -EXPORT_SYMBOL_GPL(save_stack_trace); - -void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) -{ - __save_stack_trace(trace, current, regs, false); -} - -void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) -{ - if (!try_get_task_stack(tsk)) - return; - - if (tsk == current) - trace->skip++; - __save_stack_trace(trace, tsk, NULL, true); - - put_task_stack(tsk); -} -EXPORT_SYMBOL_GPL(save_stack_trace_tsk); - -#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE - -static int __always_inline -__save_stack_trace_reliable(struct stack_trace *trace, - struct task_struct *task) +int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, + void *cookie, struct task_struct *task) { struct unwind_state state; struct pt_regs *regs; @@ -97,7 +50,7 @@ __save_stack_trace_reliable(struct stack_trace *trace, if (regs) { /* Success path for user tasks */ if (user_mode(regs)) - goto success; + return 0; /* * Kernel mode registers on the stack indicate an @@ -120,7 +73,7 @@ __save_stack_trace_reliable(struct stack_trace *trace, if (!addr) return -EINVAL; - if (save_stack_address(trace, addr, false)) + if (!consume_entry(cookie, addr, false)) return -EINVAL; } @@ -132,39 +85,9 @@ __save_stack_trace_reliable(struct stack_trace *trace, if (!(task->flags & (PF_KTHREAD | PF_IDLE))) return -EINVAL; -success: - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; - return 0; } -/* - * This function returns an error if it detects any unreliable features of the - * stack. Otherwise it guarantees that the stack trace is reliable. - * - * If the task is not 'current', the caller *must* ensure the task is inactive. - */ -int save_stack_trace_tsk_reliable(struct task_struct *tsk, - struct stack_trace *trace) -{ - int ret; - - /* - * If the task doesn't have a stack (e.g., a zombie), the stack is - * "reliably" empty. - */ - if (!try_get_task_stack(tsk)) - return 0; - - ret = __save_stack_trace_reliable(trace, tsk); - - put_task_stack(tsk); - - return ret; -} -#endif /* CONFIG_HAVE_RELIABLE_STACKTRACE */ - /* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ struct stack_frame_user { @@ -189,15 +112,15 @@ copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) return ret; } -static inline void __save_stack_trace_user(struct stack_trace *trace) +void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, + const struct pt_regs *regs) { - const struct pt_regs *regs = task_pt_regs(current); const void __user *fp = (const void __user *)regs->bp; - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = regs->ip; + if (!consume_entry(cookie, regs->ip, false)) + return; - while (trace->nr_entries < trace->max_entries) { + while (1) { struct stack_frame_user frame; frame.next_fp = NULL; @@ -207,8 +130,8 @@ static inline void __save_stack_trace_user(struct stack_trace *trace) if ((unsigned long)fp < regs->sp) break; if (frame.ret_addr) { - trace->entries[trace->nr_entries++] = - frame.ret_addr; + if (!consume_entry(cookie, frame.ret_addr, false)) + return; } if (fp == frame.next_fp) break; @@ -216,14 +139,3 @@ static inline void __save_stack_trace_user(struct stack_trace *trace) } } -void save_stack_trace_user(struct stack_trace *trace) -{ - /* - * Trace user stack if we are not a kernel thread - */ - if (current->mm) { - __save_stack_trace_user(trace); - } - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; -} diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 738bf42b0218..be5bc2e47c71 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -71,7 +71,7 @@ int _debug_hotplug_cpu(int cpu, int action) case 0: ret = cpu_down(cpu); if (!ret) { - pr_info("CPU %u is now offline\n", cpu); + pr_info("DEBUG_HOTPLUG_CPU0: CPU %u is now offline\n", cpu); dev->offline = true; kobject_uevent(&dev->kobj, KOBJ_OFFLINE); } else diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d26f9e9c3d83..8b6d03e55d2f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -456,7 +456,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) * which is all zeros which indicates MPX was not * responsible for the exception. */ - bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR); + bndcsr = get_xsave_field_ptr(XFEATURE_BNDCSR); if (!bndcsr) goto exit_trap; diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 3fae23834069..15b5e98a86f9 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -185,8 +185,7 @@ static void __init cyc2ns_init_boot_cpu(void) /* * Secondary CPUs do not run through tsc_init(), so set up * all the scale factors for all CPUs, assuming the same - * speed as the bootup CPU. (cpufreq notifiers will fix this - * up if their speed diverges) + * speed as the bootup CPU. */ static void __init cyc2ns_init_secondary_cpus(void) { @@ -283,6 +282,7 @@ int __init notsc_setup(char *str) __setup("notsc", notsc_setup); static int no_sched_irq_time; +static int no_tsc_watchdog; static int __init tsc_setup(char *str) { @@ -292,6 +292,8 @@ static int __init tsc_setup(char *str) no_sched_irq_time = 1; if (!strcmp(str, "unstable")) mark_tsc_unstable("boot parameter"); + if (!strcmp(str, "nowatchdog")) + no_tsc_watchdog = 1; return 1; } @@ -937,12 +939,12 @@ void tsc_restore_sched_clock_state(void) } #ifdef CONFIG_CPU_FREQ -/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency +/* + * Frequency scaling support. Adjust the TSC based timer when the CPU frequency * changes. * - * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's - * not that important because current Opteron setups do not support - * scaling on SMP anyroads. + * NOTE: On SMP the situation is not fixable in general, so simply mark the TSC + * as unstable and give up in those cases. * * Should fix up last_tsc too. Currently gettimeofday in the * first tick after the change will be slightly wrong. @@ -956,22 +958,22 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_freqs *freq = data; - unsigned long *lpj; - lpj = &boot_cpu_data.loops_per_jiffy; -#ifdef CONFIG_SMP - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - lpj = &cpu_data(freq->cpu).loops_per_jiffy; -#endif + if (num_online_cpus() > 1) { + mark_tsc_unstable("cpufreq changes on SMP"); + return 0; + } if (!ref_freq) { ref_freq = freq->old; - loops_per_jiffy_ref = *lpj; + loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy; tsc_khz_ref = tsc_khz; } + if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || - (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) { - *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); + (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) { + boot_cpu_data.loops_per_jiffy = + cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) @@ -1349,7 +1351,7 @@ static int __init init_tsc_clocksource(void) if (tsc_unstable) goto unreg; - if (tsc_clocksource_reliable) + if (tsc_clocksource_reliable || no_tsc_watchdog) clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3)) diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index a092b6b40c6b..6a38717d179c 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -369,7 +369,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) preempt_disable(); tsk->thread.sp0 += 16; - if (static_cpu_has(X86_FEATURE_SEP)) { + if (boot_cpu_has(X86_FEATURE_SEP)) { tsk->thread.sysenter_cs = 0; refresh_sysenter_cs(&tsk->thread); } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index bad8c51fee6e..0850b5149345 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -141,11 +141,11 @@ SECTIONS *(.text.__x86.indirect_thunk) __indirect_thunk_end = .; #endif - - /* End of text section */ - _etext = .; } :text = 0x9090 + /* End of text section */ + _etext = .; + NOTES :text :note EXCEPTION_TABLE(16) :text = 0x9090 @@ -362,7 +362,7 @@ SECTIONS .bss : AT(ADDR(.bss) - LOAD_OFFSET) { __bss_start = .; *(.bss..page_aligned) - *(.bss) + *(BSS_MAIN) BSS_DECRYPTED . = ALIGN(PAGE_SIZE); __bss_stop = .; @@ -403,7 +403,8 @@ SECTIONS */ #define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) + __per_cpu_load INIT_PER_CPU(gdt_page); -INIT_PER_CPU(irq_stack_union); +INIT_PER_CPU(fixed_percpu_data); +INIT_PER_CPU(irq_stack_backing_store); /* * Build-time check on the image size: @@ -412,8 +413,8 @@ INIT_PER_CPU(irq_stack_union); "kernel image bigger than KERNEL_IMAGE_SIZE"); #ifdef CONFIG_SMP -. = ASSERT((irq_stack_union == 0), - "irq_stack_union is not at start of per-cpu area"); +. = ASSERT((fixed_percpu_data == 0), + "fixed_percpu_data is not at start of per-cpu area"); #endif #endif /* CONFIG_X86_32 */ |