diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-10-30 15:40:57 -1000 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-10-30 15:40:57 -1000 |
commit | f0d25b5d0f8ef0ad35f1beff17da5843279d47a1 (patch) | |
tree | 73bb0fea8ed4a96d01517349dfb2588c3ae37e6d /arch/x86 | |
parent | 1641b9b04002c22f616a51a164c04b7f679d241f (diff) | |
parent | a1e2b8b36820d8c91275f207e77e91645b7c6836 (diff) | |
download | lwn-f0d25b5d0f8ef0ad35f1beff17da5843279d47a1.tar.gz lwn-f0d25b5d0f8ef0ad35f1beff17da5843279d47a1.zip |
Merge tag 'x86-mm-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm handling updates from Ingo Molnar:
- Add new NX-stack self-test
- Improve NUMA partial-CFMWS handling
- Fix #VC handler bugs resulting in SEV-SNP boot failures
- Drop the 4MB memory size restriction on minimal NUMA nodes
- Reorganize headers a bit, in preparation to header dependency
reduction efforts
- Misc cleanups & fixes
* tag 'x86-mm-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/mm: Drop the 4 MB restriction on minimal NUMA node memory size
selftests/x86/lam: Zero out buffer for readlink()
x86/sev: Drop unneeded #include
x86/sev: Move sev_setup_arch() to mem_encrypt.c
x86/tdx: Replace deprecated strncpy() with strtomem_pad()
selftests/x86/mm: Add new test that userspace stack is in fact NX
x86/sev: Make boot_ghcb_page[] static
x86/boot: Move x86_cache_alignment initialization to correct spot
x86/sev-es: Set x86_virt_bits to the correct value straight away, instead of a two-phase approach
x86/sev-es: Allow copy_from_kernel_nofault() in earlier boot
x86_64: Show CR4.PSE on auxiliaries like on BSP
x86/iommu/docs: Update AMD IOMMU specification document URL
x86/sev/docs: Update document URL in amd-memory-encryption.rst
x86/mm: Move arch_memory_failure() and arch_is_platform_page() definitions from <asm/processor.h> to <asm/pgtable.h>
ACPI/NUMA: Apply SRAT proximity domain to entire CFMWS window
x86/numa: Introduce numa_fill_memblks()
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/boot/compressed/sev.c | 2 | ||||
-rw-r--r-- | arch/x86/coco/tdx/tdx.c | 2 | ||||
-rw-r--r-- | arch/x86/include/asm/mem_encrypt.h | 4 | ||||
-rw-r--r-- | arch/x86/include/asm/numa.h | 7 | ||||
-rw-r--r-- | arch/x86/include/asm/pgtable.h | 8 | ||||
-rw-r--r-- | arch/x86/include/asm/processor.h | 8 | ||||
-rw-r--r-- | arch/x86/include/asm/sparsemem.h | 2 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/common.c | 40 | ||||
-rw-r--r-- | arch/x86/kernel/head_64.S | 4 | ||||
-rw-r--r-- | arch/x86/kernel/setup.c | 2 | ||||
-rw-r--r-- | arch/x86/mm/maccess.c | 19 | ||||
-rw-r--r-- | arch/x86/mm/mem_encrypt.c | 34 | ||||
-rw-r--r-- | arch/x86/mm/mem_encrypt_amd.c | 36 | ||||
-rw-r--r-- | arch/x86/mm/numa.c | 87 |
14 files changed, 168 insertions, 87 deletions
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 7816ff55025d..454acd7a2daf 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -25,7 +25,7 @@ #include "error.h" #include "../msr.h" -struct ghcb boot_ghcb_page __aligned(PAGE_SIZE); +static struct ghcb boot_ghcb_page __aligned(PAGE_SIZE); struct ghcb *boot_ghcb; /* diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 1d6b863c42b0..2e1be592c220 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -119,7 +119,7 @@ static void __noreturn tdx_panic(const char *msg) } message; /* VMM assumes '\0' in byte 65, if the message took all 64 bytes */ - strncpy(message.str, msg, 64); + strtomem_pad(message.str, msg, '\0'); args.r8 = message.r8; args.r9 = message.r9; diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 473b16d73b47..359ada486fa9 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -19,8 +19,10 @@ #ifdef CONFIG_X86_MEM_ENCRYPT void __init mem_encrypt_init(void); +void __init mem_encrypt_setup_arch(void); #else static inline void mem_encrypt_init(void) { } +static inline void __init mem_encrypt_setup_arch(void) { } #endif #ifdef CONFIG_AMD_MEM_ENCRYPT @@ -43,7 +45,6 @@ void __init sme_map_bootdata(char *real_mode_data); void __init sme_unmap_bootdata(char *real_mode_data); void __init sme_early_init(void); -void __init sev_setup_arch(void); void __init sme_encrypt_kernel(struct boot_params *bp); void __init sme_enable(struct boot_params *bp); @@ -73,7 +74,6 @@ static inline void __init sme_map_bootdata(char *real_mode_data) { } static inline void __init sme_unmap_bootdata(char *real_mode_data) { } static inline void __init sme_early_init(void) { } -static inline void __init sev_setup_arch(void) { } static inline void __init sme_encrypt_kernel(struct boot_params *bp) { } static inline void __init sme_enable(struct boot_params *bp) { } diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index e3bae2b60a0d..ef2844d69173 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -12,13 +12,6 @@ #define NR_NODE_MEMBLKS (MAX_NUMNODES*2) -/* - * Too small node sizes may confuse the VM badly. Usually they - * result from BIOS bugs. So dont recognize nodes as standalone - * NUMA entities that have less than this amount of RAM listed: - */ -#define NODE_MIN_SIZE (4*1024*1024) - extern int numa_off; /* diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index e02b179ec659..57bab91bbf50 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1716,6 +1716,14 @@ static inline bool pud_user_accessible_page(pud_t pud) } #endif +#ifdef CONFIG_X86_SGX +int arch_memory_failure(unsigned long pfn, int flags); +#define arch_memory_failure arch_memory_failure + +bool arch_is_platform_page(u64 paddr); +#define arch_is_platform_page arch_is_platform_page +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 6e30b27b1ebe..621ff0803b04 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -724,14 +724,6 @@ enum mds_mitigations { MDS_MITIGATION_VMWERV, }; -#ifdef CONFIG_X86_SGX -int arch_memory_failure(unsigned long pfn, int flags); -#define arch_memory_failure arch_memory_failure - -bool arch_is_platform_page(u64 paddr); -#define arch_is_platform_page arch_is_platform_page -#endif - extern bool gds_ucode_mitigated(void); #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index 64df897c0ee3..1be13b2dfe8b 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h @@ -37,6 +37,8 @@ extern int phys_to_target_node(phys_addr_t start); #define phys_to_target_node phys_to_target_node extern int memory_add_physaddr_to_nid(u64 start); #define memory_add_physaddr_to_nid memory_add_physaddr_to_nid +extern int numa_fill_memblks(u64 start, u64 end); +#define numa_fill_memblks numa_fill_memblks #endif #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0a3ea787ac51..3f8de9bce61a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1115,18 +1115,34 @@ void get_cpu_cap(struct cpuinfo_x86 *c) void get_cpu_address_sizes(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; + bool vp_bits_from_cpuid = true; - if (c->extended_cpuid_level >= 0x80000008) { + if (!cpu_has(c, X86_FEATURE_CPUID) || + (c->extended_cpuid_level < 0x80000008)) + vp_bits_from_cpuid = false; + + if (vp_bits_from_cpuid) { cpuid(0x80000008, &eax, &ebx, &ecx, &edx); c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; + } else { + if (IS_ENABLED(CONFIG_X86_64)) { + c->x86_clflush_size = 64; + c->x86_phys_bits = 36; + c->x86_virt_bits = 48; + } else { + c->x86_clflush_size = 32; + c->x86_virt_bits = 32; + c->x86_phys_bits = 32; + + if (cpu_has(c, X86_FEATURE_PAE) || + cpu_has(c, X86_FEATURE_PSE36)) + c->x86_phys_bits = 36; + } } -#ifdef CONFIG_X86_32 - else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36)) - c->x86_phys_bits = 36; -#endif c->x86_cache_bits = c->x86_phys_bits; + c->x86_cache_alignment = c->x86_clflush_size; } static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) @@ -1580,17 +1596,6 @@ static void __init cpu_parse_early_param(void) */ static void __init early_identify_cpu(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_64 - c->x86_clflush_size = 64; - c->x86_phys_bits = 36; - c->x86_virt_bits = 48; -#else - c->x86_clflush_size = 32; - c->x86_phys_bits = 32; - c->x86_virt_bits = 32; -#endif - c->x86_cache_alignment = c->x86_clflush_size; - memset(&c->x86_capability, 0, sizeof(c->x86_capability)); c->extended_cpuid_level = 0; @@ -1602,7 +1607,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) cpu_detect(c); get_cpu_vendor(c); get_cpu_cap(c); - get_cpu_address_sizes(c); setup_force_cpu_cap(X86_FEATURE_CPUID); cpu_parse_early_param(); @@ -1618,6 +1622,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_clear_cpu_cap(X86_FEATURE_CPUID); } + get_cpu_address_sizes(c); + setup_force_cpu_cap(X86_FEATURE_ALWAYS); cpu_set_bug_bits(c); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 6bbbef6586d1..086a2c3aaaa0 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -179,8 +179,8 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) movl $0, %ecx #endif - /* Enable PAE mode, PGE and LA57 */ - orl $(X86_CR4_PAE | X86_CR4_PGE), %ecx + /* Enable PAE mode, PSE, PGE and LA57 */ + orl $(X86_CR4_PAE | X86_CR4_PSE | X86_CR4_PGE), %ecx #ifdef CONFIG_X86_5LEVEL testl $1, __pgtable_l5_enabled(%rip) jz 1f diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3e05ecf8cf8d..ccd3ad29a1dc 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1120,7 +1120,7 @@ void __init setup_arch(char **cmdline_p) * Needs to run after memblock setup because it needs the physical * memory size. */ - sev_setup_arch(); + mem_encrypt_setup_arch(); efi_fake_memmap(); efi_find_mirror(); diff --git a/arch/x86/mm/maccess.c b/arch/x86/mm/maccess.c index 5a53c2cc169c..6993f026adec 100644 --- a/arch/x86/mm/maccess.c +++ b/arch/x86/mm/maccess.c @@ -9,12 +9,21 @@ bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) unsigned long vaddr = (unsigned long)unsafe_src; /* - * Range covering the highest possible canonical userspace address - * as well as non-canonical address range. For the canonical range - * we also need to include the userspace guard page. + * Do not allow userspace addresses. This disallows + * normal userspace and the userspace guard page: */ - return vaddr >= TASK_SIZE_MAX + PAGE_SIZE && - __is_canonical_address(vaddr, boot_cpu_data.x86_virt_bits); + if (vaddr < TASK_SIZE_MAX + PAGE_SIZE) + return false; + + /* + * Allow everything during early boot before 'x86_virt_bits' + * is initialized. Needed for instruction decoding in early + * exception handlers. + */ + if (!boot_cpu_data.x86_virt_bits) + return true; + + return __is_canonical_address(vaddr, boot_cpu_data.x86_virt_bits); } #else bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 9f27e14e185f..c290c55b632b 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -12,6 +12,7 @@ #include <linux/swiotlb.h> #include <linux/cc_platform.h> #include <linux/mem_encrypt.h> +#include <linux/virtio_anchor.h> /* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */ bool force_dma_unencrypted(struct device *dev) @@ -86,3 +87,36 @@ void __init mem_encrypt_init(void) print_mem_encrypt_feature_info(); } + +void __init mem_encrypt_setup_arch(void) +{ + phys_addr_t total_mem = memblock_phys_mem_size(); + unsigned long size; + + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) + return; + + /* + * For SEV and TDX, all DMA has to occur via shared/unencrypted pages. + * Kernel uses SWIOTLB to make this happen without changing device + * drivers. However, depending on the workload being run, the + * default 64MB of SWIOTLB may not be enough and SWIOTLB may + * run out of buffers for DMA, resulting in I/O errors and/or + * performance degradation especially with high I/O workloads. + * + * Adjust the default size of SWIOTLB using a percentage of guest + * memory for SWIOTLB buffers. Also, as the SWIOTLB bounce buffer + * memory is allocated from low memory, ensure that the adjusted size + * is within the limits of low available memory. + * + * The percentage of guest memory used here for SWIOTLB buffers + * is more of an approximation of the static adjustment which + * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6% + */ + size = total_mem * 6 / 100; + size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G); + swiotlb_adjust_size(size); + + /* Set restricted memory access for virtio. */ + virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); +} diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index 6faea41e99b6..a68f2dda0948 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -19,8 +19,6 @@ #include <linux/kernel.h> #include <linux/bitops.h> #include <linux/dma-mapping.h> -#include <linux/virtio_config.h> -#include <linux/virtio_anchor.h> #include <linux/cc_platform.h> #include <asm/tlbflush.h> @@ -215,40 +213,6 @@ void __init sme_map_bootdata(char *real_mode_data) __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true); } -void __init sev_setup_arch(void) -{ - phys_addr_t total_mem = memblock_phys_mem_size(); - unsigned long size; - - if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) - return; - - /* - * For SEV, all DMA has to occur via shared/unencrypted pages. - * SEV uses SWIOTLB to make this happen without changing device - * drivers. However, depending on the workload being run, the - * default 64MB of SWIOTLB may not be enough and SWIOTLB may - * run out of buffers for DMA, resulting in I/O errors and/or - * performance degradation especially with high I/O workloads. - * - * Adjust the default size of SWIOTLB for SEV guests using - * a percentage of guest memory for SWIOTLB buffers. - * Also, as the SWIOTLB bounce buffer memory is allocated - * from low memory, ensure that the adjusted size is within - * the limits of low available memory. - * - * The percentage of guest memory used here for SWIOTLB buffers - * is more of an approximation of the static adjustment which - * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6% - */ - size = total_mem * 6 / 100; - size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G); - swiotlb_adjust_size(size); - - /* Set restricted memory access for virtio. */ - virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); -} - static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot) { unsigned long pfn = 0; diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index c79f12e449ea..4a17f663f949 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -12,6 +12,7 @@ #include <linux/nodemask.h> #include <linux/sched.h> #include <linux/topology.h> +#include <linux/sort.h> #include <asm/e820/api.h> #include <asm/proto.h> @@ -602,13 +603,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) if (start >= end) continue; - /* - * Don't confuse VM with a node that doesn't have the - * minimum amount of memory: - */ - if (end && (end - start) < NODE_MIN_SIZE) - continue; - alloc_node_data(nid); } @@ -964,4 +958,83 @@ int memory_add_physaddr_to_nid(u64 start) return nid; } EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); + +static int __init cmp_memblk(const void *a, const void *b) +{ + const struct numa_memblk *ma = *(const struct numa_memblk **)a; + const struct numa_memblk *mb = *(const struct numa_memblk **)b; + + return ma->start - mb->start; +} + +static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata; + +/** + * numa_fill_memblks - Fill gaps in numa_meminfo memblks + * @start: address to begin fill + * @end: address to end fill + * + * Find and extend numa_meminfo memblks to cover the @start-@end + * physical address range, such that the first memblk includes + * @start, the last memblk includes @end, and any gaps in between + * are filled. + * + * RETURNS: + * 0 : Success + * NUMA_NO_MEMBLK : No memblk exists in @start-@end range + */ + +int __init numa_fill_memblks(u64 start, u64 end) +{ + struct numa_memblk **blk = &numa_memblk_list[0]; + struct numa_meminfo *mi = &numa_meminfo; + int count = 0; + u64 prev_end; + + /* + * Create a list of pointers to numa_meminfo memblks that + * overlap start, end. Exclude (start == bi->end) since + * end addresses in both a CFMWS range and a memblk range + * are exclusive. + * + * This list of pointers is used to make in-place changes + * that fill out the numa_meminfo memblks. + */ + for (int i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *bi = &mi->blk[i]; + + if (start < bi->end && end >= bi->start) { + blk[count] = &mi->blk[i]; + count++; + } + } + if (!count) + return NUMA_NO_MEMBLK; + + /* Sort the list of pointers in memblk->start order */ + sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL); + + /* Make sure the first/last memblks include start/end */ + blk[0]->start = min(blk[0]->start, start); + blk[count - 1]->end = max(blk[count - 1]->end, end); + + /* + * Fill any gaps by tracking the previous memblks + * end address and backfilling to it if needed. + */ + prev_end = blk[0]->end; + for (int i = 1; i < count; i++) { + struct numa_memblk *curr = blk[i]; + + if (prev_end >= curr->start) { + if (prev_end < curr->end) + prev_end = curr->end; + } else { + curr->start = prev_end; + prev_end = curr->end; + } + } + return 0; +} + #endif |