From 2a25dc7c79c92c6cba45c6218c49395173be80bf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 17 Sep 2018 16:29:07 +0200 Subject: x86/mm/init32: Mark text and rodata RO in one go The sequence of marking text and rodata read-only in 32bit init is: set_ro(text); kernel_set_to_readonly = 1; set_ro(rodata); When kernel_set_to_readonly is 1 it enables the protection mechanism in CPA for the read only regions. With the upcoming checks for existing mappings this consequently triggers the warning about an existing mapping being incorrect vs. static protections because rodata has not been converted yet. There is no technical reason to split the two, so just combine the RO protection to convert text and rodata in one go. Convert the printks to pr_info while at it. Reported-by: kernel test robot Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Peter Zijlstra Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180917143545.731701535@linutronix.de --- arch/x86/mm/init_32.c | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 979e0a02cbe1..142c7d9f89cc 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -923,34 +923,19 @@ static void mark_nxdata_nx(void) void mark_rodata_ro(void) { unsigned long start = PFN_ALIGN(_text); - unsigned long size = PFN_ALIGN(_etext) - start; + unsigned long size = (unsigned long)__end_rodata - start; set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); - printk(KERN_INFO "Write protecting the kernel text: %luk\n", + pr_info("Write protecting kernel text and read-only data: %luk\n", size >> 10); kernel_set_to_readonly = 1; #ifdef CONFIG_CPA_DEBUG - printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", - start, start+size); - set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT); - - printk(KERN_INFO "Testing CPA: write protecting again\n"); - set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); -#endif - - start += size; - size = (unsigned long)__end_rodata - start; - set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); - printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", - size >> 10); - -#ifdef CONFIG_CPA_DEBUG - printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size); + pr_info("Testing CPA: Reverting %lx-%lx\n", start, start + size); set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT); - printk(KERN_INFO "Testing CPA: write protecting again\n"); + pr_info("Testing CPA: write protecting again\n"); set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); #endif mark_nxdata_nx(); -- cgit v1.2.3 From 8679de0959e65ee7f78db6405a8d23e61665751d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 17 Sep 2018 16:29:08 +0200 Subject: x86/mm/cpa: Split, rename and clean up try_preserve_large_page() Avoid the extra variable and gotos by splitting the function into the actual algorithm and a callable function which contains the lock protection. Rename it to should_split_large_page() while at it so the return values make actually sense. Clean up the code flow, comments and general whitespace damage while at it. No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Peter Zijlstra Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180917143545.830507216@linutronix.de --- arch/x86/mm/pageattr.c | 121 ++++++++++++++++++++++++------------------------- 1 file changed, 60 insertions(+), 61 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 51a5a69ecac9..36f5d711ddbe 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -421,18 +421,18 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, */ pte_t *lookup_address(unsigned long address, unsigned int *level) { - return lookup_address_in_pgd(pgd_offset_k(address), address, level); + return lookup_address_in_pgd(pgd_offset_k(address), address, level); } EXPORT_SYMBOL_GPL(lookup_address); static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, unsigned int *level) { - if (cpa->pgd) + if (cpa->pgd) return lookup_address_in_pgd(cpa->pgd + pgd_index(address), address, level); - return lookup_address(address, level); + return lookup_address(address, level); } /* @@ -549,27 +549,22 @@ static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot) return prot; } -static int -try_preserve_large_page(pte_t *kpte, unsigned long address, - struct cpa_data *cpa) +static int __should_split_large_page(pte_t *kpte, unsigned long address, + struct cpa_data *cpa) { - unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn; - pte_t new_pte, old_pte, *tmp; + unsigned long numpages, pmask, psize, lpaddr, addr, pfn, old_pfn; pgprot_t old_prot, new_prot, req_prot; - int i, do_split = 1; + pte_t new_pte, old_pte, *tmp; enum pg_level level; + int i; - if (cpa->force_split) - return 1; - - spin_lock(&pgd_lock); /* * Check for races, another CPU might have split this page * up already: */ tmp = _lookup_address_cpa(cpa, address, &level); if (tmp != kpte) - goto out_unlock; + return 1; switch (level) { case PG_LEVEL_2M: @@ -581,8 +576,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, old_pfn = pud_pfn(*(pud_t *)kpte); break; default: - do_split = -EINVAL; - goto out_unlock; + return -EINVAL; } psize = page_level_size(level); @@ -592,8 +586,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * Calculate the number of pages, which fit into this large * page starting at address: */ - nextpage_addr = (address + psize) & pmask; - numpages = (nextpage_addr - address) >> PAGE_SHIFT; + lpaddr = (address + psize) & pmask; + numpages = (lpaddr - address) >> PAGE_SHIFT; if (numpages < cpa->numpages) cpa->numpages = numpages; @@ -620,57 +614,62 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, pgprot_val(req_prot) |= _PAGE_PSE; /* - * old_pfn points to the large page base pfn. So we need - * to add the offset of the virtual address: + * old_pfn points to the large page base pfn. So we need to add the + * offset of the virtual address: */ pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT); cpa->pfn = pfn; - new_prot = static_protections(req_prot, address, pfn); + /* + * Calculate the large page base address and the number of 4K pages + * in the large page + */ + lpaddr = address & pmask; + numpages = psize >> PAGE_SHIFT; /* - * We need to check the full range, whether - * static_protection() requires a different pgprot for one of - * the pages in the range we try to preserve: + * Make sure that the requested pgprot does not violate the static + * protections. Check the full large page whether one of the pages + * in it results in a different pgprot than the first one of the + * requested range. If yes, then the page needs to be split. */ - addr = address & pmask; + new_prot = static_protections(req_prot, address, pfn); pfn = old_pfn; - for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) { + for (i = 0, addr = lpaddr; i < numpages; i++, addr += PAGE_SIZE, pfn++) { pgprot_t chk_prot = static_protections(req_prot, addr, pfn); if (pgprot_val(chk_prot) != pgprot_val(new_prot)) - goto out_unlock; + return 1; } - /* - * If there are no changes, return. maxpages has been updated - * above: - */ - if (pgprot_val(new_prot) == pgprot_val(old_prot)) { - do_split = 0; - goto out_unlock; - } + /* If there are no changes, return. */ + if (pgprot_val(new_prot) == pgprot_val(old_prot)) + return 0; /* - * We need to change the attributes. Check, whether we can - * change the large page in one go. We request a split, when - * the address is not aligned and the number of pages is - * smaller than the number of pages in the large page. Note - * that we limited the number of possible pages already to - * the number of pages in the large page. + * Verify that the address is aligned and the number of pages + * covers the full page. */ - if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) { - /* - * The address is aligned and the number of pages - * covers the full page. - */ - new_pte = pfn_pte(old_pfn, new_prot); - __set_pmd_pte(kpte, address, new_pte); - cpa->flags |= CPA_FLUSHTLB; - do_split = 0; - } + if (address != lpaddr || cpa->numpages != numpages) + return 1; + + /* All checks passed. Update the large page mapping. */ + new_pte = pfn_pte(old_pfn, new_prot); + __set_pmd_pte(kpte, address, new_pte); + cpa->flags |= CPA_FLUSHTLB; + return 0; +} + +static int should_split_large_page(pte_t *kpte, unsigned long address, + struct cpa_data *cpa) +{ + int do_split; + + if (cpa->force_split) + return 1; -out_unlock: + spin_lock(&pgd_lock); + do_split = __should_split_large_page(kpte, address, cpa); spin_unlock(&pgd_lock); return do_split; @@ -1273,7 +1272,7 @@ repeat: * Check, whether we can keep the large page intact * and just change the pte: */ - do_split = try_preserve_large_page(kpte, address, cpa); + do_split = should_split_large_page(kpte, address, cpa); /* * When the range fits into the existing large page, * return. cp->numpages and cpa->tlbflush have been updated in @@ -1288,23 +1287,23 @@ repeat: err = split_large_page(cpa, kpte, address); if (!err) { /* - * Do a global flush tlb after splitting the large page - * and before we do the actual change page attribute in the PTE. - * - * With out this, we violate the TLB application note, that says - * "The TLBs may contain both ordinary and large-page + * Do a global flush tlb after splitting the large page + * and before we do the actual change page attribute in the PTE. + * + * With out this, we violate the TLB application note, that says + * "The TLBs may contain both ordinary and large-page * translations for a 4-KByte range of linear addresses. This * may occur if software modifies the paging structures so that * the page size used for the address range changes. If the two * translations differ with respect to page frame or attributes * (e.g., permissions), processor behavior is undefined and may * be implementation-specific." - * - * We do this global tlb flush inside the cpa_lock, so that we + * + * We do this global tlb flush inside the cpa_lock, so that we * don't allow any other cpu, with stale tlb entries change the * page attribute in parallel, that also falls into the * just split large page entry. - */ + */ flush_tlb_all(); goto repeat; } -- cgit v1.2.3 From afd7969a99e072e6aa0d88511176d4d2f3009fd9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 17 Sep 2018 16:29:09 +0200 Subject: x86/mm/cpa: Rework static_protections() static_protections() is pretty unreadable. Split it up into separate checks for each protection area. Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Peter Zijlstra Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180917143545.913005317@linutronix.de --- arch/x86/mm/pageattr.c | 162 ++++++++++++++++++++++++++++++------------------- 1 file changed, 98 insertions(+), 64 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 36f5d711ddbe..9731aeeebe71 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -286,84 +286,118 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache, } } +#ifdef CONFIG_PCI_BIOS /* - * Certain areas of memory on x86 require very specific protection flags, - * for example the BIOS area or kernel text. Callers don't always get this - * right (again, ioremap() on BIOS memory is not uncommon) so this function - * checks and fixes these known static required protection bits. + * The BIOS area between 640k and 1Mb needs to be executable for PCI BIOS + * based config access (CONFIG_PCI_GOBIOS) support. */ -static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, - unsigned long pfn) -{ - pgprot_t forbidden = __pgprot(0); +#define BIOS_PFN PFN_DOWN(BIOS_BEGIN) +#define BIOS_PFN_END PFN_DOWN(BIOS_END) - /* - * The BIOS area between 640k and 1Mb needs to be executable for - * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. - */ -#ifdef CONFIG_PCI_BIOS - if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) - pgprot_val(forbidden) |= _PAGE_NX; +static pgprotval_t protect_pci_bios(unsigned long pfn) +{ + if (pcibios_enabled && within(pfn, BIOS_PFN, BIOS_PFN_END)) + return _PAGE_NX; + return 0; +} +#else +static pgprotval_t protect_pci_bios(unsigned long pfn) +{ + return 0; +} #endif - /* - * The kernel text needs to be executable for obvious reasons - * Does not cover __inittext since that is gone later on. On - * 64bit we do not enforce !NX on the low mapping - */ - if (within(address, (unsigned long)_text, (unsigned long)_etext)) - pgprot_val(forbidden) |= _PAGE_NX; +/* + * The .rodata section needs to be read-only. Using the pfn catches all + * aliases. This also includes __ro_after_init, so do not enforce until + * kernel_set_to_readonly is true. + */ +static pgprotval_t protect_rodata(unsigned long pfn) +{ + unsigned long start_pfn = __pa_symbol(__start_rodata) >> PAGE_SHIFT; + unsigned long end_pfn = __pa_symbol(__end_rodata) >> PAGE_SHIFT; - /* - * The .rodata section needs to be read-only. Using the pfn - * catches all aliases. This also includes __ro_after_init, - * so do not enforce until kernel_set_to_readonly is true. - */ - if (kernel_set_to_readonly && - within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT, - __pa_symbol(__end_rodata) >> PAGE_SHIFT)) - pgprot_val(forbidden) |= _PAGE_RW; + if (kernel_set_to_readonly && within(pfn, start_pfn, end_pfn)) + return _PAGE_RW; + return 0; +} + +/* + * Protect kernel text against becoming non executable by forbidding + * _PAGE_NX. This protects only the high kernel mapping (_text -> _etext) + * out of which the kernel actually executes. Do not protect the low + * mapping. + * + * This does not cover __inittext since that is gone after boot. + */ +static pgprotval_t protect_kernel_text(unsigned long address) +{ + if (within(address, (unsigned long)_text, (unsigned long)_etext)) + return _PAGE_NX; + return 0; +} #if defined(CONFIG_X86_64) +/* + * Once the kernel maps the text as RO (kernel_set_to_readonly is set), + * kernel text mappings for the large page aligned text, rodata sections + * will be always read-only. For the kernel identity mappings covering the + * holes caused by this alignment can be anything that user asks. + * + * This will preserve the large page mappings for kernel text/data at no + * extra cost. + */ +static pgprotval_t protect_kernel_text_ro(unsigned long address) +{ + unsigned long end = (unsigned long)__end_rodata_hpage_align; + unsigned long start = (unsigned long)_text; + unsigned int level; + + if (!kernel_set_to_readonly || !within(address, start, end)) + return 0; /* - * Once the kernel maps the text as RO (kernel_set_to_readonly is set), - * kernel text mappings for the large page aligned text, rodata sections - * will be always read-only. For the kernel identity mappings covering - * the holes caused by this alignment can be anything that user asks. + * Don't enforce the !RW mapping for the kernel text mapping, if + * the current mapping is already using small page mapping. No + * need to work hard to preserve large page mappings in this case. * - * This will preserve the large page mappings for kernel text/data - * at no extra cost. + * This also fixes the Linux Xen paravirt guest boot failure caused + * by unexpected read-only mappings for kernel identity + * mappings. In this paravirt guest case, the kernel text mapping + * and the kernel identity mapping share the same page-table pages, + * so the protections for kernel text and identity mappings have to + * be the same. */ - if (kernel_set_to_readonly && - within(address, (unsigned long)_text, - (unsigned long)__end_rodata_hpage_align)) { - unsigned int level; - - /* - * Don't enforce the !RW mapping for the kernel text mapping, - * if the current mapping is already using small page mapping. - * No need to work hard to preserve large page mappings in this - * case. - * - * This also fixes the Linux Xen paravirt guest boot failure - * (because of unexpected read-only mappings for kernel identity - * mappings). In this paravirt guest case, the kernel text - * mapping and the kernel identity mapping share the same - * page-table pages. Thus we can't really use different - * protections for the kernel text and identity mappings. Also, - * these shared mappings are made of small page mappings. - * Thus this don't enforce !RW mapping for small page kernel - * text mapping logic will help Linux Xen parvirt guest boot - * as well. - */ - if (lookup_address(address, &level) && (level != PG_LEVEL_4K)) - pgprot_val(forbidden) |= _PAGE_RW; - } + if (lookup_address(address, &level) && (level != PG_LEVEL_4K)) + return _PAGE_RW; + return 0; +} +#else +static pgprotval_t protect_kernel_text_ro(unsigned long address) +{ + return 0; +} #endif - prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); +/* + * Certain areas of memory on x86 require very specific protection flags, + * for example the BIOS area or kernel text. Callers don't always get this + * right (again, ioremap() on BIOS memory is not uncommon) so this function + * checks and fixes these known static required protection bits. + */ +static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, + unsigned long pfn) +{ + pgprotval_t forbidden; + + /* Operate on the virtual address */ + forbidden = protect_kernel_text(address); + forbidden |= protect_kernel_text_ro(address); - return prot; + /* Check the PFN directly */ + forbidden |= protect_pci_bios(pfn); + forbidden |= protect_rodata(pfn); + + return __pgprot(pgprot_val(prot) & ~forbidden); } /* -- cgit v1.2.3 From 91ee8f5c1f50a1f4096c178a93a8da46ce3f6cc8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 17 Sep 2018 16:29:10 +0200 Subject: x86/mm/cpa: Allow range check for static protections Checking static protections only page by page is slow especially for huge pages. To allow quick checks over a complete range, add the ability to do that. Make the checks inclusive so the ranges can be directly used for debug output later. No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Peter Zijlstra Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180917143545.995734490@linutronix.de --- arch/x86/mm/pageattr.c | 69 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 44 insertions(+), 25 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 9731aeeebe71..61ff27848452 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -286,22 +286,29 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache, } } +static bool overlaps(unsigned long r1_start, unsigned long r1_end, + unsigned long r2_start, unsigned long r2_end) +{ + return (r1_start <= r2_end && r1_end >= r2_start) || + (r2_start <= r1_end && r2_end >= r1_start); +} + #ifdef CONFIG_PCI_BIOS /* * The BIOS area between 640k and 1Mb needs to be executable for PCI BIOS * based config access (CONFIG_PCI_GOBIOS) support. */ #define BIOS_PFN PFN_DOWN(BIOS_BEGIN) -#define BIOS_PFN_END PFN_DOWN(BIOS_END) +#define BIOS_PFN_END PFN_DOWN(BIOS_END - 1) -static pgprotval_t protect_pci_bios(unsigned long pfn) +static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn) { - if (pcibios_enabled && within(pfn, BIOS_PFN, BIOS_PFN_END)) + if (pcibios_enabled && overlaps(spfn, epfn, BIOS_PFN, BIOS_PFN_END)) return _PAGE_NX; return 0; } #else -static pgprotval_t protect_pci_bios(unsigned long pfn) +static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn) { return 0; } @@ -312,12 +319,17 @@ static pgprotval_t protect_pci_bios(unsigned long pfn) * aliases. This also includes __ro_after_init, so do not enforce until * kernel_set_to_readonly is true. */ -static pgprotval_t protect_rodata(unsigned long pfn) +static pgprotval_t protect_rodata(unsigned long spfn, unsigned long epfn) { - unsigned long start_pfn = __pa_symbol(__start_rodata) >> PAGE_SHIFT; - unsigned long end_pfn = __pa_symbol(__end_rodata) >> PAGE_SHIFT; + unsigned long epfn_ro, spfn_ro = PFN_DOWN(__pa_symbol(__start_rodata)); + + /* + * Note: __end_rodata is at page aligned and not inclusive, so + * subtract 1 to get the last enforced PFN in the rodata area. + */ + epfn_ro = PFN_DOWN(__pa_symbol(__end_rodata)) - 1; - if (kernel_set_to_readonly && within(pfn, start_pfn, end_pfn)) + if (kernel_set_to_readonly && overlaps(spfn, epfn, spfn_ro, epfn_ro)) return _PAGE_RW; return 0; } @@ -330,9 +342,12 @@ static pgprotval_t protect_rodata(unsigned long pfn) * * This does not cover __inittext since that is gone after boot. */ -static pgprotval_t protect_kernel_text(unsigned long address) +static pgprotval_t protect_kernel_text(unsigned long start, unsigned long end) { - if (within(address, (unsigned long)_text, (unsigned long)_etext)) + unsigned long t_end = (unsigned long)_etext - 1; + unsigned long t_start = (unsigned long)_text; + + if (overlaps(start, end, t_start, t_end)) return _PAGE_NX; return 0; } @@ -347,13 +362,14 @@ static pgprotval_t protect_kernel_text(unsigned long address) * This will preserve the large page mappings for kernel text/data at no * extra cost. */ -static pgprotval_t protect_kernel_text_ro(unsigned long address) +static pgprotval_t protect_kernel_text_ro(unsigned long start, + unsigned long end) { - unsigned long end = (unsigned long)__end_rodata_hpage_align; - unsigned long start = (unsigned long)_text; + unsigned long t_end = (unsigned long)__end_rodata_hpage_align - 1; + unsigned long t_start = (unsigned long)_text; unsigned int level; - if (!kernel_set_to_readonly || !within(address, start, end)) + if (!kernel_set_to_readonly || !overlaps(start, end, t_start, t_end)) return 0; /* * Don't enforce the !RW mapping for the kernel text mapping, if @@ -367,12 +383,13 @@ static pgprotval_t protect_kernel_text_ro(unsigned long address) * so the protections for kernel text and identity mappings have to * be the same. */ - if (lookup_address(address, &level) && (level != PG_LEVEL_4K)) + if (lookup_address(start, &level) && (level != PG_LEVEL_4K)) return _PAGE_RW; return 0; } #else -static pgprotval_t protect_kernel_text_ro(unsigned long address) +static pgprotval_t protect_kernel_text_ro(unsigned long start, + unsigned long end) { return 0; } @@ -384,18 +401,20 @@ static pgprotval_t protect_kernel_text_ro(unsigned long address) * right (again, ioremap() on BIOS memory is not uncommon) so this function * checks and fixes these known static required protection bits. */ -static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, - unsigned long pfn) +static inline pgprot_t static_protections(pgprot_t prot, unsigned long start, + unsigned long pfn, unsigned long npg) { pgprotval_t forbidden; + unsigned long end; /* Operate on the virtual address */ - forbidden = protect_kernel_text(address); - forbidden |= protect_kernel_text_ro(address); + end = start + npg * PAGE_SIZE - 1; + forbidden = protect_kernel_text(start, end); + forbidden |= protect_kernel_text_ro(start, end); /* Check the PFN directly */ - forbidden |= protect_pci_bios(pfn); - forbidden |= protect_rodata(pfn); + forbidden |= protect_pci_bios(pfn, pfn + npg - 1); + forbidden |= protect_rodata(pfn, pfn + npg - 1); return __pgprot(pgprot_val(prot) & ~forbidden); } @@ -667,10 +686,10 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address, * in it results in a different pgprot than the first one of the * requested range. If yes, then the page needs to be split. */ - new_prot = static_protections(req_prot, address, pfn); + new_prot = static_protections(req_prot, address, pfn, 1); pfn = old_pfn; for (i = 0, addr = lpaddr; i < numpages; i++, addr += PAGE_SIZE, pfn++) { - pgprot_t chk_prot = static_protections(req_prot, addr, pfn); + pgprot_t chk_prot = static_protections(req_prot, addr, pfn, 1); if (pgprot_val(chk_prot) != pgprot_val(new_prot)) return 1; @@ -1280,7 +1299,7 @@ repeat: pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); - new_prot = static_protections(new_prot, address, pfn); + new_prot = static_protections(new_prot, address, pfn, 1); new_prot = pgprot_clear_protnone_bits(new_prot); -- cgit v1.2.3 From 4046460b867f8b041c81c26c09d3bcad6d6e814e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 17 Sep 2018 16:29:11 +0200 Subject: x86/mm/cpa: Add debug mechanism The whole static protection magic is silently fixing up anything which is handed in. That's just wrong. The offending call sites need to be fixed. Add a debug mechanism which emits a warning if a requested mapping needs to be fixed up. The DETECT debug mechanism is really not meant to be enabled except for developers, so limit the output hard to the protection fixups. Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Peter Zijlstra Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180917143546.078998733@linutronix.de --- arch/x86/mm/pageattr.c | 61 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 52 insertions(+), 9 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 61ff27848452..1f2519055b30 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -42,6 +42,13 @@ struct cpa_data { struct page **pages; }; +enum cpa_warn { + CPA_PROTECT, + CPA_DETECT, +}; + +static const int cpa_warn_level = CPA_PROTECT; + /* * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings) * using cpa_lock. So that we don't allow any other cpu, with stale large tlb @@ -395,6 +402,28 @@ static pgprotval_t protect_kernel_text_ro(unsigned long start, } #endif +static inline bool conflicts(pgprot_t prot, pgprotval_t val) +{ + return (pgprot_val(prot) & ~val) != pgprot_val(prot); +} + +static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val, + unsigned long start, unsigned long end, + unsigned long pfn, const char *txt) +{ + static const char *lvltxt[] = { + [CPA_PROTECT] = "protect", + [CPA_DETECT] = "detect", + }; + + if (warnlvl > cpa_warn_level || !conflicts(prot, val)) + return; + + pr_warn("CPA %8s %10s: 0x%016lx - 0x%016lx PFN %lx req %016llx prevent %016llx\n", + lvltxt[warnlvl], txt, start, end, pfn, (unsigned long long)pgprot_val(prot), + (unsigned long long)val); +} + /* * Certain areas of memory on x86 require very specific protection flags, * for example the BIOS area or kernel text. Callers don't always get this @@ -402,19 +431,31 @@ static pgprotval_t protect_kernel_text_ro(unsigned long start, * checks and fixes these known static required protection bits. */ static inline pgprot_t static_protections(pgprot_t prot, unsigned long start, - unsigned long pfn, unsigned long npg) + unsigned long pfn, unsigned long npg, + int warnlvl) { - pgprotval_t forbidden; + pgprotval_t forbidden, res; unsigned long end; /* Operate on the virtual address */ end = start + npg * PAGE_SIZE - 1; - forbidden = protect_kernel_text(start, end); - forbidden |= protect_kernel_text_ro(start, end); + + res = protect_kernel_text(start, end); + check_conflict(warnlvl, prot, res, start, end, pfn, "Text NX"); + forbidden = res; + + res = protect_kernel_text_ro(start, end); + check_conflict(warnlvl, prot, res, start, end, pfn, "Text RO"); + forbidden |= res; /* Check the PFN directly */ - forbidden |= protect_pci_bios(pfn, pfn + npg - 1); - forbidden |= protect_rodata(pfn, pfn + npg - 1); + res = protect_pci_bios(pfn, pfn + npg - 1); + check_conflict(warnlvl, prot, res, start, end, pfn, "PCIBIOS NX"); + forbidden |= res; + + res = protect_rodata(pfn, pfn + npg - 1); + check_conflict(warnlvl, prot, res, start, end, pfn, "Rodata RO"); + forbidden |= res; return __pgprot(pgprot_val(prot) & ~forbidden); } @@ -686,10 +727,11 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address, * in it results in a different pgprot than the first one of the * requested range. If yes, then the page needs to be split. */ - new_prot = static_protections(req_prot, address, pfn, 1); + new_prot = static_protections(req_prot, address, pfn, 1, CPA_DETECT); pfn = old_pfn; for (i = 0, addr = lpaddr; i < numpages; i++, addr += PAGE_SIZE, pfn++) { - pgprot_t chk_prot = static_protections(req_prot, addr, pfn, 1); + pgprot_t chk_prot = static_protections(req_prot, addr, pfn, 1, + CPA_DETECT); if (pgprot_val(chk_prot) != pgprot_val(new_prot)) return 1; @@ -1299,7 +1341,8 @@ repeat: pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); - new_prot = static_protections(new_prot, address, pfn, 1); + new_prot = static_protections(new_prot, address, pfn, 1, + CPA_PROTECT); new_prot = pgprot_clear_protnone_bits(new_prot); -- cgit v1.2.3 From 5c280cf6081ff99078e28b51172d78359f194fd9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 17 Sep 2018 16:29:12 +0200 Subject: x86/mm/cpa: Add large page preservation statistics The large page preservation mechanism is just magic and provides no information at all. Add optional statistic output in debugfs so the magic can be evaluated. Defaults is off. Output: 1G pages checked: 2 1G pages sameprot: 0 1G pages preserved: 0 2M pages checked: 540 2M pages sameprot: 466 2M pages preserved: 47 4K pages checked: 800770 4K pages set-checked: 7668 Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Peter Zijlstra Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180917143546.160867778@linutronix.de --- arch/x86/Kconfig | 8 ++++ arch/x86/mm/pageattr.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 105 insertions(+), 2 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1a0be022f91d..65728bb1182c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1491,6 +1491,14 @@ config X86_DIRECT_GBPAGES supports them), so don't confuse the user by printing that we have them enabled. +config X86_CPA_STATISTICS + bool "Enable statistic for Change Page Attribute" + depends on DEBUG_FS + ---help--- + Expose statistics about the Change Page Attribute mechanims, which + helps to determine the effectivness of preserving large and huge + page mappings when mapping protections are changed. + config ARCH_HAS_MEM_ENCRYPT def_bool y diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1f2519055b30..cdf52eb86f3a 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -101,6 +101,95 @@ void arch_report_meminfo(struct seq_file *m) static inline void split_page_count(int level) { } #endif +#ifdef CONFIG_X86_CPA_STATISTICS + +static unsigned long cpa_1g_checked; +static unsigned long cpa_1g_sameprot; +static unsigned long cpa_1g_preserved; +static unsigned long cpa_2m_checked; +static unsigned long cpa_2m_sameprot; +static unsigned long cpa_2m_preserved; +static unsigned long cpa_4k_checked; +static unsigned long cpa_4k_install; + +static inline void cpa_inc_1g_checked(void) +{ + cpa_1g_checked++; +} + +static inline void cpa_inc_2m_checked(void) +{ + cpa_2m_checked++; +} + +static inline void cpa_inc_4k_checked(void) +{ + cpa_4k_checked++; +} + +static inline void cpa_inc_4k_install(void) +{ + cpa_4k_install++; +} + +static inline void cpa_inc_lp_sameprot(int level) +{ + if (level == PG_LEVEL_1G) + cpa_1g_sameprot++; + else + cpa_2m_sameprot++; +} + +static inline void cpa_inc_lp_preserved(int level) +{ + if (level == PG_LEVEL_1G) + cpa_1g_preserved++; + else + cpa_2m_preserved++; +} + +static int cpastats_show(struct seq_file *m, void *p) +{ + seq_printf(m, "1G pages checked: %16lu\n", cpa_1g_checked); + seq_printf(m, "1G pages sameprot: %16lu\n", cpa_1g_sameprot); + seq_printf(m, "1G pages preserved: %16lu\n", cpa_1g_preserved); + seq_printf(m, "2M pages checked: %16lu\n", cpa_2m_checked); + seq_printf(m, "2M pages sameprot: %16lu\n", cpa_2m_sameprot); + seq_printf(m, "2M pages preserved: %16lu\n", cpa_2m_preserved); + seq_printf(m, "4K pages checked: %16lu\n", cpa_4k_checked); + seq_printf(m, "4K pages set-checked: %16lu\n", cpa_4k_install); + return 0; +} + +static int cpastats_open(struct inode *inode, struct file *file) +{ + return single_open(file, cpastats_show, NULL); +} + +static const struct file_operations cpastats_fops = { + .open = cpastats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init cpa_stats_init(void) +{ + debugfs_create_file("cpa_stats", S_IRUSR, arch_debugfs_dir, NULL, + &cpastats_fops); + return 0; +} +late_initcall(cpa_stats_init); +#else +static inline void cpa_inc_1g_checked(void) { } +static inline void cpa_inc_2m_checked(void) { } +static inline void cpa_inc_4k_checked(void) { } +static inline void cpa_inc_4k_install(void) { } +static inline void cpa_inc_lp_sameprot(int level) { } +static inline void cpa_inc_lp_preserved(int level) { } +#endif + + static inline int within(unsigned long addr, unsigned long start, unsigned long end) { @@ -664,10 +753,12 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address, case PG_LEVEL_2M: old_prot = pmd_pgprot(*(pmd_t *)kpte); old_pfn = pmd_pfn(*(pmd_t *)kpte); + cpa_inc_2m_checked(); break; case PG_LEVEL_1G: old_prot = pud_pgprot(*(pud_t *)kpte); old_pfn = pud_pfn(*(pud_t *)kpte); + cpa_inc_1g_checked(); break; default: return -EINVAL; @@ -732,14 +823,16 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address, for (i = 0, addr = lpaddr; i < numpages; i++, addr += PAGE_SIZE, pfn++) { pgprot_t chk_prot = static_protections(req_prot, addr, pfn, 1, CPA_DETECT); - + cpa_inc_4k_checked(); if (pgprot_val(chk_prot) != pgprot_val(new_prot)) return 1; } /* If there are no changes, return. */ - if (pgprot_val(new_prot) == pgprot_val(old_prot)) + if (pgprot_val(new_prot) == pgprot_val(old_prot)) { + cpa_inc_lp_sameprot(level); return 0; + } /* * Verify that the address is aligned and the number of pages @@ -752,6 +845,7 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address, new_pte = pfn_pte(old_pfn, new_prot); __set_pmd_pte(kpte, address, new_pte); cpa->flags |= CPA_FLUSHTLB; + cpa_inc_lp_preserved(level); return 0; } @@ -1341,6 +1435,7 @@ repeat: pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); + cpa_inc_4k_install(); new_prot = static_protections(new_prot, address, pfn, 1, CPA_PROTECT); -- cgit v1.2.3 From 69c31e69df3d2efc4ad7f53d500fdd920d3865a4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 17 Sep 2018 16:29:13 +0200 Subject: x86/mm/cpa: Avoid static protection checks on unmap If the new pgprot has the PRESENT bit cleared, then conflicts vs. RW/NX are completely irrelevant. Before: 1G pages checked: 2 1G pages sameprot: 0 1G pages preserved: 0 2M pages checked: 540 2M pages sameprot: 466 2M pages preserved: 47 4K pages checked: 800770 4K pages set-checked: 7668 After: 1G pages checked: 2 1G pages sameprot: 0 1G pages preserved: 0 2M pages checked: 540 2M pages sameprot: 466 2M pages preserved: 47 4K pages checked: 800709 4K pages set-checked: 7668 Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Peter Zijlstra Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180917143546.245849757@linutronix.de --- arch/x86/mm/pageattr.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index cdf52eb86f3a..8f9083eb21ac 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -526,6 +526,13 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long start, pgprotval_t forbidden, res; unsigned long end; + /* + * There is no point in checking RW/NX conflicts when the requested + * mapping is setting the page !PRESENT. + */ + if (!(pgprot_val(prot) & _PAGE_PRESENT)) + return prot; + /* Operate on the virtual address */ end = start + npg * PAGE_SIZE - 1; -- cgit v1.2.3 From f61c5ba2885eabc7bc4b0b2f5232f475216ba446 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 17 Sep 2018 16:29:14 +0200 Subject: x86/mm/cpa: Add sanity check for existing mappings With the range check it is possible to do a quick verification that the current mapping is correct vs. the static protection areas. In case a incorrect mapping is detected a warning is emitted and the large page is split up. If the large page is a 2M page, then the split code is forced to check the static protections for the PTE entries to fix up the incorrectness. For 1G pages this can't be done easily because that would require to either find the offending 2M areas before the split or afterwards. For now just warn about that case and revisit it when reported. Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Peter Zijlstra Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180917143546.331408643@linutronix.de --- arch/x86/mm/pageattr.c | 77 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 67 insertions(+), 10 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 8f9083eb21ac..19781b0ab4b4 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -37,12 +37,14 @@ struct cpa_data { unsigned long numpages; int flags; unsigned long pfn; - unsigned force_split : 1; + unsigned force_split : 1, + force_static_prot : 1; int curpage; struct page **pages; }; enum cpa_warn { + CPA_CONFLICT, CPA_PROTECT, CPA_DETECT, }; @@ -501,6 +503,7 @@ static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val, unsigned long pfn, const char *txt) { static const char *lvltxt[] = { + [CPA_CONFLICT] = "conflict", [CPA_PROTECT] = "protect", [CPA_DETECT] = "detect", }; @@ -743,7 +746,7 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address, struct cpa_data *cpa) { unsigned long numpages, pmask, psize, lpaddr, addr, pfn, old_pfn; - pgprot_t old_prot, new_prot, req_prot; + pgprot_t old_prot, new_prot, req_prot, chk_prot; pte_t new_pte, old_pte, *tmp; enum pg_level level; int i; @@ -819,6 +822,23 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address, lpaddr = address & pmask; numpages = psize >> PAGE_SHIFT; + /* + * Sanity check that the existing mapping is correct versus the static + * protections. static_protections() guards against !PRESENT, so no + * extra conditional required here. + */ + chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages, + CPA_CONFLICT); + + if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) { + /* + * Split the large page and tell the split code to + * enforce static protections. + */ + cpa->force_static_prot = 1; + return 1; + } + /* * Make sure that the requested pgprot does not violate the static * protections. Check the full large page whether one of the pages @@ -828,8 +848,8 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address, new_prot = static_protections(req_prot, address, pfn, 1, CPA_DETECT); pfn = old_pfn; for (i = 0, addr = lpaddr; i < numpages; i++, addr += PAGE_SIZE, pfn++) { - pgprot_t chk_prot = static_protections(req_prot, addr, pfn, 1, - CPA_DETECT); + chk_prot = static_protections(req_prot, addr, pfn, 1, + CPA_DETECT); cpa_inc_4k_checked(); if (pgprot_val(chk_prot) != pgprot_val(new_prot)) return 1; @@ -871,15 +891,50 @@ static int should_split_large_page(pte_t *kpte, unsigned long address, return do_split; } +static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn, + pgprot_t ref_prot, unsigned long address, + unsigned long size) +{ + unsigned int npg = PFN_DOWN(size); + pgprot_t prot; + + /* + * If should_split_large_page() discovered an inconsistent mapping, + * remove the invalid protection in the split mapping. + */ + if (!cpa->force_static_prot) + goto set; + + prot = static_protections(ref_prot, address, pfn, npg, CPA_PROTECT); + + if (pgprot_val(prot) == pgprot_val(ref_prot)) + goto set; + + /* + * If this is splitting a PMD, fix it up. PUD splits cannot be + * fixed trivially as that would require to rescan the newly + * installed PMD mappings after returning from split_large_page() + * so an eventual further split can allocate the necessary PTE + * pages. Warn for now and revisit it in case this actually + * happens. + */ + if (size == PAGE_SIZE) + ref_prot = prot; + else + pr_warn_once("CPA: Cannot fixup static protections for PUD split\n"); +set: + set_pte(pte, pfn_pte(pfn, ref_prot)); +} + static int __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, struct page *base) { + unsigned long lpaddr, lpinc, ref_pfn, pfn, pfninc = 1; pte_t *pbase = (pte_t *)page_address(base); - unsigned long ref_pfn, pfn, pfninc = 1; unsigned int i, level; - pte_t *tmp; pgprot_t ref_prot; + pte_t *tmp; spin_lock(&pgd_lock); /* @@ -902,15 +957,17 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, * PAT bit to correct position. */ ref_prot = pgprot_large_2_4k(ref_prot); - ref_pfn = pmd_pfn(*(pmd_t *)kpte); + lpaddr = address & PMD_MASK; + lpinc = PAGE_SIZE; break; case PG_LEVEL_1G: ref_prot = pud_pgprot(*(pud_t *)kpte); ref_pfn = pud_pfn(*(pud_t *)kpte); pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; - + lpaddr = address & PUD_MASK; + lpinc = PMD_SIZE; /* * Clear the PSE flags if the PRESENT flag is not set * otherwise pmd_present/pmd_huge will return true @@ -931,8 +988,8 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, * Get the target pfn from the original entry: */ pfn = ref_pfn; - for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) - set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); + for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc, lpaddr += lpinc) + split_set_pte(cpa, pbase + i, pfn, ref_prot, lpaddr, lpinc); if (virt_addr_valid(address)) { unsigned long pfn = PFN_DOWN(__pa(address)); -- cgit v1.2.3 From 1c4b406ee89c2c4210f2e19b97d39215f445c316 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 17 Sep 2018 16:29:15 +0200 Subject: x86/mm/cpa: Optimize same protection check When the existing mapping is correct and the new requested page protections are the same as the existing ones, then further checks can be omitted and the large page can be preserved. The slow path 4k wise check will not come up with a different result. Before: 1G pages checked: 2 1G pages sameprot: 0 1G pages preserved: 0 2M pages checked: 540 2M pages sameprot: 466 2M pages preserved: 47 4K pages checked: 800709 4K pages set-checked: 7668 After: 1G pages checked: 2 1G pages sameprot: 0 1G pages preserved: 0 2M pages checked: 538 2M pages sameprot: 466 2M pages preserved: 47 4K pages checked: 560642 4K pages set-checked: 7668 Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Peter Zijlstra Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180917143546.424477581@linutronix.de --- arch/x86/mm/pageattr.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 19781b0ab4b4..5160334f9095 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -839,6 +839,20 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address, return 1; } + /* + * Optimization: If the requested pgprot is the same as the current + * pgprot, then the large page can be preserved and no updates are + * required independent of alignment and length of the requested + * range. The above already established that the current pgprot is + * correct, which in consequence makes the requested pgprot correct + * as well if it is the same. The static protection scan below will + * not come to a different conclusion. + */ + if (pgprot_val(req_prot) == pgprot_val(old_prot)) { + cpa_inc_lp_sameprot(level); + return 0; + } + /* * Make sure that the requested pgprot does not violate the static * protections. Check the full large page whether one of the pages -- cgit v1.2.3 From 9cc9f17a5a0a8564b41b7c5c460e7f078c42d712 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 17 Sep 2018 16:29:16 +0200 Subject: x86/mm/cpa: Do the range check early To avoid excessive 4k wise checks in the common case do a quick check first whether the requested new page protections conflict with a static protection area in the large page. If there is no conflict then the decision whether to preserve or to split the page can be made immediately. If the requested range covers the full large page, preserve it. Otherwise split it up. No point in doing a slow crawl in 4k steps. Before: 1G pages checked: 2 1G pages sameprot: 0 1G pages preserved: 0 2M pages checked: 538 2M pages sameprot: 466 2M pages preserved: 47 4K pages checked: 560642 4K pages set-checked: 7668 After: 1G pages checked: 2 1G pages sameprot: 0 1G pages preserved: 0 2M pages checked: 541 2M pages sameprot: 466 2M pages preserved: 47 4K pages checked: 514 4K pages set-checked: 7668 Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Peter Zijlstra Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180917143546.507259989@linutronix.de --- arch/x86/mm/pageattr.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 5160334f9095..bbc5eb5cbc9d 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -854,10 +854,28 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address, } /* - * Make sure that the requested pgprot does not violate the static - * protections. Check the full large page whether one of the pages - * in it results in a different pgprot than the first one of the - * requested range. If yes, then the page needs to be split. + * Optimization: Check whether the requested pgprot is conflicting + * with a static protection requirement in the large page. If not, + * then checking whether the requested range is fully covering the + * large page can be done right here. + */ + new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages, + CPA_DETECT); + + if (pgprot_val(req_prot) == pgprot_val(new_prot)) { + if (address != lpaddr || cpa->numpages != numpages) + return 1; + goto setlp; + } + + /* + * Slow path. The full large page check above established that the + * requested pgprot cannot be applied to the full large page due to + * conflicting requirements of static protection regions. It might + * turn out that the whole requested range is covered by the + * modified protection of the first 4k segment at @address. This + * might result in the ability to preserve the large page + * nevertheless. */ new_prot = static_protections(req_prot, address, pfn, 1, CPA_DETECT); pfn = old_pfn; @@ -882,6 +900,7 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address, if (address != lpaddr || cpa->numpages != numpages) return 1; +setlp: /* All checks passed. Update the large page mapping. */ new_pte = pfn_pte(old_pfn, new_prot); __set_pmd_pte(kpte, address, new_pte); -- cgit v1.2.3 From 585948f4f695b07204702cfee0f828424af32aa7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 17 Sep 2018 16:29:17 +0200 Subject: x86/mm/cpa: Avoid the 4k pages check completely The extra loop which tries hard to preserve large pages in case of conflicts with static protection regions turns out to be not preserving anything, at least not in the experiments which have been conducted. There might be corner cases in which the code would be able to preserve a large page oaccsionally, but it's really not worth the extra code and the cycles wasted in the common case. Before: 1G pages checked: 2 1G pages sameprot: 0 1G pages preserved: 0 2M pages checked: 541 2M pages sameprot: 466 2M pages preserved: 47 4K pages checked: 514 4K pages set-checked: 7668 After: 1G pages checked: 2 1G pages sameprot: 0 1G pages preserved: 0 2M pages checked: 538 2M pages sameprot: 466 2M pages preserved: 47 4K pages set-checked: 7668 Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Peter Zijlstra Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180917143546.589642503@linutronix.de --- arch/x86/mm/pageattr.c | 64 +++++++++++++------------------------------------- 1 file changed, 16 insertions(+), 48 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index bbc5eb5cbc9d..4e55ded01be5 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -111,7 +111,6 @@ static unsigned long cpa_1g_preserved; static unsigned long cpa_2m_checked; static unsigned long cpa_2m_sameprot; static unsigned long cpa_2m_preserved; -static unsigned long cpa_4k_checked; static unsigned long cpa_4k_install; static inline void cpa_inc_1g_checked(void) @@ -124,11 +123,6 @@ static inline void cpa_inc_2m_checked(void) cpa_2m_checked++; } -static inline void cpa_inc_4k_checked(void) -{ - cpa_4k_checked++; -} - static inline void cpa_inc_4k_install(void) { cpa_4k_install++; @@ -158,7 +152,6 @@ static int cpastats_show(struct seq_file *m, void *p) seq_printf(m, "2M pages checked: %16lu\n", cpa_2m_checked); seq_printf(m, "2M pages sameprot: %16lu\n", cpa_2m_sameprot); seq_printf(m, "2M pages preserved: %16lu\n", cpa_2m_preserved); - seq_printf(m, "4K pages checked: %16lu\n", cpa_4k_checked); seq_printf(m, "4K pages set-checked: %16lu\n", cpa_4k_install); return 0; } @@ -185,7 +178,6 @@ late_initcall(cpa_stats_init); #else static inline void cpa_inc_1g_checked(void) { } static inline void cpa_inc_2m_checked(void) { } -static inline void cpa_inc_4k_checked(void) { } static inline void cpa_inc_4k_install(void) { } static inline void cpa_inc_lp_sameprot(int level) { } static inline void cpa_inc_lp_preserved(int level) { } @@ -745,11 +737,10 @@ static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot) static int __should_split_large_page(pte_t *kpte, unsigned long address, struct cpa_data *cpa) { - unsigned long numpages, pmask, psize, lpaddr, addr, pfn, old_pfn; + unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn; pgprot_t old_prot, new_prot, req_prot, chk_prot; pte_t new_pte, old_pte, *tmp; enum pg_level level; - int i; /* * Check for races, another CPU might have split this page @@ -854,53 +845,30 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address, } /* - * Optimization: Check whether the requested pgprot is conflicting - * with a static protection requirement in the large page. If not, - * then checking whether the requested range is fully covering the - * large page can be done right here. + * If the requested range does not cover the full page, split it up */ - new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages, - CPA_DETECT); - - if (pgprot_val(req_prot) == pgprot_val(new_prot)) { - if (address != lpaddr || cpa->numpages != numpages) - return 1; - goto setlp; - } + if (address != lpaddr || cpa->numpages != numpages) + return 1; /* - * Slow path. The full large page check above established that the - * requested pgprot cannot be applied to the full large page due to - * conflicting requirements of static protection regions. It might - * turn out that the whole requested range is covered by the - * modified protection of the first 4k segment at @address. This - * might result in the ability to preserve the large page - * nevertheless. + * Check whether the requested pgprot is conflicting with a static + * protection requirement in the large page. */ - new_prot = static_protections(req_prot, address, pfn, 1, CPA_DETECT); - pfn = old_pfn; - for (i = 0, addr = lpaddr; i < numpages; i++, addr += PAGE_SIZE, pfn++) { - chk_prot = static_protections(req_prot, addr, pfn, 1, - CPA_DETECT); - cpa_inc_4k_checked(); - if (pgprot_val(chk_prot) != pgprot_val(new_prot)) - return 1; - } - - /* If there are no changes, return. */ - if (pgprot_val(new_prot) == pgprot_val(old_prot)) { - cpa_inc_lp_sameprot(level); - return 0; - } + new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages, + CPA_DETECT); /* - * Verify that the address is aligned and the number of pages - * covers the full page. + * If there is a conflict, split the large page. + * + * There used to be a 4k wise evaluation trying really hard to + * preserve the large pages, but experimentation has shown, that this + * does not help at all. There might be corner cases which would + * preserve one large page occasionally, but it's really not worth the + * extra code and cycles for the common case. */ - if (address != lpaddr || cpa->numpages != numpages) + if (pgprot_val(req_prot) != pgprot_val(new_prot)) return 1; -setlp: /* All checks passed. Update the large page mapping. */ new_pte = pfn_pte(old_pfn, new_prot); __set_pmd_pte(kpte, address, new_pte); -- cgit v1.2.3 From c6185b1f21a47af94617fde3af7e803817b522a9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 19 Sep 2018 10:50:17 +0200 Subject: x86/mm/cpa: Use flush_tlb_all() Instead of open-coding it.. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180919085947.831102058@infradead.org --- arch/x86/mm/pageattr.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 4e55ded01be5..a22f6b71a308 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -285,16 +285,6 @@ static void cpa_flush_all(unsigned long cache) on_each_cpu(__cpa_flush_all, (void *) cache, 1); } -static void __cpa_flush_range(void *arg) -{ - /* - * We could optimize that further and do individual per page - * tlb invalidates for a low number of pages. Caveat: we must - * flush the high aliases on 64bit as well. - */ - __flush_tlb_all(); -} - static void cpa_flush_range(unsigned long start, int numpages, int cache) { unsigned int i, level; @@ -303,7 +293,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); WARN_ON(PAGE_ALIGN(start) != start); - on_each_cpu(__cpa_flush_range, NULL, 1); + flush_tlb_all(); if (!cache) return; -- cgit v1.2.3 From c0a759abf5a686a37b9204c13b7e281fe516c8f0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 19 Sep 2018 10:50:18 +0200 Subject: x86/mm/cpa: Move flush_tlb_all() There is an atom errata, where we do a local TLB invalidate right before we return and then do a global TLB invalidate. Move the global invalidate up a little bit and avoid the local invalidate entirely. This does put the global invalidate under pgd_lock, but that shouldn't matter. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180919085947.882287392@infradead.org --- arch/x86/mm/pageattr.c | 44 +++++++++++++++++--------------------------- 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a22f6b71a308..b6a4c638f086 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -999,14 +999,24 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE))); /* - * Intel Atom errata AAH41 workaround. + * Do a global flush tlb after splitting the large page + * and before we do the actual change page attribute in the PTE. * - * The real fix should be in hw or in a microcode update, but - * we also probabilistically try to reduce the window of having - * a large TLB mixed with 4K TLBs while instruction fetches are - * going on. + * Without this, we violate the TLB application note, that says: + * "The TLBs may contain both ordinary and large-page + * translations for a 4-KByte range of linear addresses. This + * may occur if software modifies the paging structures so that + * the page size used for the address range changes. If the two + * translations differ with respect to page frame or attributes + * (e.g., permissions), processor behavior is undefined and may + * be implementation-specific." + * + * We do this global tlb flush inside the cpa_lock, so that we + * don't allow any other cpu, with stale tlb entries change the + * page attribute in parallel, that also falls into the + * just split large page entry. */ - __flush_tlb_all(); + flush_tlb_all(); spin_unlock(&pgd_lock); return 0; @@ -1531,28 +1541,8 @@ repeat: * We have to split the large page: */ err = split_large_page(cpa, kpte, address); - if (!err) { - /* - * Do a global flush tlb after splitting the large page - * and before we do the actual change page attribute in the PTE. - * - * With out this, we violate the TLB application note, that says - * "The TLBs may contain both ordinary and large-page - * translations for a 4-KByte range of linear addresses. This - * may occur if software modifies the paging structures so that - * the page size used for the address range changes. If the two - * translations differ with respect to page frame or attributes - * (e.g., permissions), processor behavior is undefined and may - * be implementation-specific." - * - * We do this global tlb flush inside the cpa_lock, so that we - * don't allow any other cpu, with stale tlb entries change the - * page attribute in parallel, that also falls into the - * just split large page entry. - */ - flush_tlb_all(); + if (!err) goto repeat; - } return err; } -- cgit v1.2.3 From ddd07b750382adc2b78fdfbec47af8a6e0d8ef37 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 19 Sep 2018 10:50:19 +0200 Subject: x86/mm/cpa: Unconditionally avoid WBINDV when we can CAT has happened, WBINDV is bad (even before CAT blowing away the entire cache on a multi-core platform wasn't nice), try not to use it ever. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180919085947.933674526@infradead.org --- arch/x86/mm/pageattr.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index b6a4c638f086..bd9b0ac07352 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -319,26 +319,12 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache, int in_flags, struct page **pages) { unsigned int i, level; -#ifdef CONFIG_PREEMPT - /* - * Avoid wbinvd() because it causes latencies on all CPUs, - * regardless of any CPU isolation that may be in effect. - * - * This should be extended for CAT enabled systems independent of - * PREEMPT because wbinvd() does not respect the CAT partitions and - * this is exposed to unpriviledged users through the graphics - * subsystem. - */ - unsigned long do_wbinvd = 0; -#else - unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */ -#endif BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); - on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1); + flush_tlb_all(); - if (!cache || do_wbinvd) + if (!cache) return; /* -- cgit v1.2.3 From a7295fd53c39ce781a9792c9dd2c8747bf274160 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 19 Sep 2018 10:50:20 +0200 Subject: x86/mm/cpa: Use flush_tlb_kernel_range() Both cpa_flush_range() and cpa_flush_array() have a well specified range, use that to do a range based TLB invalidate. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180919085947.985193217@infradead.org --- arch/x86/mm/pageattr.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index bd9b0ac07352..02eb18403594 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -293,7 +293,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); WARN_ON(PAGE_ALIGN(start) != start); - flush_tlb_all(); + flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages); if (!cache) return; @@ -315,14 +315,15 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) } } -static void cpa_flush_array(unsigned long *start, int numpages, int cache, +static void cpa_flush_array(unsigned long baddr, unsigned long *start, + int numpages, int cache, int in_flags, struct page **pages) { unsigned int i, level; BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); - flush_tlb_all(); + flush_tlb_kernel_range(baddr, baddr + PAGE_SIZE * numpages); if (!cache) return; @@ -1757,7 +1758,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, */ if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) { if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { - cpa_flush_array(addr, numpages, cache, + cpa_flush_array(baddr, addr, numpages, cache, cpa.flags, pages); } else cpa_flush_range(baddr, numpages, cache); -- cgit v1.2.3 From 5f464b33b17219a233af1267c621632225bc7acc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 19 Sep 2018 10:50:21 +0200 Subject: x86/mm/cpa: Move CLFLUSH test into cpa_flush_range() Rather than guarding all cpa_flush_range() uses with a CLFLUSH test, put it inside. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180919085948.036195503@infradead.org --- arch/x86/mm/pageattr.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 02eb18403594..3cc4a2ae4dbb 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -293,6 +293,11 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); WARN_ON(PAGE_ALIGN(start) != start); + if (!static_cpu_has(X86_FEATURE_CLFLUSH)) { + cpa_flush_all(cache); + return; + } + flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages); if (!cache) @@ -2078,10 +2083,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) /* * Before changing the encryption attribute, we need to flush caches. */ - if (static_cpu_has(X86_FEATURE_CLFLUSH)) - cpa_flush_range(start, numpages, 1); - else - cpa_flush_all(1); + cpa_flush_range(start, numpages, 1); ret = __change_page_attr_set_clr(&cpa, 1); @@ -2092,10 +2094,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) * in case TLB flushing gets optimized in the cpa_flush_range() * path use the same logic as above. */ - if (static_cpu_has(X86_FEATURE_CLFLUSH)) - cpa_flush_range(start, numpages, 0); - else - cpa_flush_all(0); + cpa_flush_range(start, numpages, 0); return ret; } -- cgit v1.2.3 From fce2ce9544e9f098ba828442221ce99c2a5ecb0f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 19 Sep 2018 10:50:22 +0200 Subject: x86/mm/cpa: Move CLFLUSH test into cpa_flush_array() Rather than guarding cpa_flush_array() users with a CLFLUSH test, put it inside. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180919085948.087848187@infradead.org --- arch/x86/mm/pageattr.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 3cc4a2ae4dbb..33d89d505f93 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -328,6 +328,11 @@ static void cpa_flush_array(unsigned long baddr, unsigned long *start, BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); + if (!static_cpu_has(X86_FEATURE_CLFLUSH)) { + cpa_flush_all(cache); + return; + } + flush_tlb_kernel_range(baddr, baddr + PAGE_SIZE * numpages); if (!cache) @@ -1756,19 +1761,19 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, cache = !!pgprot2cachemode(mask_set); /* - * On success we use CLFLUSH, when the CPU supports it to - * avoid the WBINVD. If the CPU does not support it and in the - * error case we fall back to cpa_flush_all (which uses - * WBINVD): + * On error; flush everything to be sure. */ - if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) { - if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { - cpa_flush_array(baddr, addr, numpages, cache, - cpa.flags, pages); - } else - cpa_flush_range(baddr, numpages, cache); - } else + if (ret) { cpa_flush_all(cache); + goto out; + } + + if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { + cpa_flush_array(baddr, addr, numpages, cache, + cpa.flags, pages); + } else { + cpa_flush_range(baddr, numpages, cache); + } out: return ret; -- cgit v1.2.3 From 47e262ac5b84015c4a101ff51767c464fb7497a6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 19 Sep 2018 10:50:23 +0200 Subject: x86/mm/cpa: Factor common code between cpa_flush_*() The start of cpa_flush_range() and cpa_flush_array() is the same, use a common function. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180919085948.138859183@infradead.org --- arch/x86/mm/pageattr.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 33d89d505f93..dc552824e86a 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -285,22 +285,28 @@ static void cpa_flush_all(unsigned long cache) on_each_cpu(__cpa_flush_all, (void *) cache, 1); } -static void cpa_flush_range(unsigned long start, int numpages, int cache) +static bool __cpa_flush_range(unsigned long start, int numpages, int cache) { - unsigned int i, level; - unsigned long addr; - BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); + WARN_ON(PAGE_ALIGN(start) != start); if (!static_cpu_has(X86_FEATURE_CLFLUSH)) { cpa_flush_all(cache); - return; + return true; } flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages); - if (!cache) + return !cache; +} + +static void cpa_flush_range(unsigned long start, int numpages, int cache) +{ + unsigned int i, level; + unsigned long addr; + + if (__cpa_flush_range(start, numpages, cache)) return; /* @@ -326,16 +332,7 @@ static void cpa_flush_array(unsigned long baddr, unsigned long *start, { unsigned int i, level; - BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); - - if (!static_cpu_has(X86_FEATURE_CLFLUSH)) { - cpa_flush_all(cache); - return; - } - - flush_tlb_kernel_range(baddr, baddr + PAGE_SIZE * numpages); - - if (!cache) + if (__cpa_flush_range(baddr, numpages, cache)) return; /* -- cgit v1.2.3 From 7904ba8a66f400182a204893c92098994e22a88d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 19 Sep 2018 10:50:24 +0200 Subject: x86/mm/cpa: Optimize __cpa_flush_range() If we IPI for WBINDV, then we might as well kill the entire TLB too. But if we don't have to invalidate cache, there is no reason not to use a range TLB flush. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180919085948.195633798@infradead.org --- arch/x86/mm/pageattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index dc552824e86a..62bb30b4bd2a 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -291,7 +291,7 @@ static bool __cpa_flush_range(unsigned long start, int numpages, int cache) WARN_ON(PAGE_ALIGN(start) != start); - if (!static_cpu_has(X86_FEATURE_CLFLUSH)) { + if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) { cpa_flush_all(cache); return true; } -- cgit v1.2.3 From b3541fbc3c25bf2ba6d03a450c07f824e318f9b9 Mon Sep 17 00:00:00 2001 From: Takuya Yamamoto Date: Wed, 29 Aug 2018 16:27:30 +0900 Subject: x86/mm: Fix typo in comment Signed-off-by: Takuya Yamamoto Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20180829072730.988-1-tkyymmt01@gmail.com --- arch/x86/mm/extable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 45f5d6cf65ae..9ceb940334cd 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -230,7 +230,7 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr) * result in a hard-to-debug panic. * * Keep in mind that not all vectors actually get here. Early - * fage faults, for example, are special. + * page faults, for example, are special. */ if (fixup_exception(regs, trapnr)) return; -- cgit v1.2.3 From c3a7a61c192ec350330128edb13db33a9bc0ace1 Mon Sep 17 00:00:00 2001 From: Lianbo Jiang Date: Thu, 27 Sep 2018 15:19:51 +0800 Subject: x86/ioremap: Add an ioremap_encrypted() helper When SME is enabled, the memory is encrypted in the first kernel. In this case, SME also needs to be enabled in the kdump kernel, and we have to remap the old memory with the memory encryption mask. The case of concern here is if SME is active in the first kernel, and it is active too in the kdump kernel. There are four cases to be considered: a. dump vmcore It is encrypted in the first kernel, and needs be read out in the kdump kernel. b. crash notes When dumping vmcore, the people usually need to read useful information from notes, and the notes is also encrypted. c. iommu device table It's encrypted in the first kernel, kdump kernel needs to access its content to analyze and get information it needs. d. mmio of AMD iommu not encrypted in both kernels Add a new bool parameter @encrypted to __ioremap_caller(). If set, memory will be remapped with the SME mask. Add a new function ioremap_encrypted() to explicitly pass in a true value for @encrypted. Use ioremap_encrypted() for the above a, b, c cases. [ bp: cleanup commit message, extern defs in io.h and drop forgotten include. ] Signed-off-by: Lianbo Jiang Signed-off-by: Borislav Petkov Reviewed-by: Tom Lendacky Cc: kexec@lists.infradead.org Cc: tglx@linutronix.de Cc: mingo@redhat.com Cc: hpa@zytor.com Cc: akpm@linux-foundation.org Cc: dan.j.williams@intel.com Cc: bhelgaas@google.com Cc: baiyaowei@cmss.chinamobile.com Cc: tiwai@suse.de Cc: brijesh.singh@amd.com Cc: dyoung@redhat.com Cc: bhe@redhat.com Cc: jroedel@suse.de Link: https://lkml.kernel.org/r/20180927071954.29615-2-lijiang@redhat.com --- arch/x86/include/asm/io.h | 3 ++- arch/x86/mm/ioremap.c | 24 ++++++++++++++++-------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 6de64840dd22..6df53efcecfd 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -187,11 +187,12 @@ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size) #define ioremap_nocache ioremap_nocache extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); #define ioremap_uc ioremap_uc - extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); #define ioremap_cache ioremap_cache extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val); #define ioremap_prot ioremap_prot +extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size); +#define ioremap_encrypted ioremap_encrypted /** * ioremap - map bus memory into CPU space diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index c63a545ec199..24e0920a9b25 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -131,7 +131,8 @@ static void __ioremap_check_mem(resource_size_t addr, unsigned long size, * caller shouldn't need to know that small detail. */ static void __iomem *__ioremap_caller(resource_size_t phys_addr, - unsigned long size, enum page_cache_mode pcm, void *caller) + unsigned long size, enum page_cache_mode pcm, + void *caller, bool encrypted) { unsigned long offset, vaddr; resource_size_t last_addr; @@ -199,7 +200,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, * resulting mapping. */ prot = PAGE_KERNEL_IO; - if (sev_active() && mem_flags.desc_other) + if ((sev_active() && mem_flags.desc_other) || encrypted) prot = pgprot_encrypted(prot); switch (pcm) { @@ -291,7 +292,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS; return __ioremap_caller(phys_addr, size, pcm, - __builtin_return_address(0)); + __builtin_return_address(0), false); } EXPORT_SYMBOL(ioremap_nocache); @@ -324,7 +325,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size) enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC; return __ioremap_caller(phys_addr, size, pcm, - __builtin_return_address(0)); + __builtin_return_address(0), false); } EXPORT_SYMBOL_GPL(ioremap_uc); @@ -341,7 +342,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc); void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size) { return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC, - __builtin_return_address(0)); + __builtin_return_address(0), false); } EXPORT_SYMBOL(ioremap_wc); @@ -358,14 +359,21 @@ EXPORT_SYMBOL(ioremap_wc); void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size) { return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT, - __builtin_return_address(0)); + __builtin_return_address(0), false); } EXPORT_SYMBOL(ioremap_wt); +void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size) +{ + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB, + __builtin_return_address(0), true); +} +EXPORT_SYMBOL(ioremap_encrypted); + void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) { return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB, - __builtin_return_address(0)); + __builtin_return_address(0), false); } EXPORT_SYMBOL(ioremap_cache); @@ -374,7 +382,7 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, { return __ioremap_caller(phys_addr, size, pgprot2cachemode(__pgprot(prot_val)), - __builtin_return_address(0)); + __builtin_return_address(0), false); } EXPORT_SYMBOL(ioremap_prot); -- cgit v1.2.3 From 9cf38d5559e813cccdba8b44c82cc46ba48d0896 Mon Sep 17 00:00:00 2001 From: Lianbo Jiang Date: Sun, 30 Sep 2018 11:10:31 +0800 Subject: kexec: Allocate decrypted control pages for kdump if SME is enabled When SME is enabled in the first kernel, it needs to allocate decrypted pages for kdump because when the kdump kernel boots, these pages need to be accessed decrypted in the initial boot stage, before SME is enabled. [ bp: clean up text. ] Signed-off-by: Lianbo Jiang Signed-off-by: Borislav Petkov Reviewed-by: Tom Lendacky Cc: kexec@lists.infradead.org Cc: tglx@linutronix.de Cc: mingo@redhat.com Cc: hpa@zytor.com Cc: akpm@linux-foundation.org Cc: dan.j.williams@intel.com Cc: bhelgaas@google.com Cc: baiyaowei@cmss.chinamobile.com Cc: tiwai@suse.de Cc: brijesh.singh@amd.com Cc: dyoung@redhat.com Cc: bhe@redhat.com Cc: jroedel@suse.de Link: https://lkml.kernel.org/r/20180930031033.22110-3-lijiang@redhat.com --- kernel/kexec_core.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 23a83a4da38a..86ef06d3dbe3 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -471,6 +471,10 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, } } + /* Ensure that these pages are decrypted if SME is enabled. */ + if (pages) + arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0); + return pages; } @@ -867,6 +871,7 @@ static int kimage_load_crash_segment(struct kimage *image, result = -ENOMEM; goto out; } + arch_kexec_post_alloc_pages(page_address(page), 1, 0); ptr = kmap(page); ptr += maddr & ~PAGE_MASK; mchunk = min_t(size_t, mbytes, @@ -884,6 +889,7 @@ static int kimage_load_crash_segment(struct kimage *image, result = copy_from_user(ptr, buf, uchunk); kexec_flush_icache_page(page); kunmap(page); + arch_kexec_pre_free_pages(page_address(page), 1); if (result) { result = -EFAULT; goto out; -- cgit v1.2.3 From 8780158cf977ea5f9912931a30b3d575b36dba22 Mon Sep 17 00:00:00 2001 From: Lianbo Jiang Date: Sun, 30 Sep 2018 11:10:32 +0800 Subject: iommu/amd: Remap the IOMMU device table with the memory encryption mask for kdump The kdump kernel copies the IOMMU device table from the old device table which is encrypted when SME is enabled in the first kernel. So remap the old device table with the memory encryption mask in the kdump kernel. [ bp: Massage commit message. ] Signed-off-by: Lianbo Jiang Signed-off-by: Borislav Petkov Reviewed-by: Tom Lendacky Acked-by: Joerg Roedel Cc: kexec@lists.infradead.org Cc: tglx@linutronix.de Cc: mingo@redhat.com Cc: hpa@zytor.com Cc: akpm@linux-foundation.org Cc: dan.j.williams@intel.com Cc: bhelgaas@google.com Cc: baiyaowei@cmss.chinamobile.com Cc: tiwai@suse.de Cc: brijesh.singh@amd.com Cc: dyoung@redhat.com Cc: bhe@redhat.com Link: https://lkml.kernel.org/r/20180930031033.22110-4-lijiang@redhat.com --- drivers/iommu/amd_iommu_init.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 84b3e4445d46..3931c7de7c69 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -902,12 +902,22 @@ static bool copy_device_table(void) } } - old_devtb_phys = entry & PAGE_MASK; + /* + * When SME is enabled in the first kernel, the entry includes the + * memory encryption mask(sme_me_mask), we must remove the memory + * encryption mask to obtain the true physical address in kdump kernel. + */ + old_devtb_phys = __sme_clr(entry) & PAGE_MASK; + if (old_devtb_phys >= 0x100000000ULL) { pr_err("The address of old device table is above 4G, not trustworthy!\n"); return false; } - old_devtb = memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB); + old_devtb = (sme_active() && is_kdump_kernel()) + ? (__force void *)ioremap_encrypted(old_devtb_phys, + dev_table_size) + : memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB); + if (!old_devtb) return false; -- cgit v1.2.3 From 992b649a3f013465d8128da02e5449def662a4c3 Mon Sep 17 00:00:00 2001 From: Lianbo Jiang Date: Sun, 30 Sep 2018 16:37:41 +0800 Subject: kdump, proc/vmcore: Enable kdumping encrypted memory with SME enabled In the kdump kernel, the memory of the first kernel needs to be dumped into the vmcore file. If SME is enabled in the first kernel, the old memory has to be remapped with the memory encryption mask in order to access it properly. Split copy_oldmem_page() functionality to handle encrypted memory properly. [ bp: Heavily massage everything. ] Signed-off-by: Lianbo Jiang Signed-off-by: Borislav Petkov Cc: kexec@lists.infradead.org Cc: tglx@linutronix.de Cc: mingo@redhat.com Cc: hpa@zytor.com Cc: akpm@linux-foundation.org Cc: dan.j.williams@intel.com Cc: bhelgaas@google.com Cc: baiyaowei@cmss.chinamobile.com Cc: tiwai@suse.de Cc: brijesh.singh@amd.com Cc: dyoung@redhat.com Cc: bhe@redhat.com Cc: jroedel@suse.de Link: https://lkml.kernel.org/r/be7b47f9-6be6-e0d1-2c2a-9125bc74b818@redhat.com --- arch/x86/kernel/crash_dump_64.c | 60 ++++++++++++++++++++++++++++------------- fs/proc/vmcore.c | 24 ++++++++++++----- include/linux/crash_dump.h | 4 +++ 3 files changed, 63 insertions(+), 25 deletions(-) diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 4f2e0778feac..eb8ab3915268 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -11,40 +11,62 @@ #include #include -/** - * copy_oldmem_page - copy one page from "oldmem" - * @pfn: page frame number to be copied - * @buf: target memory address for the copy; this can be in kernel address - * space or user address space (see @userbuf) - * @csize: number of bytes to copy - * @offset: offset in bytes into the page (based on pfn) to begin the copy - * @userbuf: if set, @buf is in user address space, use copy_to_user(), - * otherwise @buf is in kernel address space, use memcpy(). - * - * Copy a page from "oldmem". For this page, there is no pte mapped - * in the current kernel. We stitch up a pte, similar to kmap_atomic. - */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, int userbuf) +static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, + unsigned long offset, int userbuf, + bool encrypted) { void *vaddr; if (!csize) return 0; - vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE); + if (encrypted) + vaddr = (__force void *)ioremap_encrypted(pfn << PAGE_SHIFT, PAGE_SIZE); + else + vaddr = (__force void *)ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE); + if (!vaddr) return -ENOMEM; if (userbuf) { - if (copy_to_user(buf, vaddr + offset, csize)) { - iounmap(vaddr); + if (copy_to_user((void __user *)buf, vaddr + offset, csize)) { + iounmap((void __iomem *)vaddr); return -EFAULT; } } else memcpy(buf, vaddr + offset, csize); set_iounmap_nonlazy(); - iounmap(vaddr); + iounmap((void __iomem *)vaddr); return csize; } + +/** + * copy_oldmem_page - copy one page of memory + * @pfn: page frame number to be copied + * @buf: target memory address for the copy; this can be in kernel address + * space or user address space (see @userbuf) + * @csize: number of bytes to copy + * @offset: offset in bytes into the page (based on pfn) to begin the copy + * @userbuf: if set, @buf is in user address space, use copy_to_user(), + * otherwise @buf is in kernel address space, use memcpy(). + * + * Copy a page from the old kernel's memory. For this page, there is no pte + * mapped in the current kernel. We stitch up a pte, similar to kmap_atomic. + */ +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, + unsigned long offset, int userbuf) +{ + return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, false); +} + +/** + * copy_oldmem_page_encrypted - same as copy_oldmem_page() above but ioremap the + * memory with the encryption mask set to accomodate kdump on SME-enabled + * machines. + */ +ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize, + unsigned long offset, int userbuf) +{ + return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true); +} diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index cbde728f8ac6..42c32d06f7da 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -24,6 +24,8 @@ #include #include #include +#include +#include #include #include "internal.h" @@ -98,7 +100,8 @@ static int pfn_is_ram(unsigned long pfn) /* Reads a page from the oldmem device from given offset. */ static ssize_t read_from_oldmem(char *buf, size_t count, - u64 *ppos, int userbuf) + u64 *ppos, int userbuf, + bool encrypted) { unsigned long pfn, offset; size_t nr_bytes; @@ -120,8 +123,15 @@ static ssize_t read_from_oldmem(char *buf, size_t count, if (pfn_is_ram(pfn) == 0) memset(buf, 0, nr_bytes); else { - tmp = copy_oldmem_page(pfn, buf, nr_bytes, - offset, userbuf); + if (encrypted) + tmp = copy_oldmem_page_encrypted(pfn, buf, + nr_bytes, + offset, + userbuf); + else + tmp = copy_oldmem_page(pfn, buf, nr_bytes, + offset, userbuf); + if (tmp < 0) return tmp; } @@ -155,7 +165,7 @@ void __weak elfcorehdr_free(unsigned long long addr) */ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) { - return read_from_oldmem(buf, count, ppos, 0); + return read_from_oldmem(buf, count, ppos, 0, false); } /* @@ -163,7 +173,7 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) */ ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) { - return read_from_oldmem(buf, count, ppos, 0); + return read_from_oldmem(buf, count, ppos, 0, sme_active()); } /* @@ -173,6 +183,7 @@ int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from, unsigned long pfn, unsigned long size, pgprot_t prot) { + prot = pgprot_encrypted(prot); return remap_pfn_range(vma, from, pfn, size, prot); } @@ -351,7 +362,8 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, m->offset + m->size - *fpos, buflen); start = m->paddr + *fpos - m->offset; - tmp = read_from_oldmem(buffer, tsz, &start, userbuf); + tmp = read_from_oldmem(buffer, tsz, &start, + userbuf, sme_active()); if (tmp < 0) return tmp; buflen -= tsz; diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 3e4ba9d753c8..f774c5eb9e3c 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -26,6 +26,10 @@ extern int remap_oldmem_pfn_range(struct vm_area_struct *vma, extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, unsigned long, int); +extern ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, + size_t csize, unsigned long offset, + int userbuf); + void vmcore_cleanup(void); /* Architecture code defines this if there are other possible ELF -- cgit v1.2.3 From 06d4a462e954756f3d3d54e6f3f1bdc2e6f592a9 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Sat, 6 Oct 2018 16:43:25 +0800 Subject: x86/KASLR: Update KERNEL_IMAGE_SIZE description Currently CONFIG_RANDOMIZE_BASE=y is set by default, which makes some of the old comments above the KERNEL_IMAGE_SIZE definition out of date. Update them to the current state of affairs. Signed-off-by: Baoquan He Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: corbet@lwn.net Cc: linux-doc@vger.kernel.org Cc: thgarnie@google.com Link: http://lkml.kernel.org/r/20181006084327.27467-2-bhe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/page_64_types.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 6afac386a434..cd0cf1c568b4 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -59,13 +59,16 @@ #endif /* - * Kernel image size is limited to 1GiB due to the fixmap living in the - * next 1GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). Use - * 512MiB by default, leaving 1.5GiB for modules once the page tables - * are fully set up. If kernel ASLR is configured, it can extend the - * kernel page table mapping, reducing the size of the modules area. + * Maximum kernel image size is limited to 1 GiB, due to the fixmap living + * in the next 1 GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). + * + * On KASLR use 1 GiB by default, leaving 1 GiB for modules once the + * page tables are fully set up. + * + * If KASLR is disabled we can shrink it to 0.5 GiB and increase the size + * of the modules area to 1.5 GiB. */ -#if defined(CONFIG_RANDOMIZE_BASE) +#ifdef CONFIG_RANDOMIZE_BASE #define KERNEL_IMAGE_SIZE (1024 * 1024 * 1024) #else #define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) -- cgit v1.2.3 From 5b12904065798fee8b153a506ac7b72d5ebbe26c Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Sat, 6 Oct 2018 16:43:26 +0800 Subject: x86/mm/doc: Clean up the x86-64 virtual memory layout descriptions In Documentation/x86/x86_64/mm.txt, the description of the x86-64 virtual memory layout has become a confusing hodgepodge of inconsistencies: - there's a hard to read mixture of 'TB' and 'bits' notation - the entries sometimes mention a size in the description and sometimes not - sometimes they list holes by address, sometimes only as an 'unused hole' line So make it all a coherent, readable, well organized description. Signed-off-by: Baoquan He Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: corbet@lwn.net Cc: linux-doc@vger.kernel.org Cc: thgarnie@google.com Link: http://lkml.kernel.org/r/20181006084327.27467-3-bhe@redhat.com Signed-off-by: Ingo Molnar --- Documentation/x86/x86_64/mm.txt | 84 ++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index 5432a96d31ff..b4bc95c9790e 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -1,55 +1,55 @@ Virtual memory map with 4 level page tables: -0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm -hole caused by [47:63] sign extension -ffff800000000000 - ffff87ffffffffff (=43 bits) guard hole, reserved for hypervisor -ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory -ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole -ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space -ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole -ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB) -... unused hole ... -ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB) -... unused hole ... +0000000000000000 - 00007fffffffffff (=47 bits, 128 TB) user space, different per mm + hole caused by [47:63] sign extension +ffff800000000000 - ffff87ffffffffff (=43 bits, 8 TB) guard hole, reserved for hypervisor +ffff880000000000 - ffffc7ffffffffff (=46 bits, 64 TB) direct mapping of all phys. memory (page_offset_base) +ffffc80000000000 - ffffc8ffffffffff (=40 bits, 1 TB) unused hole +ffffc90000000000 - ffffe8ffffffffff (=45 bits, 32 TB) vmalloc/ioremap space (vmalloc_base) +ffffe90000000000 - ffffe9ffffffffff (=40 bits, 1 TB) unused hole +ffffea0000000000 - ffffeaffffffffff (=40 bits, 1 TB) virtual memory map (vmemmap_base) +ffffeb0000000000 - ffffebffffffffff (=40 bits, 1 TB) unused hole +ffffec0000000000 - fffffbffffffffff (=44 bits, 16 TB) kasan shadow memory +fffffc0000000000 - fffffdffffffffff (=41 bits, 2 TB) unused hole vaddr_end for KASLR -fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping -fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI -ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks -... unused hole ... -ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space -... unused hole ... -ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0 -ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space +fffffe0000000000 - fffffe7fffffffff (=39 bits, 512 GB) cpu_entry_area mapping +fffffe8000000000 - fffffeffffffffff (=39 bits, 512 GB) LDT remap for PTI +ffffff0000000000 - ffffff7fffffffff (=39 bits, 512 GB) %esp fixup stacks +ffffff8000000000 - fffffffeefffffff (~39 bits, ~507 GB) unused hole +ffffffef00000000 - fffffffeffffffff (=36 bits, 64 GB) EFI region mapping space +ffffffff00000000 - ffffffff7fffffff (=31 bits, 2 GB) unused hole +ffffffff80000000 - ffffffff9fffffff (=29 bits, 512 MB) kernel text mapping, from phys 0 +ffffffffa0000000 - fffffffffeffffff (~31 bits, 1520 MB) module mapping space [fixmap start] - ffffffffff5fffff kernel-internal fixmap range -ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI -ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole +ffffffffff600000 - ffffffffff600fff ( =4 kB) legacy vsyscall ABI +ffffffffffe00000 - ffffffffffffffff ( =2 MB) unused hole Virtual memory map with 5 level page tables: -0000000000000000 - 00ffffffffffffff (=56 bits) user space, different per mm -hole caused by [56:63] sign extension -ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor -ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory -ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI -ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB) -ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole -ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) -... unused hole ... -ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB) -... unused hole ... +0000000000000000 - 00ffffffffffffff (=56 bits, 64 PB) user space, different per mm + hole caused by [56:63] sign extension +ff00000000000000 - ff0fffffffffffff (=52 bits, 4 PB) guard hole, reserved for hypervisor +ff10000000000000 - ff8fffffffffffff (=55 bits, 32 PB) direct mapping of all phys. memory (page_offset_base) +ff90000000000000 - ff9fffffffffffff (=52 bits, 4 PB) LDT remap for PTI +ffa0000000000000 - ffd1ffffffffffff (=53 bits, 12800 TB) vmalloc/ioremap space (vmalloc_base) +ffd2000000000000 - ffd3ffffffffffff (=49 bits, 512 TB) unused hole +ffd4000000000000 - ffd5ffffffffffff (=49 bits, 512 TB) virtual memory map (vmemmap_base) +ffd6000000000000 - ffdeffffffffffff (~51 bits, 2304 TB) unused hole +ffdf000000000000 - fffffdffffffffff (~53 bits, ~8 PB) kasan shadow memory +fffffc0000000000 - fffffdffffffffff (=41 bits, 2 TB) unused hole vaddr_end for KASLR -fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping -... unused hole ... -ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks -... unused hole ... -ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space -... unused hole ... -ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0 -ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space +fffffe0000000000 - fffffe7fffffffff (=39 bits, 512 GB) cpu_entry_area mapping +fffffe8000000000 - fffffeffffffffff (=39 bits, 512 GB) unused hole +ffffff0000000000 - ffffff7fffffffff (=39 bits, 512 GB) %esp fixup stacks +ffffff8000000000 - ffffffeeffffffff (~39 bits, 444 GB) unused hole +ffffffef00000000 - fffffffeffffffff (=36 bits, 64 GB) EFI region mapping space +ffffffff00000000 - ffffffff7fffffff (31 bits, 2 GB) unused hole +ffffffff80000000 - ffffffff9fffffff (=29 bits, 512 MB) kernel text mapping, from phys 0 +ffffffffa0000000 - fffffffffeffffff (~31 bits, 1520 MB) module mapping space [fixmap start] - ffffffffff5fffff kernel-internal fixmap range -ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI -ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole +ffffffffff600000 - ffffffffff600fff ( =4 kB) legacy vsyscall ABI +ffffffffffe00000 - ffffffffffffffff ( =2 MB) unused hole Architecture defines a 64-bit virtual address. Implementations can support less. Currently supported are 48- and 57-bit virtual addresses. Bits 63 -- cgit v1.2.3 From 32b89760ddf4477da436c272be2abc016e169031 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 6 Oct 2018 13:41:12 +0200 Subject: x86/mm/doc: Enhance the x86-64 virtual memory layout descriptions After the cleanups from Baoquan He, make it even more readable: - Remove the 'bits' area size column: it's pretty pointless and was even wrong for some of the entries. Given that MB, GB, TB, PT are 10, 20, 30 and 40 bits, a "8 TB" size description makes it obvious that it's 43 bits. - Introduce an "offset" column: -------------------------------------------------------------------------------- start addr | offset | end addr | size | VM area description -----------------|------------|------------------|---------|-------------------- ... ffff880000000000 | -120 TB | ffffc7ffffffffff | 64 TB | direct mapping of all physical memory (page_offset_base), this is what limits max physical memory supported. The -120 TB notation makes it obvious where this particular virtual memory region starts: 120 TB down from the top of the 64-bit virtual memory space. Especially the layout of the kernel mappings is a *lot* more obvious when written this way, plus it's much easier to compare it with the size column and understand/check/validate and modify the kernel's layout in the future. - Mark the part from where the 47-bit and 56-bit kernel layouts are 100% identical, this starts at the -512 GB offset and the EFI region. - Re-shuffle the size desciptions to be continous blocks of sizes, instead of the often mixed size. I.e. write "0.5 TB" instead of "512 GB" if we are still in the TB-granular region of the map. - Make the 47-bit and 56-bit descriptions use the *exact* same layout and wording, and only differ where there's a material difference. This makes it easy to compare the two tables side by side by switching between two terminal tabs. - Plus enhance a lot of other stylistic/typographical details: make the tables explicitly tabular, add headers, enhance certain entries, etc. etc. Note that there are some apparent errors in the tables as well, but I'll fix them in a separate patch to make it easier to review/validate. Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: corbet@lwn.net Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: thgarnie@google.com Signed-off-by: Ingo Molnar --- Documentation/x86/x86_64/mm.txt | 171 ++++++++++++++++++++++++++++------------ 1 file changed, 120 insertions(+), 51 deletions(-) diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index b4bc95c9790e..702898633b00 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -1,55 +1,124 @@ +==================================================== +Complete virtual memory map with 4-level page tables +==================================================== -Virtual memory map with 4 level page tables: - -0000000000000000 - 00007fffffffffff (=47 bits, 128 TB) user space, different per mm - hole caused by [47:63] sign extension -ffff800000000000 - ffff87ffffffffff (=43 bits, 8 TB) guard hole, reserved for hypervisor -ffff880000000000 - ffffc7ffffffffff (=46 bits, 64 TB) direct mapping of all phys. memory (page_offset_base) -ffffc80000000000 - ffffc8ffffffffff (=40 bits, 1 TB) unused hole -ffffc90000000000 - ffffe8ffffffffff (=45 bits, 32 TB) vmalloc/ioremap space (vmalloc_base) -ffffe90000000000 - ffffe9ffffffffff (=40 bits, 1 TB) unused hole -ffffea0000000000 - ffffeaffffffffff (=40 bits, 1 TB) virtual memory map (vmemmap_base) -ffffeb0000000000 - ffffebffffffffff (=40 bits, 1 TB) unused hole -ffffec0000000000 - fffffbffffffffff (=44 bits, 16 TB) kasan shadow memory -fffffc0000000000 - fffffdffffffffff (=41 bits, 2 TB) unused hole - vaddr_end for KASLR -fffffe0000000000 - fffffe7fffffffff (=39 bits, 512 GB) cpu_entry_area mapping -fffffe8000000000 - fffffeffffffffff (=39 bits, 512 GB) LDT remap for PTI -ffffff0000000000 - ffffff7fffffffff (=39 bits, 512 GB) %esp fixup stacks -ffffff8000000000 - fffffffeefffffff (~39 bits, ~507 GB) unused hole -ffffffef00000000 - fffffffeffffffff (=36 bits, 64 GB) EFI region mapping space -ffffffff00000000 - ffffffff7fffffff (=31 bits, 2 GB) unused hole -ffffffff80000000 - ffffffff9fffffff (=29 bits, 512 MB) kernel text mapping, from phys 0 -ffffffffa0000000 - fffffffffeffffff (~31 bits, 1520 MB) module mapping space -[fixmap start] - ffffffffff5fffff kernel-internal fixmap range -ffffffffff600000 - ffffffffff600fff ( =4 kB) legacy vsyscall ABI -ffffffffffe00000 - ffffffffffffffff ( =2 MB) unused hole - -Virtual memory map with 5 level page tables: - -0000000000000000 - 00ffffffffffffff (=56 bits, 64 PB) user space, different per mm - hole caused by [56:63] sign extension -ff00000000000000 - ff0fffffffffffff (=52 bits, 4 PB) guard hole, reserved for hypervisor -ff10000000000000 - ff8fffffffffffff (=55 bits, 32 PB) direct mapping of all phys. memory (page_offset_base) -ff90000000000000 - ff9fffffffffffff (=52 bits, 4 PB) LDT remap for PTI -ffa0000000000000 - ffd1ffffffffffff (=53 bits, 12800 TB) vmalloc/ioremap space (vmalloc_base) -ffd2000000000000 - ffd3ffffffffffff (=49 bits, 512 TB) unused hole -ffd4000000000000 - ffd5ffffffffffff (=49 bits, 512 TB) virtual memory map (vmemmap_base) -ffd6000000000000 - ffdeffffffffffff (~51 bits, 2304 TB) unused hole -ffdf000000000000 - fffffdffffffffff (~53 bits, ~8 PB) kasan shadow memory -fffffc0000000000 - fffffdffffffffff (=41 bits, 2 TB) unused hole - vaddr_end for KASLR -fffffe0000000000 - fffffe7fffffffff (=39 bits, 512 GB) cpu_entry_area mapping -fffffe8000000000 - fffffeffffffffff (=39 bits, 512 GB) unused hole -ffffff0000000000 - ffffff7fffffffff (=39 bits, 512 GB) %esp fixup stacks -ffffff8000000000 - ffffffeeffffffff (~39 bits, 444 GB) unused hole -ffffffef00000000 - fffffffeffffffff (=36 bits, 64 GB) EFI region mapping space -ffffffff00000000 - ffffffff7fffffff (31 bits, 2 GB) unused hole -ffffffff80000000 - ffffffff9fffffff (=29 bits, 512 MB) kernel text mapping, from phys 0 -ffffffffa0000000 - fffffffffeffffff (~31 bits, 1520 MB) module mapping space -[fixmap start] - ffffffffff5fffff kernel-internal fixmap range -ffffffffff600000 - ffffffffff600fff ( =4 kB) legacy vsyscall ABI -ffffffffffe00000 - ffffffffffffffff ( =2 MB) unused hole +Notes: + + - Negative addresses such as "-23 TB" are absolute addresses in bytes, counted down + from the top of the 64-bit address space. It's easier to understand the layout + when seen both in absolute addresses and in distance-from-top notation. + + For example 0xffffe90000000000 == -23 TB, it's 23 TB lower than the top of the + 64-bit address space (ffffffffffffffff). + + Note that as we get closer to the top of the address space, the notation changes + from TB to GB and then MB/KB. + + - "16M TB" might look weird at first sight, but it's an easier to visualize size + notation than "16 EB", which few will recognize at first sight as 16 exabytes. + It also shows it nicely how incredibly large 64-bit address space is. + +======================================================================================================================== + Start addr | Offset | End addr | Size | VM area description +======================================================================================================================== + | | | | + 0000000000000000 | 0 | 00007fffffffffff | 128 TB | user-space virtual memory, different per mm +__________________|____________|__________________|_________|___________________________________________________________ + | | | | + 0000800000000000 | +128 TB | ffff7fffffffffff | ~16M TB | ... huge, almost 64 bits wide hole of non-canonical + | | | | virtual memory addresses up to the -128 TB + | | | | starting offset of kernel mappings. +__________________|____________|__________________|_________|___________________________________________________________ + | + | Kernel-space virtual memory, shared between all processes: +____________________________________________________________|___________________________________________________________ + | | | | + ffff800000000000 | -128 TB | ffff87ffffffffff | 8 TB | ... guard hole, also reserved for hypervisor + ffff880000000000 | -120 TB | ffffc7ffffffffff | 64 TB | direct mapping of all physical memory (page_offset_base) + ffffc80000000000 | -56 TB | ffffc8ffffffffff | 1 TB | ... unused hole + ffffc90000000000 | -55 TB | ffffe8ffffffffff | 32 TB | vmalloc/ioremap space (vmalloc_base) + ffffe90000000000 | -23 TB | ffffe9ffffffffff | 1 TB | ... unused hole + ffffea0000000000 | -22 TB | ffffeaffffffffff | 1 TB | virtual memory map (vmemmap_base) + ffffeb0000000000 | -21 TB | ffffebffffffffff | 1 TB | ... unused hole + ffffec0000000000 | -20 TB | fffffbffffffffff | 16 TB | KASAN shadow memory + fffffc0000000000 | -4 TB | fffffdffffffffff | 2 TB | ... unused hole + | | | | vaddr_end for KASLR + fffffe0000000000 | -2 TB | fffffe7fffffffff | 0.5 TB | cpu_entry_area mapping + fffffe8000000000 | -1.5 TB | fffffeffffffffff | 0.5 TB | LDT remap for PTI + ffffff0000000000 | -1 TB | ffffff7fffffffff | 0.5 TB | %esp fixup stacks +__________________|____________|__________________|_________|____________________________________________________________ + | + | Identical layout to the 47-bit one from here on: +____________________________________________________________|____________________________________________________________ + | | | | + ffffff8000000000 | -512 GB | ffffffeeffffffff | 444 GB | ... unused hole + ffffffef00000000 | -68 GB | fffffffeffffffff | 64 GB | EFI region mapping space + ffffffff00000000 | -4 GB | ffffffff7fffffff | 2 GB | ... unused hole + ffffffff80000000 | -2 GB | ffffffff9fffffff | 512 MB | kernel text mapping, mapped to physical address 0 + ffffffff80000000 |-2048 MB | | | + ffffffffa0000000 |-1536 MB | fffffffffeffffff | 1520 MB | module mapping space + ffffffffff000000 | -16 MB | | | + FIXADDR_START | ~-11 MB | ffffffffff5fffff | ~0.5 MB | kernel-internal fixmap range, variable size and offset + ffffffffff600000 | -10 MB | ffffffffff600fff | 4 kB | legacy vsyscall ABI + ffffffffffe00000 | -2 MB | ffffffffffffffff | 2 MB | ... unused hole +__________________|____________|__________________|_________|___________________________________________________________ + + +==================================================== +Complete virtual memory map with 5-level page tables +==================================================== + +Notes: + + - With 56-bit addresses, user-space memory gets expanded by a factor of 512x, + from 0.125 PB to 64 PB. All kernel mappings shift down to the -64 PT starting + offset and many of the regions expand to support the much larger physical + memory supported. + +======================================================================================================================== + Start addr | Offset | End addr | Size | VM area description +======================================================================================================================== + | | | | + 0000000000000000 | 0 | 00ffffffffffffff | 64 PB | user-space virtual memory, different per mm +__________________|____________|__________________|_________|___________________________________________________________ + | | | | + 0000800000000000 | +64 PB | ffff7fffffffffff | ~16K PB | ... huge, still almost 64 bits wide hole of non-canonical + | | | | virtual memory addresses up to the -128 TB + | | | | starting offset of kernel mappings. +__________________|____________|__________________|_________|___________________________________________________________ + | + | Kernel-space virtual memory, shared between all processes: +____________________________________________________________|___________________________________________________________ + | | | | + ff00000000000000 | -64 PB | ff0fffffffffffff | 4 PB | ... guard hole, also reserved for hypervisor + ff10000000000000 | -60 PB | ff8fffffffffffff | 32 PB | direct mapping of all physical memory (page_offset_base) + ff90000000000000 | -28 PB | ff9fffffffffffff | 4 PB | LDT remap for PTI + ffa0000000000000 | -24 PB | ffd1ffffffffffff | 12.5 PB | vmalloc/ioremap space (vmalloc_base) + ffd2000000000000 | -11.5 PB | ffd3ffffffffffff | 0.5 PB | ... unused hole + ffd4000000000000 | -11 PB | ffd5ffffffffffff | 0.5 PB | virtual memory map (vmemmap_base) + ffd6000000000000 | -10.5 PB | ffdeffffffffffff | 2.25 PB | ... unused hole + ffdf000000000000 | -8.25 PB | fffffdffffffffff | ~8 PB | KASAN shadow memory + fffffc0000000000 | -4 TB | fffffdffffffffff | 2 TB | ... unused hole + | | | | vaddr_end for KASLR + fffffe0000000000 | -2 TB | fffffe7fffffffff | 0.5 TB | cpu_entry_area mapping + fffffe8000000000 | -1.5 TB | fffffeffffffffff | 0.5 TB | ... unused hole + ffffff0000000000 | -1 TB | ffffff7fffffffff | 0.5 TB | %esp fixup stacks +__________________|____________|__________________|_________|____________________________________________________________ + | + | Identical layout to the 47-bit one from here on: +____________________________________________________________|____________________________________________________________ + | | | | + ffffff8000000000 | -512 GB | ffffffeeffffffff | 444 GB | ... unused hole + ffffffef00000000 | -68 GB | fffffffeffffffff | 64 GB | EFI region mapping space + ffffffff00000000 | -4 GB | ffffffff7fffffff | 2 GB | ... unused hole + ffffffff80000000 | -2 GB | ffffffff9fffffff | 512 MB | kernel text mapping, mapped to physical address 0 + ffffffff80000000 |-2048 MB | | | + ffffffffa0000000 |-1536 MB | fffffffffeffffff | 1520 MB | module mapping space + ffffffffff000000 | -16 MB | | | + FIXADDR_START | ~-11 MB | ffffffffff5fffff | ~0.5 MB | kernel-internal fixmap range, variable size and offset + ffffffffff600000 | -10 MB | ffffffffff600fff | 4 kB | legacy vsyscall ABI + ffffffffffe00000 | -2 MB | ffffffffffffffff | 2 MB | ... unused hole +__________________|____________|__________________|_________|___________________________________________________________ Architecture defines a 64-bit virtual address. Implementations can support less. Currently supported are 48- and 57-bit virtual addresses. Bits 63 -- cgit v1.2.3 From cf089611f4c446285046fcd426d90c18f37d2905 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 8 Oct 2018 10:05:20 +0200 Subject: proc/vmcore: Fix i386 build error of missing copy_oldmem_page_encrypted() Lianbo reported a build error with a particular 32-bit config, see Link below for details. Provide a weak copy_oldmem_page_encrypted() function which architectures can override, in the same manner other functionality in that file is supplied. Reported-by: Lianbo Jiang Signed-off-by: Borislav Petkov CC: x86@kernel.org Link: http://lkml.kernel.org/r/710b9d95-2f70-eadf-c4a1-c3dc80ee4ebb@redhat.com --- fs/proc/vmcore.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 42c32d06f7da..91ae16fbd7d5 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -187,6 +187,16 @@ int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma, return remap_pfn_range(vma, from, pfn, size, prot); } +/* + * Architectures which support memory encryption override this. + */ +ssize_t __weak +copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize, + unsigned long offset, int userbuf) +{ + return copy_oldmem_page(pfn, buf, csize, offset, userbuf); +} + /* * Copy to either kernel or user space */ -- cgit v1.2.3 From a31acd3ee8f7dbc0370bdf4a4bfef7a8c13c7542 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 26 Aug 2018 12:56:48 +0200 Subject: x86/mm: Page size aware flush_tlb_mm_range() Use the new tlb_get_unmap_shift() to determine the stride of the INVLPG loop. Cc: Nick Piggin Cc: Will Deacon Cc: "Aneesh Kumar K.V" Cc: Andrew Morton Cc: Dave Hansen Signed-off-by: Peter Zijlstra (Intel) --- arch/x86/include/asm/tlb.h | 21 ++++++++++++++------- arch/x86/include/asm/tlbflush.h | 12 ++++++++---- arch/x86/kernel/ldt.c | 2 +- arch/x86/kernel/vm86_32.c | 2 +- arch/x86/mm/tlb.c | 17 ++++++++--------- mm/pgtable-generic.c | 1 + 6 files changed, 33 insertions(+), 22 deletions(-) diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h index cb0a1f470980..afbe7d1e68cf 100644 --- a/arch/x86/include/asm/tlb.h +++ b/arch/x86/include/asm/tlb.h @@ -6,16 +6,23 @@ #define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) -#define tlb_flush(tlb) \ -{ \ - if (!tlb->fullmm && !tlb->need_flush_all) \ - flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end, 0UL); \ - else \ - flush_tlb_mm_range(tlb->mm, 0UL, TLB_FLUSH_ALL, 0UL); \ -} +static inline void tlb_flush(struct mmu_gather *tlb); #include +static inline void tlb_flush(struct mmu_gather *tlb) +{ + unsigned long start = 0UL, end = TLB_FLUSH_ALL; + unsigned int stride_shift = tlb_get_unmap_shift(tlb); + + if (!tlb->fullmm && !tlb->need_flush_all) { + start = tlb->start; + end = tlb->end; + } + + flush_tlb_mm_range(tlb->mm, start, end, stride_shift); +} + /* * While x86 architecture in general requires an IPI to perform TLB * shootdown, enablement code for several hypervisors overrides diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 58ce5288878e..671f65309ce7 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -547,23 +547,27 @@ struct flush_tlb_info { unsigned long start; unsigned long end; u64 new_tlb_gen; + unsigned int stride_shift; }; #define local_flush_tlb() __flush_tlb() #define flush_tlb_mm(mm) flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL) -#define flush_tlb_range(vma, start, end) \ - flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags) +#define flush_tlb_range(vma, start, end) \ + flush_tlb_mm_range((vma)->vm_mm, start, end, \ + ((vma)->vm_flags & VM_HUGETLB) \ + ? huge_page_shift(hstate_vma(vma)) \ + : PAGE_SHIFT) extern void flush_tlb_all(void); extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, - unsigned long end, unsigned long vmflag); + unsigned long end, unsigned int stride_shift); extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) { - flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE); + flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT); } void native_flush_tlb_others(const struct cpumask *cpumask, diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 733e6ace0fa4..7fdb2414ca65 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -273,7 +273,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) map_ldt_struct_to_user(mm); va = (unsigned long)ldt_slot_va(slot); - flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0); + flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, PAGE_SHIFT); ldt->slot = slot; return 0; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 1c03e4aa6474..52fed70f671e 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -199,7 +199,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) pte_unmap_unlock(pte, ptl); out: up_write(&mm->mmap_sem); - flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, 0UL); + flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT); } diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index e96b99eb800c..6aa195796dec 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -528,17 +528,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, f->new_tlb_gen == local_tlb_gen + 1 && f->new_tlb_gen == mm_tlb_gen) { /* Partial flush */ - unsigned long addr; - unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT; + unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift; + unsigned long addr = f->start; - addr = f->start; while (addr < f->end) { __flush_tlb_one_user(addr); - addr += PAGE_SIZE; + addr += 1UL << f->stride_shift; } if (local) - count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages); - trace_tlb_flush(reason, nr_pages); + count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate); + trace_tlb_flush(reason, nr_invalidate); } else { /* Full flush. */ local_flush_tlb(); @@ -623,12 +622,13 @@ void native_flush_tlb_others(const struct cpumask *cpumask, static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, - unsigned long end, unsigned long vmflag) + unsigned long end, unsigned int stride_shift) { int cpu; struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = { .mm = mm, + .stride_shift = stride_shift, }; cpu = get_cpu(); @@ -638,8 +638,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, /* Should we flush just the requested range? */ if ((end != TLB_FLUSH_ALL) && - !(vmflag & VM_HUGETLB) && - ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) { + ((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) { info.start = start; info.end = end; } else { diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index cf2af04b34b9..532c29276fce 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -8,6 +8,7 @@ */ #include +#include #include #include -- cgit v1.2.3 From 5462bc3a9a3c38328bbbd276d51164c7cf21d6a8 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 25 Sep 2018 23:58:38 -0400 Subject: x86/mm/tlb: Always use lazy TLB mode On most workloads, the number of context switches far exceeds the number of TLB flushes sent. Optimizing the context switches, by always using lazy TLB mode, speeds up those workloads. This patch results in about a 1% reduction in CPU use on a two socket Broadwell system running a memcache like workload. Cc: npiggin@gmail.com Cc: efault@gmx.de Cc: will.deacon@arm.com Cc: Linus Torvalds Cc: Thomas Gleixner Cc: kernel-team@fb.com Cc: hpa@zytor.com Cc: luto@kernel.org Tested-by: Song Liu Signed-off-by: Rik van Riel (cherry picked from commit 95b0e6357d3e4e05349668940d7ff8f3b7e7e11e) Acked-by: Dave Hansen Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20180716190337.26133-7-riel@surriel.com --- arch/x86/include/asm/tlbflush.h | 16 ---------------- arch/x86/mm/tlb.c | 15 +-------------- 2 files changed, 1 insertion(+), 30 deletions(-) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 671f65309ce7..d6c0cd9e9591 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -148,22 +148,6 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) #define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr) #endif -static inline bool tlb_defer_switch_to_init_mm(void) -{ - /* - * If we have PCID, then switching to init_mm is reasonably - * fast. If we don't have PCID, then switching to init_mm is - * quite slow, so we try to defer it in the hopes that we can - * avoid it entirely. The latter approach runs the risk of - * receiving otherwise unnecessary IPIs. - * - * This choice is just a heuristic. The tlb code can handle this - * function returning true or false regardless of whether we have - * PCID. - */ - return !static_cpu_has(X86_FEATURE_PCID); -} - struct tlb_context { u64 ctx_id; u64 tlb_gen; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 6aa195796dec..54a5870190a6 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -368,20 +368,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) return; - if (tlb_defer_switch_to_init_mm()) { - /* - * There's a significant optimization that may be possible - * here. We have accurate enough TLB flush tracking that we - * don't need to maintain coherence of TLB per se when we're - * lazy. We do, however, need to maintain coherence of - * paging-structure caches. We could, in principle, leave our - * old mm loaded and only switch to init_mm when - * tlb_remove_page() happens. - */ - this_cpu_write(cpu_tlbstate.is_lazy, true); - } else { - switch_mm(NULL, &init_mm, NULL); - } + this_cpu_write(cpu_tlbstate.is_lazy, true); } /* -- cgit v1.2.3 From 12c4d978fd170ccdd7260ec11f93b11e46904228 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 25 Sep 2018 23:58:39 -0400 Subject: x86/mm/tlb: Restructure switch_mm_irqs_off() Move some code that will be needed for the lazy -> !lazy state transition when a lazy TLB CPU has gotten out of date. No functional changes, since the if (real_prev == next) branch always returns. (cherry picked from commit 61d0beb5796ab11f7f3bf38cb2eccc6579aaa70b) Cc: npiggin@gmail.com Cc: efault@gmx.de Cc: will.deacon@arm.com Cc: Linus Torvalds Cc: Thomas Gleixner Cc: songliubraving@fb.com Cc: kernel-team@fb.com Cc: hpa@zytor.com Suggested-by: Andy Lutomirski Signed-off-by: Rik van Riel Acked-by: Dave Hansen Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20180716190337.26133-4-riel@surriel.com --- arch/x86/mm/tlb.c | 66 +++++++++++++++++++++++++++---------------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 54a5870190a6..9fb30d27854b 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -187,6 +187,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); unsigned cpu = smp_processor_id(); u64 next_tlb_gen; + bool need_flush; + u16 new_asid; /* * NB: The scheduler will call us with prev == next when switching @@ -252,8 +254,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, return; } else { - u16 new_asid; - bool need_flush; u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id); /* @@ -308,44 +308,44 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, /* Let nmi_uaccess_okay() know that we're changing CR3. */ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); barrier(); + } - if (need_flush) { - this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); - this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); - load_new_mm_cr3(next->pgd, new_asid, true); - - /* - * NB: This gets called via leave_mm() in the idle path - * where RCU functions differently. Tracing normally - * uses RCU, so we need to use the _rcuidle variant. - * - * (There is no good reason for this. The idle code should - * be rearranged to call this before rcu_idle_enter().) - */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - } else { - /* The new ASID is already up to date. */ - load_new_mm_cr3(next->pgd, new_asid, false); - - /* See above wrt _rcuidle. */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); - } + if (need_flush) { + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); + load_new_mm_cr3(next->pgd, new_asid, true); /* - * Record last user mm's context id, so we can avoid - * flushing branch buffer with IBPB if we switch back - * to the same user. + * NB: This gets called via leave_mm() in the idle path + * where RCU functions differently. Tracing normally + * uses RCU, so we need to use the _rcuidle variant. + * + * (There is no good reason for this. The idle code should + * be rearranged to call this before rcu_idle_enter().) */ - if (next != &init_mm) - this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); - - /* Make sure we write CR3 before loaded_mm. */ - barrier(); + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + } else { + /* The new ASID is already up to date. */ + load_new_mm_cr3(next->pgd, new_asid, false); - this_cpu_write(cpu_tlbstate.loaded_mm, next); - this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); + /* See above wrt _rcuidle. */ + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); } + /* + * Record last user mm's context id, so we can avoid + * flushing branch buffer with IBPB if we switch back + * to the same user. + */ + if (next != &init_mm) + this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); + + /* Make sure we write CR3 before loaded_mm. */ + barrier(); + + this_cpu_write(cpu_tlbstate.loaded_mm, next); + this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); + load_mm_cr4(next); switch_ldt(real_prev, next); } -- cgit v1.2.3 From c3f7f2c7eba1a53d2e5ffbc2dcc9a20c5f094890 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 25 Sep 2018 23:58:40 -0400 Subject: smp: use __cpumask_set_cpu in on_each_cpu_cond The code in on_each_cpu_cond sets CPUs in a locally allocated bitmask, which should never be used by other CPUs simultaneously. There is no need to use locked memory accesses to set the bits in this bitmap. Switch to __cpumask_set_cpu. Cc: npiggin@gmail.com Cc: mingo@kernel.org Cc: will.deacon@arm.com Cc: songliubraving@fb.com Cc: kernel-team@fb.com Cc: hpa@zytor.com Suggested-by: Peter Zijlstra Signed-off-by: Rik van Riel Reviewed-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20180926035844.1420-4-riel@surriel.com --- kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/smp.c b/kernel/smp.c index d86eec5f51c1..a7d4f9f50a49 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -682,7 +682,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), preempt_disable(); for_each_online_cpu(cpu) if (cond_func(cpu, info)) - cpumask_set_cpu(cpu, cpus); + __cpumask_set_cpu(cpu, cpus); on_each_cpu_mask(cpus, func, info, wait); preempt_enable(); free_cpumask_var(cpus); -- cgit v1.2.3 From 7d49b28a80b830c3ca876d33bedc58d62a78e16f Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 25 Sep 2018 23:58:41 -0400 Subject: smp,cpumask: introduce on_each_cpu_cond_mask Introduce a variant of on_each_cpu_cond that iterates only over the CPUs in a cpumask, in order to avoid making callbacks for every single CPU in the system when we only need to test a subset. Cc: npiggin@gmail.com Cc: mingo@kernel.org Cc: will.deacon@arm.com Cc: songliubraving@fb.com Cc: kernel-team@fb.com Cc: hpa@zytor.com Cc: luto@kernel.org Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20180926035844.1420-5-riel@surriel.com --- include/linux/smp.h | 4 ++++ kernel/smp.c | 17 +++++++++++++---- kernel/up.c | 14 +++++++++++--- 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/include/linux/smp.h b/include/linux/smp.h index 9fb239e12b82..a56f08ff3097 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -53,6 +53,10 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), smp_call_func_t func, void *info, bool wait, gfp_t gfp_flags); +void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags, const struct cpumask *mask); + int smp_call_function_single_async(int cpu, call_single_data_t *csd); #ifdef CONFIG_SMP diff --git a/kernel/smp.c b/kernel/smp.c index a7d4f9f50a49..163c451af42e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -669,9 +669,9 @@ EXPORT_SYMBOL(on_each_cpu_mask); * You must not call this function with disabled interrupts or * from a hardware interrupt handler or from a bottom half handler. */ -void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), +void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info), smp_call_func_t func, void *info, bool wait, - gfp_t gfp_flags) + gfp_t gfp_flags, const struct cpumask *mask) { cpumask_var_t cpus; int cpu, ret; @@ -680,7 +680,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { preempt_disable(); - for_each_online_cpu(cpu) + for_each_cpu(cpu, mask) if (cond_func(cpu, info)) __cpumask_set_cpu(cpu, cpus); on_each_cpu_mask(cpus, func, info, wait); @@ -692,7 +692,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), * just have to IPI them one by one. */ preempt_disable(); - for_each_online_cpu(cpu) + for_each_cpu(cpu, mask) if (cond_func(cpu, info)) { ret = smp_call_function_single(cpu, func, info, wait); @@ -701,6 +701,15 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), preempt_enable(); } } +EXPORT_SYMBOL(on_each_cpu_cond_mask); + +void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags) +{ + on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, + cpu_online_mask); +} EXPORT_SYMBOL(on_each_cpu_cond); static void do_nothing(void *unused) diff --git a/kernel/up.c b/kernel/up.c index 42c46bf3e0a5..ff536f9cc8a2 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -68,9 +68,9 @@ EXPORT_SYMBOL(on_each_cpu_mask); * Preemption is disabled here to make sure the cond_func is called under the * same condtions in UP and SMP. */ -void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), - smp_call_func_t func, void *info, bool wait, - gfp_t gfp_flags) +void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags, const struct cpumask *mask) { unsigned long flags; @@ -82,6 +82,14 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), } preempt_enable(); } +EXPORT_SYMBOL(on_each_cpu_cond_mask); + +void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags) +{ + on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, NULL); +} EXPORT_SYMBOL(on_each_cpu_cond); int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) -- cgit v1.2.3 From 016c4d92cd16f569c6485ae62b076c1a4b779536 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 25 Sep 2018 23:58:42 -0400 Subject: x86/mm/tlb: Add freed_tables argument to flush_tlb_mm_range Add an argument to flush_tlb_mm_range to indicate whether page tables are about to be freed after this TLB flush. This allows for an optimization of flush_tlb_mm_range to skip CPUs in lazy TLB mode. No functional changes. Cc: npiggin@gmail.com Cc: mingo@kernel.org Cc: will.deacon@arm.com Cc: songliubraving@fb.com Cc: kernel-team@fb.com Cc: luto@kernel.org Cc: hpa@zytor.com Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20180926035844.1420-6-riel@surriel.com --- arch/x86/include/asm/tlb.h | 2 +- arch/x86/include/asm/tlbflush.h | 10 ++++++---- arch/x86/kernel/ldt.c | 2 +- arch/x86/kernel/vm86_32.c | 2 +- arch/x86/mm/tlb.c | 3 ++- 5 files changed, 11 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h index afbe7d1e68cf..404b8b1d44f5 100644 --- a/arch/x86/include/asm/tlb.h +++ b/arch/x86/include/asm/tlb.h @@ -20,7 +20,7 @@ static inline void tlb_flush(struct mmu_gather *tlb) end = tlb->end; } - flush_tlb_mm_range(tlb->mm, start, end, stride_shift); + flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables); } /* diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index d6c0cd9e9591..1dea9860ce5b 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -536,22 +536,24 @@ struct flush_tlb_info { #define local_flush_tlb() __flush_tlb() -#define flush_tlb_mm(mm) flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL) +#define flush_tlb_mm(mm) \ + flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true) #define flush_tlb_range(vma, start, end) \ flush_tlb_mm_range((vma)->vm_mm, start, end, \ ((vma)->vm_flags & VM_HUGETLB) \ ? huge_page_shift(hstate_vma(vma)) \ - : PAGE_SHIFT) + : PAGE_SHIFT, false) extern void flush_tlb_all(void); extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, - unsigned long end, unsigned int stride_shift); + unsigned long end, unsigned int stride_shift, + bool freed_tables); extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) { - flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT); + flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false); } void native_flush_tlb_others(const struct cpumask *cpumask, diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 7fdb2414ca65..ab18e0884dc6 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -273,7 +273,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) map_ldt_struct_to_user(mm); va = (unsigned long)ldt_slot_va(slot); - flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, PAGE_SHIFT); + flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, PAGE_SHIFT, false); ldt->slot = slot; return 0; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 52fed70f671e..c2fd39752da8 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -199,7 +199,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) pte_unmap_unlock(pte, ptl); out: up_write(&mm->mmap_sem); - flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT); + flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false); } diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 9fb30d27854b..14bf39fc0447 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -609,7 +609,8 @@ void native_flush_tlb_others(const struct cpumask *cpumask, static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, - unsigned long end, unsigned int stride_shift) + unsigned long end, unsigned int stride_shift, + bool freed_tables) { int cpu; -- cgit v1.2.3 From 97807813fe7074ee865d6bc1df1d0f8fb878ee9d Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 25 Sep 2018 23:58:43 -0400 Subject: x86/mm/tlb: Add freed_tables element to flush_tlb_info Pass the information on to native_flush_tlb_others. No functional changes. Cc: npiggin@gmail.com Cc: mingo@kernel.org Cc: will.deacon@arm.com Cc: songliubraving@fb.com Cc: kernel-team@fb.com Cc: hpa@zytor.com Cc: luto@kernel.org Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20180926035844.1420-7-riel@surriel.com --- arch/x86/include/asm/tlbflush.h | 1 + arch/x86/mm/tlb.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 1dea9860ce5b..323a313947e0 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -532,6 +532,7 @@ struct flush_tlb_info { unsigned long end; u64 new_tlb_gen; unsigned int stride_shift; + bool freed_tables; }; #define local_flush_tlb() __flush_tlb() diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 14bf39fc0447..92e46f4c058c 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -617,6 +617,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = { .mm = mm, .stride_shift = stride_shift, + .freed_tables = freed_tables, }; cpu = get_cpu(); -- cgit v1.2.3 From 145f573b89a62bf53cfc0144fa9b1c56b0f70b45 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 25 Sep 2018 23:58:44 -0400 Subject: x86/mm/tlb: Make lazy TLB mode lazier Lazy TLB mode can result in an idle CPU being woken up by a TLB flush, when all it really needs to do is reload %CR3 at the next context switch, assuming no page table pages got freed. Memory ordering is used to prevent race conditions between switch_mm_irqs_off, which checks whether .tlb_gen changed, and the TLB invalidation code, which increments .tlb_gen whenever page table entries get invalidated. The atomic increment in inc_mm_tlb_gen is its own barrier; the context switch code adds an explicit barrier between reading tlbstate.is_lazy and next->context.tlb_gen. CPUs in lazy TLB mode remain part of the mm_cpumask(mm), both because that allows TLB flush IPIs to be sent at page table freeing time, and because the cache line bouncing on the mm_cpumask(mm) was responsible for about half the CPU use in switch_mm_irqs_off(). We can change native_flush_tlb_others() without touching other (paravirt) implementations of flush_tlb_others() because we'll be flushing less. The existing implementations flush more and are therefore still correct. Cc: npiggin@gmail.com Cc: mingo@kernel.org Cc: will.deacon@arm.com Cc: kernel-team@fb.com Cc: luto@kernel.org Cc: hpa@zytor.com Tested-by: Song Liu Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20180926035844.1420-8-riel@surriel.com --- arch/x86/mm/tlb.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 58 insertions(+), 9 deletions(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 92e46f4c058c..7d68489cfdb1 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -185,6 +185,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, { struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy); unsigned cpu = smp_processor_id(); u64 next_tlb_gen; bool need_flush; @@ -242,17 +243,40 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, next->context.ctx_id); /* - * We don't currently support having a real mm loaded without - * our cpu set in mm_cpumask(). We have all the bookkeeping - * in place to figure out whether we would need to flush - * if our cpu were cleared in mm_cpumask(), but we don't - * currently use it. + * Even in lazy TLB mode, the CPU should stay set in the + * mm_cpumask. The TLB shootdown code can figure out from + * from cpu_tlbstate.is_lazy whether or not to send an IPI. */ if (WARN_ON_ONCE(real_prev != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))) cpumask_set_cpu(cpu, mm_cpumask(next)); - return; + /* + * If the CPU is not in lazy TLB mode, we are just switching + * from one thread in a process to another thread in the same + * process. No TLB flush required. + */ + if (!was_lazy) + return; + + /* + * Read the tlb_gen to check whether a flush is needed. + * If the TLB is up to date, just use it. + * The barrier synchronizes with the tlb_gen increment in + * the TLB shootdown code. + */ + smp_mb(); + next_tlb_gen = atomic64_read(&next->context.tlb_gen); + if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) == + next_tlb_gen) + return; + + /* + * TLB contents went out of date while we were in lazy + * mode. Fall through to the TLB switching code below. + */ + new_asid = prev_asid; + need_flush = true; } else { u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id); @@ -346,8 +370,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, this_cpu_write(cpu_tlbstate.loaded_mm, next); this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); - load_mm_cr4(next); - switch_ldt(real_prev, next); + if (next != real_prev) { + load_mm_cr4(next); + switch_ldt(real_prev, next); + } } /* @@ -455,6 +481,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, * paging-structure cache to avoid speculatively reading * garbage into our TLB. Since switching to init_mm is barely * slower than a minimal flush, just switch to init_mm. + * + * This should be rare, with native_flush_tlb_others skipping + * IPIs to lazy TLB mode CPUs. */ switch_mm_irqs_off(NULL, &init_mm, NULL); return; @@ -557,6 +586,11 @@ static void flush_tlb_func_remote(void *info) flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN); } +static bool tlb_is_not_lazy(int cpu, void *data) +{ + return !per_cpu(cpu_tlbstate.is_lazy, cpu); +} + void native_flush_tlb_others(const struct cpumask *cpumask, const struct flush_tlb_info *info) { @@ -592,8 +626,23 @@ void native_flush_tlb_others(const struct cpumask *cpumask, (void *)info, 1); return; } - smp_call_function_many(cpumask, flush_tlb_func_remote, + + /* + * If no page tables were freed, we can skip sending IPIs to + * CPUs in lazy TLB mode. They will flush the CPU themselves + * at the next context switch. + * + * However, if page tables are getting freed, we need to send the + * IPI everywhere, to prevent CPUs in lazy TLB mode from tripping + * up on the new contents of what used to be page tables, while + * doing a speculative memory access. + */ + if (info->freed_tables) + smp_call_function_many(cpumask, flush_tlb_func_remote, (void *)info, 1); + else + on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote, + (void *)info, 1, GFP_ATOMIC, cpumask); } /* -- cgit v1.2.3 From 164477c2331be75d9bd57fb76704e676b2bcd1cd Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 28 Sep 2018 09:02:20 -0700 Subject: x86/mm: Clarify hardware vs. software "error_code" We pass around a variable called "error_code" all around the page fault code. Sounds simple enough, especially since "error_code" looks like it exactly matches the values that the hardware gives us on the stack to report the page fault error code (PFEC in SDM parlance). But, that's not how it works. For part of the page fault handler, "error_code" does exactly match PFEC. But, during later parts, it diverges and starts to mean something a bit different. Give it two names for its two jobs. The place it diverges is also really screwy. It's only in a spot where the hardware tells us we have kernel-mode access that occurred while we were in usermode accessing user-controlled address space. Add a warning in there. Cc: x86@kernel.org Cc: Jann Horn Cc: Sean Christopherson Cc: Thomas Gleixner Cc: Andy Lutomirski Signed-off-by: Dave Hansen Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20180928160220.4A2272C9@viggo.jf.intel.com --- arch/x86/mm/fault.c | 77 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 25 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 47bebfe6efa7..cd08f4fef836 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1208,9 +1208,10 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs) * routines. */ static noinline void -__do_page_fault(struct pt_regs *regs, unsigned long error_code, +__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code, unsigned long address) { + unsigned long sw_error_code; struct vm_area_struct *vma; struct task_struct *tsk; struct mm_struct *mm; @@ -1236,17 +1237,17 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * nothing more. * * This verifies that the fault happens in kernel space - * (error_code & 4) == 0, and that the fault was not a - * protection error (error_code & 9) == 0. + * (hw_error_code & 4) == 0, and that the fault was not a + * protection error (hw_error_code & 9) == 0. */ if (unlikely(fault_in_kernel_space(address))) { - if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { + if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { if (vmalloc_fault(address) >= 0) return; } /* Can handle a stale RO->RW TLB: */ - if (spurious_fault(error_code, address)) + if (spurious_fault(hw_error_code, address)) return; /* kprobes don't want to hook the spurious faults: */ @@ -1256,7 +1257,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * Don't take the mm semaphore here. If we fixup a prefetch * fault we could otherwise deadlock: */ - bad_area_nosemaphore(regs, error_code, address, NULL); + bad_area_nosemaphore(regs, hw_error_code, address, NULL); return; } @@ -1265,11 +1266,11 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, if (unlikely(kprobes_fault(regs))) return; - if (unlikely(error_code & X86_PF_RSVD)) - pgtable_bad(regs, error_code, address); + if (unlikely(hw_error_code & X86_PF_RSVD)) + pgtable_bad(regs, hw_error_code, address); - if (unlikely(smap_violation(error_code, regs))) { - bad_area_nosemaphore(regs, error_code, address, NULL); + if (unlikely(smap_violation(hw_error_code, regs))) { + bad_area_nosemaphore(regs, hw_error_code, address, NULL); return; } @@ -1278,10 +1279,17 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * in a region with pagefaults disabled then we must not take the fault */ if (unlikely(faulthandler_disabled() || !mm)) { - bad_area_nosemaphore(regs, error_code, address, NULL); + bad_area_nosemaphore(regs, hw_error_code, address, NULL); return; } + /* + * hw_error_code is literally the "page fault error code" passed to + * the kernel directly from the hardware. But, we will shortly be + * modifying it in software, so give it a new name. + */ + sw_error_code = hw_error_code; + /* * It's safe to allow irq's after cr2 has been saved and the * vmalloc fault has been handled. @@ -1291,7 +1299,26 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, */ if (user_mode(regs)) { local_irq_enable(); - error_code |= X86_PF_USER; + /* + * Up to this point, X86_PF_USER set in hw_error_code + * indicated a user-mode access. But, after this, + * X86_PF_USER in sw_error_code will indicate either + * that, *or* an implicit kernel(supervisor)-mode access + * which originated from user mode. + */ + if (!(hw_error_code & X86_PF_USER)) { + /* + * The CPU was in user mode, but the CPU says + * the fault was not a user-mode access. + * Must be an implicit kernel-mode access, + * which we do not expect to happen in the + * user address space. + */ + pr_warn_once("kernel-mode error from user-mode: %lx\n", + hw_error_code); + + sw_error_code |= X86_PF_USER; + } flags |= FAULT_FLAG_USER; } else { if (regs->flags & X86_EFLAGS_IF) @@ -1300,9 +1327,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - if (error_code & X86_PF_WRITE) + if (sw_error_code & X86_PF_WRITE) flags |= FAULT_FLAG_WRITE; - if (error_code & X86_PF_INSTR) + if (sw_error_code & X86_PF_INSTR) flags |= FAULT_FLAG_INSTRUCTION; /* @@ -1322,9 +1349,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * space check, thus avoiding the deadlock: */ if (unlikely(!down_read_trylock(&mm->mmap_sem))) { - if (!(error_code & X86_PF_USER) && + if (!(sw_error_code & X86_PF_USER) && !search_exception_tables(regs->ip)) { - bad_area_nosemaphore(regs, error_code, address, NULL); + bad_area_nosemaphore(regs, sw_error_code, address, NULL); return; } retry: @@ -1340,16 +1367,16 @@ retry: vma = find_vma(mm, address); if (unlikely(!vma)) { - bad_area(regs, error_code, address); + bad_area(regs, sw_error_code, address); return; } if (likely(vma->vm_start <= address)) goto good_area; if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { - bad_area(regs, error_code, address); + bad_area(regs, sw_error_code, address); return; } - if (error_code & X86_PF_USER) { + if (sw_error_code & X86_PF_USER) { /* * Accessing the stack below %sp is always a bug. * The large cushion allows instructions like enter @@ -1357,12 +1384,12 @@ retry: * 32 pointers and then decrements %sp by 65535.) */ if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { - bad_area(regs, error_code, address); + bad_area(regs, sw_error_code, address); return; } } if (unlikely(expand_stack(vma, address))) { - bad_area(regs, error_code, address); + bad_area(regs, sw_error_code, address); return; } @@ -1371,8 +1398,8 @@ retry: * we can handle it.. */ good_area: - if (unlikely(access_error(error_code, vma))) { - bad_area_access_error(regs, error_code, address, vma); + if (unlikely(access_error(sw_error_code, vma))) { + bad_area_access_error(regs, sw_error_code, address, vma); return; } @@ -1414,13 +1441,13 @@ good_area: return; /* Not returning to user mode? Handle exceptions or die: */ - no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); + no_context(regs, sw_error_code, address, SIGBUS, BUS_ADRERR); return; } up_read(&mm->mmap_sem); if (unlikely(fault & VM_FAULT_ERROR)) { - mm_fault_error(regs, error_code, address, &pkey, fault); + mm_fault_error(regs, sw_error_code, address, &pkey, fault); return; } -- cgit v1.2.3 From 8fed62000039058adfd8b663344e2f448aed1e7a Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 28 Sep 2018 09:02:22 -0700 Subject: x86/mm: Break out kernel address space handling The page fault handler (__do_page_fault()) basically has two sections: one for handling faults in the kernel portion of the address space and another for faults in the user portion of the address space. But, these two parts don't stick out that well. Let's make that more clear from code separation and naming. Pull kernel fault handling into its own helper, and reflect that naming by renaming spurious_fault() -> spurious_kernel_fault(). Also, rewrite the vmalloc() handling comment a bit. It was a bit stale and also glossed over the reserved bit handling. Cc: x86@kernel.org Cc: Jann Horn Cc: Sean Christopherson Cc: Thomas Gleixner Cc: Andy Lutomirski Signed-off-by: Dave Hansen Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20180928160222.401F4E10@viggo.jf.intel.com --- arch/x86/mm/fault.c | 101 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 62 insertions(+), 39 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index cd08f4fef836..c7e32f453852 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1032,7 +1032,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, } } -static int spurious_fault_check(unsigned long error_code, pte_t *pte) +static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte) { if ((error_code & X86_PF_WRITE) && !pte_write(*pte)) return 0; @@ -1071,7 +1071,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte) * (Optional Invalidation). */ static noinline int -spurious_fault(unsigned long error_code, unsigned long address) +spurious_kernel_fault(unsigned long error_code, unsigned long address) { pgd_t *pgd; p4d_t *p4d; @@ -1102,27 +1102,27 @@ spurious_fault(unsigned long error_code, unsigned long address) return 0; if (p4d_large(*p4d)) - return spurious_fault_check(error_code, (pte_t *) p4d); + return spurious_kernel_fault_check(error_code, (pte_t *) p4d); pud = pud_offset(p4d, address); if (!pud_present(*pud)) return 0; if (pud_large(*pud)) - return spurious_fault_check(error_code, (pte_t *) pud); + return spurious_kernel_fault_check(error_code, (pte_t *) pud); pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return 0; if (pmd_large(*pmd)) - return spurious_fault_check(error_code, (pte_t *) pmd); + return spurious_kernel_fault_check(error_code, (pte_t *) pmd); pte = pte_offset_kernel(pmd, address); if (!pte_present(*pte)) return 0; - ret = spurious_fault_check(error_code, pte); + ret = spurious_kernel_fault_check(error_code, pte); if (!ret) return 0; @@ -1130,12 +1130,12 @@ spurious_fault(unsigned long error_code, unsigned long address) * Make sure we have permissions in PMD. * If not, then there's a bug in the page tables: */ - ret = spurious_fault_check(error_code, (pte_t *) pmd); + ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd); WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); return ret; } -NOKPROBE_SYMBOL(spurious_fault); +NOKPROBE_SYMBOL(spurious_kernel_fault); int show_unhandled_signals = 1; @@ -1202,6 +1202,58 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs) return true; } +/* + * Called for all faults where 'address' is part of the kernel address + * space. Might get called for faults that originate from *code* that + * ran in userspace or the kernel. + */ +static void +do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, + unsigned long address) +{ + /* + * We can fault-in kernel-space virtual memory on-demand. The + * 'reference' page table is init_mm.pgd. + * + * NOTE! We MUST NOT take any locks for this case. We may + * be in an interrupt or a critical region, and should + * only copy the information from the master page table, + * nothing more. + * + * Before doing this on-demand faulting, ensure that the + * fault is not any of the following: + * 1. A fault on a PTE with a reserved bit set. + * 2. A fault caused by a user-mode access. (Do not demand- + * fault kernel memory due to user-mode accesses). + * 3. A fault caused by a page-level protection violation. + * (A demand fault would be on a non-present page which + * would have X86_PF_PROT==0). + */ + if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { + if (vmalloc_fault(address) >= 0) + return; + } + + /* Was the fault spurious, caused by lazy TLB invalidation? */ + if (spurious_kernel_fault(hw_error_code, address)) + return; + + /* kprobes don't want to hook the spurious faults: */ + if (kprobes_fault(regs)) + return; + + /* + * Note, despite being a "bad area", there are quite a few + * acceptable reasons to get here, such as erratum fixups + * and handling kernel code that can fault, like get_user(). + * + * Don't take the mm semaphore here. If we fixup a prefetch + * fault we could otherwise deadlock: + */ + bad_area_nosemaphore(regs, hw_error_code, address, NULL); +} +NOKPROBE_SYMBOL(do_kern_addr_fault); + /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate @@ -1227,38 +1279,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long hw_error_code, if (unlikely(kmmio_fault(regs, address))) return; - /* - * We fault-in kernel-space virtual memory on-demand. The - * 'reference' page table is init_mm.pgd. - * - * NOTE! We MUST NOT take any locks for this case. We may - * be in an interrupt or a critical region, and should - * only copy the information from the master page table, - * nothing more. - * - * This verifies that the fault happens in kernel space - * (hw_error_code & 4) == 0, and that the fault was not a - * protection error (hw_error_code & 9) == 0. - */ + /* Was the fault on kernel-controlled part of the address space? */ if (unlikely(fault_in_kernel_space(address))) { - if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { - if (vmalloc_fault(address) >= 0) - return; - } - - /* Can handle a stale RO->RW TLB: */ - if (spurious_fault(hw_error_code, address)) - return; - - /* kprobes don't want to hook the spurious faults: */ - if (kprobes_fault(regs)) - return; - /* - * Don't take the mm semaphore here. If we fixup a prefetch - * fault we could otherwise deadlock: - */ - bad_area_nosemaphore(regs, hw_error_code, address, NULL); - + do_kern_addr_fault(regs, hw_error_code, address); return; } -- cgit v1.2.3 From aa37c51b9421d66f7931c5fdcb9ce80c450974be Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 28 Sep 2018 09:02:23 -0700 Subject: x86/mm: Break out user address space handling The last patch broke out kernel address space handing into its own helper. Now, do the same for user address space handling. Cc: x86@kernel.org Cc: Jann Horn Cc: Sean Christopherson Cc: Thomas Gleixner Cc: Andy Lutomirski Signed-off-by: Dave Hansen Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20180928160223.9C4F6440@viggo.jf.intel.com --- arch/x86/mm/fault.c | 47 ++++++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index c7e32f453852..0d1f5d39fc63 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -966,6 +966,7 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code, __bad_area(regs, error_code, address, vma, SEGV_ACCERR); } +/* Handle faults in the kernel portion of the address space */ static void do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, u32 *pkey, unsigned int fault) @@ -1254,14 +1255,11 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, } NOKPROBE_SYMBOL(do_kern_addr_fault); -/* - * This routine handles page faults. It determines the address, - * and the problem, and then passes it off to one of the appropriate - * routines. - */ -static noinline void -__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code, - unsigned long address) +/* Handle faults in the user portion of the address space */ +static inline +void do_user_addr_fault(struct pt_regs *regs, + unsigned long hw_error_code, + unsigned long address) { unsigned long sw_error_code; struct vm_area_struct *vma; @@ -1274,17 +1272,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long hw_error_code, tsk = current; mm = tsk->mm; - prefetchw(&mm->mmap_sem); - - if (unlikely(kmmio_fault(regs, address))) - return; - - /* Was the fault on kernel-controlled part of the address space? */ - if (unlikely(fault_in_kernel_space(address))) { - do_kern_addr_fault(regs, hw_error_code, address); - return; - } - /* kprobes don't want to hook the spurious faults: */ if (unlikely(kprobes_fault(regs))) return; @@ -1488,6 +1475,28 @@ good_area: check_v8086_mode(regs, address, tsk); } +NOKPROBE_SYMBOL(do_user_addr_fault); + +/* + * This routine handles page faults. It determines the address, + * and the problem, and then passes it off to one of the appropriate + * routines. + */ +static noinline void +__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code, + unsigned long address) +{ + prefetchw(¤t->mm->mmap_sem); + + if (unlikely(kmmio_fault(regs, address))) + return; + + /* Was the fault on kernel-controlled part of the address space? */ + if (unlikely(fault_in_kernel_space(address))) + do_kern_addr_fault(regs, hw_error_code, address); + else + do_user_addr_fault(regs, hw_error_code, address); +} NOKPROBE_SYMBOL(__do_page_fault); static nokprobe_inline void -- cgit v1.2.3 From 5b0c2cac54d44ea40d5c5aec7cbf14353b2a903e Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 28 Sep 2018 09:02:25 -0700 Subject: x86/mm: Add clarifying comments for user addr space The SMAP and Reserved checking do not have nice comments. Add some to clarify and make it match everything else. Cc: x86@kernel.org Cc: Jann Horn Cc: Sean Christopherson Cc: Thomas Gleixner Cc: Andy Lutomirski Signed-off-by: Dave Hansen Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20180928160225.FFD44B8D@viggo.jf.intel.com --- arch/x86/mm/fault.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 0d1f5d39fc63..1d838701a5f7 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1276,9 +1276,17 @@ void do_user_addr_fault(struct pt_regs *regs, if (unlikely(kprobes_fault(regs))) return; + /* + * Reserved bits are never expected to be set on + * entries in the user portion of the page tables. + */ if (unlikely(hw_error_code & X86_PF_RSVD)) pgtable_bad(regs, hw_error_code, address); + /* + * Check for invalid kernel (supervisor) access to user + * pages in the user address space. + */ if (unlikely(smap_violation(hw_error_code, regs))) { bad_area_nosemaphore(regs, hw_error_code, address, NULL); return; -- cgit v1.2.3 From 88259744e253777e898c186f08670c86dd8199bf Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 28 Sep 2018 09:02:27 -0700 Subject: x86/mm: Fix exception table comments The comments here are wrong. They are too absolute about where faults can occur when running in the kernel. The comments are also a bit hard to match up with the code. Trim down the comments, and make them more precise. Also add a comment explaining why we are doing the bad_area_nosemaphore() path here. Cc: x86@kernel.org Cc: Jann Horn Cc: Sean Christopherson Cc: Thomas Gleixner Cc: Andy Lutomirski Signed-off-by: Dave Hansen Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20180928160227.077DDD7A@viggo.jf.intel.com --- arch/x86/mm/fault.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 1d838701a5f7..57b074b02ebb 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1351,24 +1351,26 @@ void do_user_addr_fault(struct pt_regs *regs, flags |= FAULT_FLAG_INSTRUCTION; /* - * When running in the kernel we expect faults to occur only to - * addresses in user space. All other faults represent errors in - * the kernel and should generate an OOPS. Unfortunately, in the - * case of an erroneous fault occurring in a code path which already - * holds mmap_sem we will deadlock attempting to validate the fault - * against the address space. Luckily the kernel only validly - * references user space from well defined areas of code, which are - * listed in the exceptions table. + * Kernel-mode access to the user address space should only occur + * on well-defined single instructions listed in the exception + * tables. But, an erroneous kernel fault occurring outside one of + * those areas which also holds mmap_sem might deadlock attempting + * to validate the fault against the address space. * - * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibility of a - * deadlock. Attempt to lock the address space, if we cannot we then - * validate the source. If this is invalid we can skip the address - * space check, thus avoiding the deadlock: + * Only do the expensive exception table search when we might be at + * risk of a deadlock. This happens if we + * 1. Failed to acquire mmap_sem, and + * 2. The access did not originate in userspace. Note: either the + * hardware or earlier page fault code may set X86_PF_USER + * in sw_error_code. */ if (unlikely(!down_read_trylock(&mm->mmap_sem))) { if (!(sw_error_code & X86_PF_USER) && !search_exception_tables(regs->ip)) { + /* + * Fault from code in kernel from + * which we do not expect faults. + */ bad_area_nosemaphore(regs, sw_error_code, address, NULL); return; } -- cgit v1.2.3 From 02e983b760c0d4183c28d625a3c99016e2cd8a7f Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 28 Sep 2018 09:02:28 -0700 Subject: x86/mm: Add vsyscall address helper We will shortly be using this check in two locations. Put it in a helper before we do so. Let's also insert PAGE_MASK instead of the open-coded ~0xfff. It is easier to read and also more obviously correct considering the implicit type conversion that has to happen when it is not an implicit 'unsigned long'. Cc: x86@kernel.org Cc: Jann Horn Cc: Sean Christopherson Cc: Thomas Gleixner Cc: Andy Lutomirski Signed-off-by: Dave Hansen Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20180928160228.C593509B@viggo.jf.intel.com --- arch/x86/mm/fault.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 57b074b02ebb..7a627ac3a0d2 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -840,6 +840,15 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, show_opcodes(regs, loglvl); } +/* + * The (legacy) vsyscall page is the long page in the kernel portion + * of the address space that has user-accessible permissions. + */ +static bool is_vsyscall_vaddr(unsigned long vaddr) +{ + return (vaddr & PAGE_MASK) == VSYSCALL_ADDR; +} + static void __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, unsigned long address, u32 *pkey, int si_code) @@ -869,7 +878,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, * emulation. */ if (unlikely((error_code & X86_PF_INSTR) && - ((address & ~0xfff) == VSYSCALL_ADDR))) { + is_vsyscall_vaddr(address))) { if (emulate_vsyscall(regs, address)) return; } -- cgit v1.2.3 From 3ae0ad92f53e0f05cf6ab781230b7902b88f73cd Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 28 Sep 2018 09:02:30 -0700 Subject: x86/mm/vsyscall: Consider vsyscall page part of user address space The vsyscall page is weird. It is in what is traditionally part of the kernel address space. But, it has user permissions and we handle faults on it like we would on a user page: interrupts on. Right now, we handle vsyscall emulation in the "bad_area" code, which is used for both user-address-space and kernel-address-space faults. Move the handling to the user-address-space code *only* and ensure we get there by "excluding" the vsyscall page from the kernel address space via a check in fault_in_kernel_space(). Since the fault_in_kernel_space() check is used on 32-bit, also add a 64-bit check to make it clear we only use this path on 64-bit. Also move the unlikely() to be in is_vsyscall_vaddr() itself. This helps clean up the kernel fault handling path by removing a case that can happen in normal[1] operation. (Yeah, yeah, we can argue about the vsyscall page being "normal" or not.) This also makes sanity checks easier, like the "we never take pkey faults in the kernel address space" check in the next patch. Cc: x86@kernel.org Cc: Jann Horn Cc: Sean Christopherson Cc: Thomas Gleixner Cc: Andy Lutomirski Signed-off-by: Dave Hansen Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20180928160230.6E9336EE@viggo.jf.intel.com --- arch/x86/mm/fault.c | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 7a627ac3a0d2..7e0fa7e24168 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -846,7 +846,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, */ static bool is_vsyscall_vaddr(unsigned long vaddr) { - return (vaddr & PAGE_MASK) == VSYSCALL_ADDR; + return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR); } static void @@ -872,18 +872,6 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, if (is_errata100(regs, address)) return; -#ifdef CONFIG_X86_64 - /* - * Instruction fetch faults in the vsyscall page might need - * emulation. - */ - if (unlikely((error_code & X86_PF_INSTR) && - is_vsyscall_vaddr(address))) { - if (emulate_vsyscall(regs, address)) - return; - } -#endif - /* * To avoid leaking information about the kernel page table * layout, pretend that user-mode accesses to kernel addresses @@ -1192,6 +1180,14 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) static int fault_in_kernel_space(unsigned long address) { + /* + * On 64-bit systems, the vsyscall page is at an address above + * TASK_SIZE_MAX, but is not considered part of the kernel + * address space. + */ + if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address)) + return false; + return address >= TASK_SIZE_MAX; } @@ -1359,6 +1355,22 @@ void do_user_addr_fault(struct pt_regs *regs, if (sw_error_code & X86_PF_INSTR) flags |= FAULT_FLAG_INSTRUCTION; +#ifdef CONFIG_X86_64 + /* + * Instruction fetch faults in the vsyscall page might need + * emulation. The vsyscall page is at a high address + * (>PAGE_OFFSET), but is considered to be part of the user + * address space. + * + * The vsyscall page does not have a "real" VMA, so do this + * emulation before we go searching for VMAs. + */ + if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) { + if (emulate_vsyscall(regs, address)) + return; + } +#endif + /* * Kernel-mode access to the user address space should only occur * on well-defined single instructions listed in the exception -- cgit v1.2.3 From 367e3f1d3fc9bbf1e626da7aea527f40babf8079 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 28 Sep 2018 09:02:31 -0700 Subject: x86/mm: Remove spurious fault pkey check Spurious faults only ever occur in the kernel's address space. They are also constrained specifically to faults with one of these error codes: X86_PF_WRITE | X86_PF_PROT X86_PF_INSTR | X86_PF_PROT So, it's never even possible to reach spurious_kernel_fault_check() with X86_PF_PK set. In addition, the kernel's address space never has pages with user-mode protections. Protection Keys are only enforced on pages with user-mode protection. This gives us lots of reasons to not check for protection keys in our sprurious kernel fault handling. But, let's also add some warnings to ensure that these assumptions about protection keys hold true. Cc: x86@kernel.org Cc: Jann Horn Cc: Sean Christopherson Cc: Thomas Gleixner Cc: Andy Lutomirski Signed-off-by: Dave Hansen Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20180928160231.243A0D6A@viggo.jf.intel.com --- arch/x86/mm/fault.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 7e0fa7e24168..a16652982f98 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1037,12 +1037,6 @@ static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte) if ((error_code & X86_PF_INSTR) && !pte_exec(*pte)) return 0; - /* - * Note: We do not do lazy flushing on protection key - * changes, so no spurious fault will ever set X86_PF_PK. - */ - if ((error_code & X86_PF_PK)) - return 1; return 1; } @@ -1217,6 +1211,13 @@ static void do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, unsigned long address) { + /* + * Protection keys exceptions only happen on user pages. We + * have no user pages in the kernel portion of the address + * space, so do not expect them here. + */ + WARN_ON_ONCE(hw_error_code & X86_PF_PK); + /* * We can fault-in kernel-space virtual memory on-demand. The * 'reference' page table is init_mm.pgd. -- cgit v1.2.3 From 51fbf14f2528a8c6401290e37f1c893a2412f1d3 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 27 Sep 2018 09:21:55 -0500 Subject: x86/kexec: Correct KEXEC_BACKUP_SRC_END off-by-one error The only use of KEXEC_BACKUP_SRC_END is as an argument to walk_system_ram_res(): int crash_load_segments(struct kimage *image) { ... walk_system_ram_res(KEXEC_BACKUP_SRC_START, KEXEC_BACKUP_SRC_END, image, determine_backup_region); walk_system_ram_res() expects "start, end" arguments that are inclusive, i.e., the range to be walked includes both the start and end addresses. KEXEC_BACKUP_SRC_END was previously defined as (640 * 1024UL), which is the first address *past* the desired 0-640KB range. Define KEXEC_BACKUP_SRC_END as (640 * 1024UL - 1) so the KEXEC_BACKUP_SRC region is [0-0x9ffff], not [0-0xa0000]. Fixes: dd5f726076cc ("kexec: support for kexec on panic using new system call") Signed-off-by: Bjorn Helgaas Signed-off-by: Borislav Petkov CC: "H. Peter Anvin" CC: Andrew Morton CC: Brijesh Singh CC: Greg Kroah-Hartman CC: Ingo Molnar CC: Lianbo Jiang CC: Takashi Iwai CC: Thomas Gleixner CC: Tom Lendacky CC: Vivek Goyal CC: baiyaowei@cmss.chinamobile.com CC: bhe@redhat.com CC: dan.j.williams@intel.com CC: dyoung@redhat.com CC: kexec@lists.infradead.org Link: http://lkml.kernel.org/r/153805811578.1157.6948388946904655969.stgit@bhelgaas-glaptop.roam.corp.google.com --- arch/x86/include/asm/kexec.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index f327236f0fa7..5125fca472bb 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -67,7 +67,7 @@ struct kimage; /* Memory to backup during crash kdump */ #define KEXEC_BACKUP_SRC_START (0UL) -#define KEXEC_BACKUP_SRC_END (640 * 1024UL) /* 640K */ +#define KEXEC_BACKUP_SRC_END (640 * 1024UL - 1) /* 640K */ /* * CPU does not save ss and sp on stack if execution is already -- cgit v1.2.3 From a98959fdbda1849a01b2150bb635ed559ec06700 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 27 Sep 2018 09:22:02 -0500 Subject: resource: Include resource end in walk_*() interfaces find_next_iomem_res() finds an iomem resource that covers part of a range described by "start, end". All callers expect that range to be inclusive, i.e., both start and end are included, but find_next_iomem_res() doesn't handle the end address correctly. If it finds an iomem resource that contains exactly the end address, it skips it, e.g., if "start, end" is [0x0-0x10000] and there happens to be an iomem resource [mem 0x10000-0x10000] (the single byte at 0x10000), we skip it: find_next_iomem_res(...) { start = 0x0; end = 0x10000; for (p = next_resource(...)) { # p->start = 0x10000; # p->end = 0x10000; # we *should* return this resource, but this condition is false: if ((p->end >= start) && (p->start < end)) break; Adjust find_next_iomem_res() so it allows a resource that includes the single byte at the end of the range. This is a corner case that we probably don't see in practice. Fixes: 58c1b5b07907 ("[PATCH] memory hotadd fixes: find_next_system_ram catch range fix") Signed-off-by: Bjorn Helgaas Signed-off-by: Borislav Petkov CC: Andrew Morton CC: Brijesh Singh CC: Dan Williams CC: H. Peter Anvin CC: Lianbo Jiang CC: Takashi Iwai CC: Thomas Gleixner CC: Tom Lendacky CC: Vivek Goyal CC: Yaowei Bai CC: bhe@redhat.com CC: dan.j.williams@intel.com CC: dyoung@redhat.com CC: kexec@lists.infradead.org CC: mingo@redhat.com CC: x86-ml Link: http://lkml.kernel.org/r/153805812254.1157.16736368485811773752.stgit@bhelgaas-glaptop.roam.corp.google.com --- kernel/resource.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index 30e1bc68503b..155ec873ea4d 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -319,7 +319,7 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); /* - * Finds the lowest iomem resource existing within [res->start.res->end). + * Finds the lowest iomem resource existing within [res->start..res->end]. * The caller must specify res->start, res->end, res->flags, and optionally * desc. If found, returns 0, res is overwritten, if not found, returns -1. * This function walks the whole tree and not just first level children until @@ -352,7 +352,7 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc, p = NULL; break; } - if ((p->end >= start) && (p->start < end)) + if ((p->end >= start) && (p->start <= end)) break; } -- cgit v1.2.3 From 010a93bf97c72f43aac664d0a685942f83d1a103 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 27 Sep 2018 09:22:09 -0500 Subject: resource: Fix find_next_iomem_res() iteration issue Previously find_next_iomem_res() used "*res" as both an input parameter for the range to search and the type of resource to search for, and an output parameter for the resource we found, which makes the interface confusing. The current callers use find_next_iomem_res() incorrectly because they allocate a single struct resource and use it for repeated calls to find_next_iomem_res(). When find_next_iomem_res() returns a resource, it overwrites the start, end, flags, and desc members of the struct. If we call find_next_iomem_res() again, we must update or restore these fields. The previous code restored res.start and res.end, but not res.flags or res.desc. Since the callers did not restore res.flags, if they searched for flags IORESOURCE_MEM | IORESOURCE_BUSY and found a resource with flags IORESOURCE_MEM | IORESOURCE_BUSY | IORESOURCE_SYSRAM, the next search would incorrectly skip resources unless they were also marked as IORESOURCE_SYSRAM. Fix this by restructuring the interface so it takes explicit "start, end, flags" parameters and uses "*res" only as an output parameter. Based on a patch by Lianbo Jiang . [ bp: While at it: - make comments kernel-doc style. - Originally-by: http://lore.kernel.org/lkml/20180921073211.20097-2-lijiang@redhat.com Signed-off-by: Bjorn Helgaas Signed-off-by: Borislav Petkov CC: Andrew Morton CC: Brijesh Singh CC: Dan Williams CC: H. Peter Anvin CC: Lianbo Jiang CC: Takashi Iwai CC: Thomas Gleixner CC: Tom Lendacky CC: Vivek Goyal CC: Yaowei Bai CC: bhe@redhat.com CC: dan.j.williams@intel.com CC: dyoung@redhat.com CC: kexec@lists.infradead.org CC: mingo@redhat.com CC: x86-ml Link: http://lkml.kernel.org/r/153805812916.1157.177580438135143788.stgit@bhelgaas-glaptop.roam.corp.google.com --- kernel/resource.c | 96 ++++++++++++++++++++++++------------------------------- 1 file changed, 42 insertions(+), 54 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index 155ec873ea4d..38b8d11c9eaf 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -318,24 +318,27 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); -/* - * Finds the lowest iomem resource existing within [res->start..res->end]. - * The caller must specify res->start, res->end, res->flags, and optionally - * desc. If found, returns 0, res is overwritten, if not found, returns -1. - * This function walks the whole tree and not just first level children until - * and unless first_level_children_only is true. +/** + * Finds the lowest iomem resource that covers part of [start..end]. The + * caller must specify start, end, flags, and desc (which may be + * IORES_DESC_NONE). + * + * If a resource is found, returns 0 and *res is overwritten with the part + * of the resource that's within [start..end]; if none is found, returns + * -1. + * + * This function walks the whole tree and not just first level children + * unless @first_level_children_only is true. */ -static int find_next_iomem_res(struct resource *res, unsigned long desc, - bool first_level_children_only) +static int find_next_iomem_res(resource_size_t start, resource_size_t end, + unsigned long flags, unsigned long desc, + bool first_level_children_only, + struct resource *res) { - resource_size_t start, end; struct resource *p; bool sibling_only = false; BUG_ON(!res); - - start = res->start; - end = res->end; BUG_ON(start >= end); if (first_level_children_only) @@ -344,7 +347,7 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc, read_lock(&resource_lock); for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) { - if ((p->flags & res->flags) != res->flags) + if ((p->flags & flags) != flags) continue; if ((desc != IORES_DESC_NONE) && (desc != p->desc)) continue; @@ -359,32 +362,31 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc, read_unlock(&resource_lock); if (!p) return -1; + /* copy data */ - if (res->start < p->start) - res->start = p->start; - if (res->end > p->end) - res->end = p->end; + res->start = max(start, p->start); + res->end = min(end, p->end); res->flags = p->flags; res->desc = p->desc; return 0; } -static int __walk_iomem_res_desc(struct resource *res, unsigned long desc, - bool first_level_children_only, - void *arg, +static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, + unsigned long flags, unsigned long desc, + bool first_level_children_only, void *arg, int (*func)(struct resource *, void *)) { - u64 orig_end = res->end; + struct resource res; int ret = -1; - while ((res->start < res->end) && - !find_next_iomem_res(res, desc, first_level_children_only)) { - ret = (*func)(res, arg); + while (start < end && + !find_next_iomem_res(start, end, flags, desc, + first_level_children_only, &res)) { + ret = (*func)(&res, arg); if (ret) break; - res->start = res->end + 1; - res->end = orig_end; + start = res.end + 1; } return ret; @@ -407,13 +409,7 @@ static int __walk_iomem_res_desc(struct resource *res, unsigned long desc, int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)) { - struct resource res; - - res.start = start; - res.end = end; - res.flags = flags; - - return __walk_iomem_res_desc(&res, desc, false, arg, func); + return __walk_iomem_res_desc(start, end, flags, desc, false, arg, func); } EXPORT_SYMBOL_GPL(walk_iomem_res_desc); @@ -427,13 +423,9 @@ EXPORT_SYMBOL_GPL(walk_iomem_res_desc); int walk_system_ram_res(u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)) { - struct resource res; + unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - res.start = start; - res.end = end; - res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - - return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true, + return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true, arg, func); } @@ -444,13 +436,9 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, int walk_mem_res(u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)) { - struct resource res; + unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY; - res.start = start; - res.end = end; - res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; - - return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true, + return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true, arg, func); } @@ -464,25 +452,25 @@ int walk_mem_res(u64 start, u64 end, void *arg, int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)) { + resource_size_t start, end; + unsigned long flags; struct resource res; unsigned long pfn, end_pfn; - u64 orig_end; int ret = -1; - res.start = (u64) start_pfn << PAGE_SHIFT; - res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; - res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - orig_end = res.end; - while ((res.start < res.end) && - (find_next_iomem_res(&res, IORES_DESC_NONE, true) >= 0)) { + start = (u64) start_pfn << PAGE_SHIFT; + end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; + flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + while (start < end && + !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, + true, &res)) { pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; end_pfn = (res.end + 1) >> PAGE_SHIFT; if (end_pfn > pfn) ret = (*func)(pfn, end_pfn - pfn, arg); if (ret) break; - res.start = res.end + 1; - res.end = orig_end; + start = res.end + 1; } return ret; } -- cgit v1.2.3 From b69c2e20f6e4046da84ce5b33ba1ef89cb087b40 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 9 Oct 2018 16:11:21 +0200 Subject: resource: Clean it up a bit - Drop BUG_ON()s and do normal error handling instead, in find_next_iomem_res(). - Align function arguments on opening braces. - Get rid of local var sibling_only in find_next_iomem_res(). - Shorten unnecessarily long first_level_children_only arg name. Signed-off-by: Borislav Petkov CC: Andrew Morton CC: Bjorn Helgaas CC: Brijesh Singh CC: Dan Williams CC: H. Peter Anvin CC: Lianbo Jiang CC: Takashi Iwai CC: Thomas Gleixner CC: Tom Lendacky CC: Vivek Goyal CC: Yaowei Bai CC: bhe@redhat.com CC: dan.j.williams@intel.com CC: dyoung@redhat.com CC: kexec@lists.infradead.org CC: mingo@redhat.com Link: --- kernel/resource.c | 55 ++++++++++++++++++++++++++----------------------------- 1 file changed, 26 insertions(+), 29 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index 38b8d11c9eaf..b3a3a1fc499e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -328,25 +328,23 @@ EXPORT_SYMBOL(release_resource); * -1. * * This function walks the whole tree and not just first level children - * unless @first_level_children_only is true. + * unless @first_lvl is true. */ static int find_next_iomem_res(resource_size_t start, resource_size_t end, unsigned long flags, unsigned long desc, - bool first_level_children_only, - struct resource *res) + bool first_lvl, struct resource *res) { struct resource *p; - bool sibling_only = false; - BUG_ON(!res); - BUG_ON(start >= end); + if (!res) + return -EINVAL; - if (first_level_children_only) - sibling_only = true; + if (start >= end) + return -EINVAL; read_lock(&resource_lock); - for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) { + for (p = iomem_resource.child; p; p = next_resource(p, first_lvl)) { if ((p->flags & flags) != flags) continue; if ((desc != IORES_DESC_NONE) && (desc != p->desc)) @@ -373,15 +371,14 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, unsigned long flags, unsigned long desc, - bool first_level_children_only, void *arg, + bool first_lvl, void *arg, int (*func)(struct resource *, void *)) { struct resource res; int ret = -1; while (start < end && - !find_next_iomem_res(start, end, flags, desc, - first_level_children_only, &res)) { + !find_next_iomem_res(start, end, flags, desc, first_lvl, &res)) { ret = (*func)(&res, arg); if (ret) break; @@ -392,7 +389,7 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, return ret; } -/* +/** * Walks through iomem resources and calls func() with matching resource * ranges. This walks through whole tree and not just first level children. * All the memory ranges which overlap start,end and also match flags and @@ -421,7 +418,7 @@ EXPORT_SYMBOL_GPL(walk_iomem_res_desc); * ranges. */ int walk_system_ram_res(u64 start, u64 end, void *arg, - int (*func)(struct resource *, void *)) + int (*func)(struct resource *, void *)) { unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; @@ -450,7 +447,7 @@ int walk_mem_res(u64 start, u64 end, void *arg, * It is to be used only for System RAM. */ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, - void *arg, int (*func)(unsigned long, unsigned long, void *)) + void *arg, int (*func)(unsigned long, unsigned long, void *)) { resource_size_t start, end; unsigned long flags; @@ -646,8 +643,8 @@ static int find_resource(struct resource *root, struct resource *new, * @constraint: the size and alignment constraints to be met. */ static int reallocate_resource(struct resource *root, struct resource *old, - resource_size_t newsize, - struct resource_constraint *constraint) + resource_size_t newsize, + struct resource_constraint *constraint) { int err=0; struct resource new = *old; @@ -960,7 +957,7 @@ skip: * Existing children of the resource are assumed to be immutable. */ int adjust_resource(struct resource *res, resource_size_t start, - resource_size_t size) + resource_size_t size) { int result; @@ -971,9 +968,9 @@ int adjust_resource(struct resource *res, resource_size_t start, } EXPORT_SYMBOL(adjust_resource); -static void __init __reserve_region_with_split(struct resource *root, - resource_size_t start, resource_size_t end, - const char *name) +static void __init +__reserve_region_with_split(struct resource *root, resource_size_t start, + resource_size_t end, const char *name) { struct resource *parent = root; struct resource *conflict; @@ -1032,9 +1029,9 @@ static void __init __reserve_region_with_split(struct resource *root, } -void __init reserve_region_with_split(struct resource *root, - resource_size_t start, resource_size_t end, - const char *name) +void __init +reserve_region_with_split(struct resource *root, resource_size_t start, + resource_size_t end, const char *name) { int abort = 0; @@ -1160,7 +1157,7 @@ EXPORT_SYMBOL(__request_region); * The described resource region must match a currently busy region. */ void __release_region(struct resource *parent, resource_size_t start, - resource_size_t n) + resource_size_t n) { struct resource **p; resource_size_t end; @@ -1222,7 +1219,7 @@ EXPORT_SYMBOL(__release_region); * simplicity. Enhance this logic when necessary. */ int release_mem_region_adjustable(struct resource *parent, - resource_size_t start, resource_size_t size) + resource_size_t start, resource_size_t size) { struct resource **p; struct resource *res; @@ -1398,9 +1395,9 @@ static int devm_region_match(struct device *dev, void *res, void *match_data) this->start == match->start && this->n == match->n; } -struct resource * __devm_request_region(struct device *dev, - struct resource *parent, resource_size_t start, - resource_size_t n, const char *name) +struct resource * +__devm_request_region(struct device *dev, struct resource *parent, + resource_size_t start, resource_size_t n, const char *name) { struct region_devres *dr = NULL; struct resource *res; -- cgit v1.2.3 From c200dac78fec66d87ef262cac38cfe4feabdf737 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Oct 2018 21:53:48 +0200 Subject: x86/mm: Do not warn about PCI BIOS W+X mappings PCI BIOS requires the BIOS area 0x0A0000-0x0FFFFFF to be mapped W+X for various legacy reasons. When CONFIG_DEBUG_WX is enabled, this triggers the WX warning, but this is misleading because the mapping is required and is not a result of an accidental oversight. Prevent the full warning when PCI BIOS is enabled and the detected WX mapping is in the BIOS area. Just emit a pr_warn() which denotes the fact. This is partially duplicating the info which the PCI BIOS code emits when it maps the area as executable, but that info is not in the context of the WX checking output. Remove the extra %p printout in the WARN_ONCE() while at it. %pS is enough. Reported-by: Paul Menzel Signed-off-by: Thomas Gleixner Tested-by: Borislav Petkov Cc: Joerg Roedel Cc: Kees Cook Cc: Bjorn Helgaas Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1810082151160.2455@nanos.tec.linutronix.de --- arch/x86/mm/dump_pagetables.c | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index a12afff146d1..fc37bbd23eb8 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -19,7 +19,9 @@ #include #include #include +#include +#include #include /* @@ -241,6 +243,29 @@ static unsigned long normalize_addr(unsigned long u) return (signed long)(u << shift) >> shift; } +static void note_wx(struct pg_state *st) +{ + unsigned long npages; + + npages = (st->current_address - st->start_address) / PAGE_SIZE; + +#ifdef CONFIG_PCI_BIOS + /* + * If PCI BIOS is enabled, the PCI BIOS area is forced to WX. + * Inform about it, but avoid the warning. + */ + if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN && + st->current_address <= PAGE_OFFSET + BIOS_END) { + pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages); + return; + } +#endif + /* Account the WX pages */ + st->wx_pages += npages; + WARN_ONCE(1, "x86/mm: Found insecure W+X mapping at address %pS\n", + (void *)st->start_address); +} + /* * This function gets called on a break in a continuous series * of PTE entries; the next one is different so we need to @@ -276,14 +301,8 @@ static void note_page(struct seq_file *m, struct pg_state *st, unsigned long delta; int width = sizeof(unsigned long) * 2; - if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) { - WARN_ONCE(1, - "x86/mm: Found insecure W+X mapping at address %p/%pS\n", - (void *)st->start_address, - (void *)st->start_address); - st->wx_pages += (st->current_address - - st->start_address) / PAGE_SIZE; - } + if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) + note_wx(st); /* * Now print the actual finished series -- cgit v1.2.3 From 162041425193602b15774c61740ad8e7dc157df3 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 19 Oct 2018 07:08:42 -0700 Subject: x86/mm: Kill stray kernel fault handling comment I originally had matching user and kernel comments, but the kernel one got improved. Some errant conflict resolution kicked the commment somewhere wrong. Kill it. Reported-by: Eric W. Biederman Signed-off-by: Dave Hansen Cc: Andy Lutomirski Cc: Jann Horn Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sean Christopherson Cc: Thomas Gleixner Fixes: aa37c51b94 ("x86/mm: Break out user address space handling") Link: http://lkml.kernel.org/r/20181019140842.12F929FA@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a16652982f98..bd047438d23d 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -963,7 +963,6 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code, __bad_area(regs, error_code, address, vma, SEGV_ACCERR); } -/* Handle faults in the kernel portion of the address space */ static void do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, u32 *pkey, unsigned int fault) -- cgit v1.2.3 From 977e4be5eb714c48a67afc26a6c477f24130a1f2 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 20 Oct 2018 09:26:49 +0200 Subject: x86/stackprotector: Remove the call to boot_init_stack_canary() from cpu_startup_entry() The following commit: d7880812b359 ("idle: Add the stack canary init to cpu_startup_entry()") ... added an x86 specific boot_init_stack_canary() call to the generic cpu_startup_entry() as a temporary hack, with the intention to remove the #ifdef CONFIG_X86 later. More than 5 years later let's finally realize that plan! :-) While implementing stack protector support for PowerPC, we found that calling boot_init_stack_canary() is also needed for PowerPC which uses per task (TLS) stack canary like the X86. However, calling boot_init_stack_canary() would break architectures using a global stack canary (ARM, SH, MIPS and XTENSA). Instead of modifying the #ifdef CONFIG_X86 to an even messier: #if defined(CONFIG_X86) || defined(CONFIG_PPC) PowerPC implemented the call to boot_init_stack_canary() in the function calling cpu_startup_entry(). Let's try the same cleanup on the x86 side as well. On x86 we have two functions calling cpu_startup_entry(): - start_secondary() - cpu_bringup_and_idle() start_secondary() already calls boot_init_stack_canary(), so it's good, and this patch adds the call to boot_init_stack_canary() in cpu_bringup_and_idle(). I.e. now x86 catches up to the rest of the world and the ugly init sequence in init/main.c can be removed from cpu_startup_entry(). As a final benefit we can also remove the dependency from . [ mingo: Improved the changelog a bit, added language explaining x86 borkage and sched.h change. ] Signed-off-by: Christophe Leroy Reviewed-by: Juergen Gross Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linuxppc-dev@lists.ozlabs.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/20181020072649.5B59310483E@pc16082vm.idsi0.si.c-s.fr Signed-off-by: Ingo Molnar --- arch/x86/xen/smp_pv.c | 2 ++ kernel/sched/idle.c | 15 --------------- kernel/sched/sched.h | 1 - 3 files changed, 2 insertions(+), 16 deletions(-) diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index e3b18ad49889..145506f9fdbe 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -88,6 +89,7 @@ static void cpu_bringup(void) asmlinkage __visible void cpu_bringup_and_idle(void) { cpu_bringup(); + boot_init_stack_canary(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 16f84142f2f4..f5516bae0c1b 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -347,21 +347,6 @@ EXPORT_SYMBOL_GPL(play_idle); void cpu_startup_entry(enum cpuhp_state state) { - /* - * This #ifdef needs to die, but it's too late in the cycle to - * make this generic (ARM and SH have never invoked the canary - * init for the non boot CPUs!). Will be fixed in 3.11 - */ -#ifdef CONFIG_X86 - /* - * If we're the non-boot CPU, nothing set the stack canary up - * for us. The boot CPU already has it initialized but no harm - * in doing it again. This is a good place for updating it, as - * we wont ever return from this function (so the invalid - * canaries already on the stack wont ever trigger). - */ - boot_init_stack_canary(); -#endif arch_cpu_idle_prepare(); cpuhp_online_idle(state); while (1) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4a2e8cae63c4..5b00a816a4b3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -56,7 +56,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3