diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-02-23 17:50:35 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-02-23 17:50:35 -0800 |
commit | 5ce1a70e2f00f0bce0cab57f798ca354b9496169 (patch) | |
tree | 6e80200536b7a3576fd71ff2c7135ffe87dc858e /arch | |
parent | 9d3cae26acb471d5954cfdc25d1438b32060babe (diff) | |
parent | ef53d16cded7f89b3843b7a96970dab897843ea5 (diff) | |
download | lwn-5ce1a70e2f00f0bce0cab57f798ca354b9496169.tar.gz lwn-5ce1a70e2f00f0bce0cab57f798ca354b9496169.zip |
Merge branch 'akpm' (more incoming from Andrew)
Merge second patch-bomb from Andrew Morton:
- A little DM fix
- the MM queue
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (154 commits)
ksm: allocate roots when needed
mm: cleanup "swapcache" in do_swap_page
mm,ksm: swapoff might need to copy
mm,ksm: FOLL_MIGRATION do migration_entry_wait
ksm: shrink 32-bit rmap_item back to 32 bytes
ksm: treat unstable nid like in stable tree
ksm: add some comments
tmpfs: fix mempolicy object leaks
tmpfs: fix use-after-free of mempolicy object
mm/fadvise.c: drain all pagevecs if POSIX_FADV_DONTNEED fails to discard all pages
mm: export mmu notifier invalidates
mm: accelerate mm_populate() treatment of THP pages
mm: use long type for page counts in mm_populate() and get_user_pages()
mm: accurately document nr_free_*_pages functions with code comments
HWPOISON: change order of error_states[]'s elements
HWPOISON: fix misjudgement of page_action() for errors on mlocked pages
memcg: stop warning on memcg_propagate_kmem
net: change type of virtio_chan->p9_max_pages
vmscan: change type of vm_total_pages to unsigned long
fs/nfsd: change type of max_delegations, nfsd_drc_max_mem and nfsd_drc_mem_used
...
Diffstat (limited to 'arch')
-rw-r--r-- | arch/arm64/mm/mmu.c | 3 | ||||
-rw-r--r-- | arch/ia64/mm/contig.c | 2 | ||||
-rw-r--r-- | arch/ia64/mm/discontig.c | 6 | ||||
-rw-r--r-- | arch/ia64/mm/init.c | 18 | ||||
-rw-r--r-- | arch/powerpc/mm/init_64.c | 5 | ||||
-rw-r--r-- | arch/powerpc/mm/mem.c | 12 | ||||
-rw-r--r-- | arch/s390/mm/init.c | 12 | ||||
-rw-r--r-- | arch/s390/mm/vmem.c | 4 | ||||
-rw-r--r-- | arch/sh/mm/init.c | 17 | ||||
-rw-r--r-- | arch/sparc/mm/init_32.c | 2 | ||||
-rw-r--r-- | arch/sparc/mm/init_64.c | 5 | ||||
-rw-r--r-- | arch/tile/mm/elf.c | 1 | ||||
-rw-r--r-- | arch/tile/mm/init.c | 8 | ||||
-rw-r--r-- | arch/tile/mm/pgtable.c | 2 | ||||
-rw-r--r-- | arch/x86/include/asm/numa.h | 4 | ||||
-rw-r--r-- | arch/x86/include/asm/pgtable_types.h | 1 | ||||
-rw-r--r-- | arch/x86/kernel/acpi/boot.c | 4 | ||||
-rw-r--r-- | arch/x86/kernel/setup.c | 13 | ||||
-rw-r--r-- | arch/x86/mm/init_32.c | 12 | ||||
-rw-r--r-- | arch/x86/mm/init_64.c | 397 | ||||
-rw-r--r-- | arch/x86/mm/numa.c | 17 | ||||
-rw-r--r-- | arch/x86/mm/pageattr.c | 47 | ||||
-rw-r--r-- | arch/x86/mm/srat.c | 125 |
23 files changed, 673 insertions, 44 deletions
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index f4dd585898c5..224b44ab534e 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -434,4 +434,7 @@ int __meminit vmemmap_populate(struct page *start_page, return 0; } #endif /* CONFIG_ARM64_64K_PAGES */ +void vmemmap_free(struct page *memmap, unsigned long nr_pages) +{ +} #endif /* CONFIG_SPARSEMEM_VMEMMAP */ diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c index 1516d1dc11fd..80dab509dfb0 100644 --- a/arch/ia64/mm/contig.c +++ b/arch/ia64/mm/contig.c @@ -93,7 +93,7 @@ void show_mem(unsigned int filter) printk(KERN_INFO "%d pages swap cached\n", total_cached); printk(KERN_INFO "Total of %ld pages in page table cache\n", quicklist_total_size()); - printk(KERN_INFO "%d free buffer pages\n", nr_free_buffer_pages()); + printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages()); } diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index c641333cd997..c2e955ee79a8 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -666,7 +666,7 @@ void show_mem(unsigned int filter) printk(KERN_INFO "%d pages swap cached\n", total_cached); printk(KERN_INFO "Total of %ld pages in page table cache\n", quicklist_total_size()); - printk(KERN_INFO "%d free buffer pages\n", nr_free_buffer_pages()); + printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages()); } /** @@ -822,4 +822,8 @@ int __meminit vmemmap_populate(struct page *start_page, { return vmemmap_populate_basepages(start_page, size, node); } + +void vmemmap_free(struct page *memmap, unsigned long nr_pages) +{ +} #endif diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index b755ea92aea7..20bc967c7209 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -688,6 +688,24 @@ int arch_add_memory(int nid, u64 start, u64 size) return ret; } + +#ifdef CONFIG_MEMORY_HOTREMOVE +int arch_remove_memory(u64 start, u64 size) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + struct zone *zone; + int ret; + + zone = page_zone(pfn_to_page(start_pfn)); + ret = __remove_pages(zone, start_pfn, nr_pages); + if (ret) + pr_warn("%s: Problem encountered in __remove_pages() as" + " ret=%d\n", __func__, ret); + + return ret; +} +#endif #endif /* diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 95a45293e5ac..7e2246fb2f31 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -297,5 +297,10 @@ int __meminit vmemmap_populate(struct page *start_page, return 0; } + +void vmemmap_free(struct page *memmap, unsigned long nr_pages) +{ +} + #endif /* CONFIG_SPARSEMEM_VMEMMAP */ diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 40df7c8f2096..f1f7409a4183 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -133,6 +133,18 @@ int arch_add_memory(int nid, u64 start, u64 size) return __add_pages(nid, zone, start_pfn, nr_pages); } + +#ifdef CONFIG_MEMORY_HOTREMOVE +int arch_remove_memory(u64 start, u64 size) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + struct zone *zone; + + zone = page_zone(pfn_to_page(start_pfn)); + return __remove_pages(zone, start_pfn, nr_pages); +} +#endif #endif /* CONFIG_MEMORY_HOTPLUG */ /* diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index ae672f41c464..49ce6bb2c641 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -228,4 +228,16 @@ int arch_add_memory(int nid, u64 start, u64 size) vmem_remove_mapping(start, size); return rc; } + +#ifdef CONFIG_MEMORY_HOTREMOVE +int arch_remove_memory(u64 start, u64 size) +{ + /* + * There is no hardware or firmware interface which could trigger a + * hot memory remove on s390. So there is nothing that needs to be + * implemented. + */ + return -EBUSY; +} +#endif #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 79699f46a443..e21aaf4f5cb6 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -268,6 +268,10 @@ out: return ret; } +void vmemmap_free(struct page *memmap, unsigned long nr_pages) +{ +} + /* * Add memory segment to the segment list if it doesn't overlap with * an already present segment. diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 82cc576fab15..105794037143 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -558,4 +558,21 @@ int memory_add_physaddr_to_nid(u64 addr) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif +#ifdef CONFIG_MEMORY_HOTREMOVE +int arch_remove_memory(u64 start, u64 size) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + struct zone *zone; + int ret; + + zone = page_zone(pfn_to_page(start_pfn)); + ret = __remove_pages(zone, start_pfn, nr_pages); + if (unlikely(ret)) + pr_warn("%s: Failed, __remove_pages() == %d\n", __func__, + ret); + + return ret; +} +#endif #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index dde85ef1c56d..48e0c030e8f5 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -57,7 +57,7 @@ void show_mem(unsigned int filter) printk("Mem-info:\n"); show_free_areas(filter); printk("Free swap: %6ldkB\n", - nr_swap_pages << (PAGE_SHIFT-10)); + get_nr_swap_pages() << (PAGE_SHIFT-10)); printk("%ld pages of RAM\n", totalram_pages); printk("%ld free pages\n", nr_free_pages()); } diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 5c2c6e61facb..1588d33d5492 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2235,6 +2235,11 @@ void __meminit vmemmap_populate_print_last(void) node_start = 0; } } + +void vmemmap_free(struct page *memmap, unsigned long nr_pages) +{ +} + #endif /* CONFIG_SPARSEMEM_VMEMMAP */ static void prot_init_common(unsigned long page_none, diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c index 3cfa98bf9125..743c951c61b0 100644 --- a/arch/tile/mm/elf.c +++ b/arch/tile/mm/elf.c @@ -130,7 +130,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, if (!retval) { unsigned long addr = MEM_USER_INTRPT; addr = mmap_region(NULL, addr, INTRPT_SIZE, - MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0); if (addr > (unsigned long) -PAGE_SIZE) diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c index ef29d6c5e10e..2749515a0547 100644 --- a/arch/tile/mm/init.c +++ b/arch/tile/mm/init.c @@ -935,6 +935,14 @@ int remove_memory(u64 start, u64 size) { return -EINVAL; } + +#ifdef CONFIG_MEMORY_HOTREMOVE +int arch_remove_memory(u64 start, u64 size) +{ + /* TODO */ + return -EBUSY; +} +#endif #endif struct kmem_cache *pgd_cache; diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c index de0de0c0e8a1..b3b4972c2451 100644 --- a/arch/tile/mm/pgtable.c +++ b/arch/tile/mm/pgtable.c @@ -61,7 +61,7 @@ void show_mem(unsigned int filter) global_page_state(NR_PAGETABLE), global_page_state(NR_BOUNCE), global_page_state(NR_FILE_PAGES), - nr_swap_pages); + get_nr_swap_pages()); for_each_zone(zone) { unsigned long flags, order, total = 0, largest_order = -1; diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 52560a2038e1..1b99ee5c9f00 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -57,8 +57,8 @@ static inline int numa_cpu_node(int cpu) #endif #ifdef CONFIG_NUMA -extern void __cpuinit numa_set_node(int cpu, int node); -extern void __cpuinit numa_clear_node(int cpu); +extern void numa_set_node(int cpu, int node); +extern void numa_clear_node(int cpu); extern void __init init_cpu_to_node(void); extern void __cpuinit numa_add_cpu(int cpu); extern void __cpuinit numa_remove_cpu(int cpu); diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index e6423002c10b..567b5d0632b2 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -351,6 +351,7 @@ static inline void update_page_count(int level, unsigned long pages) { } * as a pte too. */ extern pte_t *lookup_address(unsigned long address, unsigned int *level); +extern int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase); extern phys_addr_t slow_virt_to_phys(void *__address); #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index cfc755dc1607..230c8ea878e5 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -696,6 +696,10 @@ EXPORT_SYMBOL(acpi_map_lsapic); int acpi_unmap_lsapic(int cpu) { +#ifdef CONFIG_ACPI_NUMA + set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE); +#endif + per_cpu(x86_cpu_to_apicid, cpu) = -1; set_cpu_present(cpu, false); num_processors--; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 915f5efefcf5..9c857f05cef0 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1056,6 +1056,15 @@ void __init setup_arch(char **cmdline_p) setup_bios_corruption_check(); #endif + /* + * In the memory hotplug case, the kernel needs info from SRAT to + * determine which memory is hotpluggable before allocating memory + * using memblock. + */ + acpi_boot_table_init(); + early_acpi_boot_init(); + early_parse_srat(); + #ifdef CONFIG_X86_32 printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n", (max_pfn_mapped<<PAGE_SHIFT) - 1); @@ -1101,10 +1110,6 @@ void __init setup_arch(char **cmdline_p) /* * Parse the ACPI tables for possible boot-time SMP configuration. */ - acpi_boot_table_init(); - - early_acpi_boot_init(); - initmem_init(); memblock_find_dma_reserve(); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index b299724f6e34..2d19001151d5 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -862,6 +862,18 @@ int arch_add_memory(int nid, u64 start, u64 size) return __add_pages(nid, zone, start_pfn, nr_pages); } + +#ifdef CONFIG_MEMORY_HOTREMOVE +int arch_remove_memory(u64 start, u64 size) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + struct zone *zone; + + zone = page_zone(pfn_to_page(start_pfn)); + return __remove_pages(zone, start_pfn, nr_pages); +} +#endif #endif /* diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3eba7f429880..474e28f10815 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -707,6 +707,343 @@ int arch_add_memory(int nid, u64 start, u64 size) } EXPORT_SYMBOL_GPL(arch_add_memory); +#define PAGE_INUSE 0xFD + +static void __meminit free_pagetable(struct page *page, int order) +{ + struct zone *zone; + bool bootmem = false; + unsigned long magic; + unsigned int nr_pages = 1 << order; + + /* bootmem page has reserved flag */ + if (PageReserved(page)) { + __ClearPageReserved(page); + bootmem = true; + + magic = (unsigned long)page->lru.next; + if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { + while (nr_pages--) + put_page_bootmem(page++); + } else + __free_pages_bootmem(page, order); + } else + free_pages((unsigned long)page_address(page), order); + + /* + * SECTION_INFO pages and MIX_SECTION_INFO pages + * are all allocated by bootmem. + */ + if (bootmem) { + zone = page_zone(page); + zone_span_writelock(zone); + zone->present_pages += nr_pages; + zone_span_writeunlock(zone); + totalram_pages += nr_pages; + } +} + +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) +{ + pte_t *pte; + int i; + + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = pte_start + i; + if (pte_val(*pte)) + return; + } + + /* free a pte talbe */ + free_pagetable(pmd_page(*pmd), 0); + spin_lock(&init_mm.page_table_lock); + pmd_clear(pmd); + spin_unlock(&init_mm.page_table_lock); +} + +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) +{ + pmd_t *pmd; + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd = pmd_start + i; + if (pmd_val(*pmd)) + return; + } + + /* free a pmd talbe */ + free_pagetable(pud_page(*pud), 0); + spin_lock(&init_mm.page_table_lock); + pud_clear(pud); + spin_unlock(&init_mm.page_table_lock); +} + +/* Return true if pgd is changed, otherwise return false. */ +static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd) +{ + pud_t *pud; + int i; + + for (i = 0; i < PTRS_PER_PUD; i++) { + pud = pud_start + i; + if (pud_val(*pud)) + return false; + } + + /* free a pud table */ + free_pagetable(pgd_page(*pgd), 0); + spin_lock(&init_mm.page_table_lock); + pgd_clear(pgd); + spin_unlock(&init_mm.page_table_lock); + + return true; +} + +static void __meminit +remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, + bool direct) +{ + unsigned long next, pages = 0; + pte_t *pte; + void *page_addr; + phys_addr_t phys_addr; + + pte = pte_start + pte_index(addr); + for (; addr < end; addr = next, pte++) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + if (next > end) + next = end; + + if (!pte_present(*pte)) + continue; + + /* + * We mapped [0,1G) memory as identity mapping when + * initializing, in arch/x86/kernel/head_64.S. These + * pagetables cannot be removed. + */ + phys_addr = pte_val(*pte) + (addr & PAGE_MASK); + if (phys_addr < (phys_addr_t)0x40000000) + return; + + if (IS_ALIGNED(addr, PAGE_SIZE) && + IS_ALIGNED(next, PAGE_SIZE)) { + /* + * Do not free direct mapping pages since they were + * freed when offlining, or simplely not in use. + */ + if (!direct) + free_pagetable(pte_page(*pte), 0); + + spin_lock(&init_mm.page_table_lock); + pte_clear(&init_mm, addr, pte); + spin_unlock(&init_mm.page_table_lock); + + /* For non-direct mapping, pages means nothing. */ + pages++; + } else { + /* + * If we are here, we are freeing vmemmap pages since + * direct mapped memory ranges to be freed are aligned. + * + * If we are not removing the whole page, it means + * other page structs in this page are being used and + * we canot remove them. So fill the unused page_structs + * with 0xFD, and remove the page when it is wholly + * filled with 0xFD. + */ + memset((void *)addr, PAGE_INUSE, next - addr); + + page_addr = page_address(pte_page(*pte)); + if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) { + free_pagetable(pte_page(*pte), 0); + + spin_lock(&init_mm.page_table_lock); + pte_clear(&init_mm, addr, pte); + spin_unlock(&init_mm.page_table_lock); + } + } + } + + /* Call free_pte_table() in remove_pmd_table(). */ + flush_tlb_all(); + if (direct) + update_page_count(PG_LEVEL_4K, -pages); +} + +static void __meminit +remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end, + bool direct) +{ + unsigned long next, pages = 0; + pte_t *pte_base; + pmd_t *pmd; + void *page_addr; + + pmd = pmd_start + pmd_index(addr); + for (; addr < end; addr = next, pmd++) { + next = pmd_addr_end(addr, end); + + if (!pmd_present(*pmd)) + continue; + + if (pmd_large(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) { + if (!direct) + free_pagetable(pmd_page(*pmd), + get_order(PMD_SIZE)); + + spin_lock(&init_mm.page_table_lock); + pmd_clear(pmd); + spin_unlock(&init_mm.page_table_lock); + pages++; + } else { + /* If here, we are freeing vmemmap pages. */ + memset((void *)addr, PAGE_INUSE, next - addr); + + page_addr = page_address(pmd_page(*pmd)); + if (!memchr_inv(page_addr, PAGE_INUSE, + PMD_SIZE)) { + free_pagetable(pmd_page(*pmd), + get_order(PMD_SIZE)); + + spin_lock(&init_mm.page_table_lock); + pmd_clear(pmd); + spin_unlock(&init_mm.page_table_lock); + } + } + + continue; + } + + pte_base = (pte_t *)pmd_page_vaddr(*pmd); + remove_pte_table(pte_base, addr, next, direct); + free_pte_table(pte_base, pmd); + } + + /* Call free_pmd_table() in remove_pud_table(). */ + if (direct) + update_page_count(PG_LEVEL_2M, -pages); +} + +static void __meminit +remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, + bool direct) +{ + unsigned long next, pages = 0; + pmd_t *pmd_base; + pud_t *pud; + void *page_addr; + + pud = pud_start + pud_index(addr); + for (; addr < end; addr = next, pud++) { + next = pud_addr_end(addr, end); + + if (!pud_present(*pud)) + continue; + + if (pud_large(*pud)) { + if (IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE)) { + if (!direct) + free_pagetable(pud_page(*pud), + get_order(PUD_SIZE)); + + spin_lock(&init_mm.page_table_lock); + pud_clear(pud); + spin_unlock(&init_mm.page_table_lock); + pages++; + } else { + /* If here, we are freeing vmemmap pages. */ + memset((void *)addr, PAGE_INUSE, next - addr); + + page_addr = page_address(pud_page(*pud)); + if (!memchr_inv(page_addr, PAGE_INUSE, + PUD_SIZE)) { + free_pagetable(pud_page(*pud), + get_order(PUD_SIZE)); + + spin_lock(&init_mm.page_table_lock); + pud_clear(pud); + spin_unlock(&init_mm.page_table_lock); + } + } + + continue; + } + + pmd_base = (pmd_t *)pud_page_vaddr(*pud); + remove_pmd_table(pmd_base, addr, next, direct); + free_pmd_table(pmd_base, pud); + } + + if (direct) + update_page_count(PG_LEVEL_1G, -pages); +} + +/* start and end are both virtual address. */ +static void __meminit +remove_pagetable(unsigned long start, unsigned long end, bool direct) +{ + unsigned long next; + pgd_t *pgd; + pud_t *pud; + bool pgd_changed = false; + + for (; start < end; start = next) { + next = pgd_addr_end(start, end); + + pgd = pgd_offset_k(start); + if (!pgd_present(*pgd)) + continue; + + pud = (pud_t *)pgd_page_vaddr(*pgd); + remove_pud_table(pud, start, next, direct); + if (free_pud_table(pud, pgd)) + pgd_changed = true; + } + + if (pgd_changed) + sync_global_pgds(start, end - 1); + + flush_tlb_all(); +} + +void __ref vmemmap_free(struct page *memmap, unsigned long nr_pages) +{ + unsigned long start = (unsigned long)memmap; + unsigned long end = (unsigned long)(memmap + nr_pages); + + remove_pagetable(start, end, false); +} + +static void __meminit +kernel_physical_mapping_remove(unsigned long start, unsigned long end) +{ + start = (unsigned long)__va(start); + end = (unsigned long)__va(end); + + remove_pagetable(start, end, true); +} + +#ifdef CONFIG_MEMORY_HOTREMOVE +int __ref arch_remove_memory(u64 start, u64 size) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + struct zone *zone; + int ret; + + zone = page_zone(pfn_to_page(start_pfn)); + kernel_physical_mapping_remove(start, start + size); + ret = __remove_pages(zone, start_pfn, nr_pages); + WARN_ON_ONCE(ret); + + return ret; +} +#endif #endif /* CONFIG_MEMORY_HOTPLUG */ static struct kcore_list kcore_vsyscall; @@ -1019,6 +1356,66 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) return 0; } +#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE) +void register_page_bootmem_memmap(unsigned long section_nr, + struct page *start_page, unsigned long size) +{ + unsigned long addr = (unsigned long)start_page; + unsigned long end = (unsigned long)(start_page + size); + unsigned long next; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + unsigned int nr_pages; + struct page *page; + + for (; addr < end; addr = next) { + pte_t *pte = NULL; + + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + continue; + } + get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO); + + pud = pud_offset(pgd, addr); + if (pud_none(*pud)) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + continue; + } + get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO); + + if (!cpu_has_pse) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + continue; + get_page_bootmem(section_nr, pmd_page(*pmd), + MIX_SECTION_INFO); + + pte = pte_offset_kernel(pmd, addr); + if (pte_none(*pte)) + continue; + get_page_bootmem(section_nr, pte_page(*pte), + SECTION_INFO); + } else { + next = pmd_addr_end(addr, end); + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + continue; + + nr_pages = 1 << (get_order(PMD_SIZE)); + page = pmd_page(*pmd); + while (nr_pages--) + get_page_bootmem(section_nr, page++, + SECTION_INFO); + } + } +} +#endif + void __meminit vmemmap_populate_print_last(void) { if (p_start) { diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 8504f3698753..dfd30259eb89 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -56,7 +56,7 @@ early_param("numa", numa_setup); /* * apicid, cpu, node mappings */ -s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { +s16 __apicid_to_node[MAX_LOCAL_APIC] = { [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; @@ -78,7 +78,7 @@ EXPORT_SYMBOL(node_to_cpumask_map); DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); -void __cpuinit numa_set_node(int cpu, int node) +void numa_set_node(int cpu, int node) { int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); @@ -101,7 +101,7 @@ void __cpuinit numa_set_node(int cpu, int node) set_cpu_numa_node(cpu, node); } -void __cpuinit numa_clear_node(int cpu) +void numa_clear_node(int cpu) { numa_set_node(cpu, NUMA_NO_NODE); } @@ -213,10 +213,9 @@ static void __init setup_node_data(int nid, u64 start, u64 end) * Allocate node data. Try node-local memory and then any node. * Never allocate in DMA zone. */ - nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); + nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid); if (!nd_pa) { - pr_err("Cannot find %zu bytes in node %d\n", - nd_size, nid); + pr_err("Cannot find %zu bytes in any node\n", nd_size); return; } nd = __va(nd_pa); @@ -561,10 +560,12 @@ static int __init numa_init(int (*init_func)(void)) for (i = 0; i < MAX_LOCAL_APIC; i++) set_apicid_to_node(i, NUMA_NO_NODE); - nodes_clear(numa_nodes_parsed); + /* + * Do not clear numa_nodes_parsed or zero numa_meminfo here, because + * SRAT was parsed earlier in early_parse_srat(). + */ nodes_clear(node_possible_map); nodes_clear(node_online_map); - memset(&numa_meminfo, 0, sizeof(numa_meminfo)); WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES)); numa_reset_distance(); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a1b1c88f9caf..ca1f1c2bb7be 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -529,21 +529,13 @@ out_unlock: return do_split; } -static int split_large_page(pte_t *kpte, unsigned long address) +int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase) { unsigned long pfn, pfninc = 1; unsigned int i, level; - pte_t *pbase, *tmp; + pte_t *tmp; pgprot_t ref_prot; - struct page *base; - - if (!debug_pagealloc) - spin_unlock(&cpa_lock); - base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); - if (!debug_pagealloc) - spin_lock(&cpa_lock); - if (!base) - return -ENOMEM; + struct page *base = virt_to_page(pbase); spin_lock(&pgd_lock); /* @@ -551,10 +543,11 @@ static int split_large_page(pte_t *kpte, unsigned long address) * up for us already: */ tmp = lookup_address(address, &level); - if (tmp != kpte) - goto out_unlock; + if (tmp != kpte) { + spin_unlock(&pgd_lock); + return 1; + } - pbase = (pte_t *)page_address(base); paravirt_alloc_pte(&init_mm, page_to_pfn(base)); ref_prot = pte_pgprot(pte_clrhuge(*kpte)); /* @@ -601,17 +594,27 @@ static int split_large_page(pte_t *kpte, unsigned long address) * going on. */ __flush_tlb_all(); + spin_unlock(&pgd_lock); - base = NULL; + return 0; +} -out_unlock: - /* - * If we dropped out via the lookup_address check under - * pgd_lock then stick the page back into the pool: - */ - if (base) +static int split_large_page(pte_t *kpte, unsigned long address) +{ + pte_t *pbase; + struct page *base; + + if (!debug_pagealloc) + spin_unlock(&cpa_lock); + base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); + if (!debug_pagealloc) + spin_lock(&cpa_lock); + if (!base) + return -ENOMEM; + + pbase = (pte_t *)page_address(base); + if (__split_large_page(kpte, address, pbase)) __free_page(base); - spin_unlock(&pgd_lock); return 0; } diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index cdd0da9dd530..79836d01f789 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -141,11 +141,126 @@ static inline int save_add_info(void) {return 1;} static inline int save_add_info(void) {return 0;} #endif +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +static void __init +handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable) +{ + int overlap, i; + unsigned long start_pfn, end_pfn; + + start_pfn = PFN_DOWN(start); + end_pfn = PFN_UP(end); + + /* + * For movablemem_map=acpi: + * + * SRAT: |_____| |_____| |_________| |_________| ...... + * node id: 0 1 1 2 + * hotpluggable: n y y n + * movablemem_map: |_____| |_________| + * + * Using movablemem_map, we can prevent memblock from allocating memory + * on ZONE_MOVABLE at boot time. + * + * Before parsing SRAT, memblock has already reserve some memory ranges + * for other purposes, such as for kernel image. We cannot prevent + * kernel from using these memory, so we need to exclude these memory + * even if it is hotpluggable. + * Furthermore, to ensure the kernel has enough memory to boot, we make + * all the memory on the node which the kernel resides in + * un-hotpluggable. + */ + if (hotpluggable && movablemem_map.acpi) { + /* Exclude ranges reserved by memblock. */ + struct memblock_type *rgn = &memblock.reserved; + + for (i = 0; i < rgn->cnt; i++) { + if (end <= rgn->regions[i].base || + start >= rgn->regions[i].base + + rgn->regions[i].size) + continue; + + /* + * If the memory range overlaps the memory reserved by + * memblock, then the kernel resides in this node. + */ + node_set(node, movablemem_map.numa_nodes_kernel); + + goto out; + } + + /* + * If the kernel resides in this node, then the whole node + * should not be hotpluggable. + */ + if (node_isset(node, movablemem_map.numa_nodes_kernel)) + goto out; + + insert_movablemem_map(start_pfn, end_pfn); + + /* + * numa_nodes_hotplug nodemask represents which nodes are put + * into movablemem_map.map[]. + */ + node_set(node, movablemem_map.numa_nodes_hotplug); + goto out; + } + + /* + * For movablemem_map=nn[KMG]@ss[KMG]: + * + * SRAT: |_____| |_____| |_________| |_________| ...... + * node id: 0 1 1 2 + * user specified: |__| |___| + * movablemem_map: |___| |_________| |______| ...... + * + * Using movablemem_map, we can prevent memblock from allocating memory + * on ZONE_MOVABLE at boot time. + * + * NOTE: In this case, SRAT info will be ingored. + */ + overlap = movablemem_map_overlap(start_pfn, end_pfn); + if (overlap >= 0) { + /* + * If part of this range is in movablemem_map, we need to + * add the range after it to extend the range to the end + * of the node, because from the min address specified to + * the end of the node will be ZONE_MOVABLE. + */ + start_pfn = max(start_pfn, + movablemem_map.map[overlap].start_pfn); + insert_movablemem_map(start_pfn, end_pfn); + + /* + * Set the nodemask, so that if the address range on one node + * is not continuse, we can add the subsequent ranges on the + * same node into movablemem_map. + */ + node_set(node, movablemem_map.numa_nodes_hotplug); + } else { + if (node_isset(node, movablemem_map.numa_nodes_hotplug)) + /* + * Insert the range if we already have movable ranges + * on the same node. + */ + insert_movablemem_map(start_pfn, end_pfn); + } +out: + return; +} +#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +static inline void +handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable) +{ +} +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ + /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ int __init acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) { u64 start, end; + u32 hotpluggable; int node, pxm; if (srat_disabled()) @@ -154,7 +269,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) goto out_err_bad_srat; if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) goto out_err; - if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) + hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE; + if (hotpluggable && !save_add_info()) goto out_err; start = ma->base_address; @@ -174,9 +290,12 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) node_set(node, numa_nodes_parsed); - printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n", + printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx] %s\n", node, pxm, - (unsigned long long) start, (unsigned long long) end - 1); + (unsigned long long) start, (unsigned long long) end - 1, + hotpluggable ? "Hot Pluggable": ""); + + handle_movablemem(node, start, end, hotpluggable); return 0; out_err_bad_srat: |