From 872fec16d9a0ed3b75b8893aa217e49cca575ee5 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:21 -0700 Subject: [PATCH] mm: init_mm without ptlock First step in pushing down the page_table_lock. init_mm.page_table_lock has been used throughout the architectures (usually for ioremap): not to serialize kernel address space allocation (that's usually vmlist_lock), but because pud_alloc,pmd_alloc,pte_alloc_kernel expect caller holds it. Reverse that: don't lock or unlock init_mm.page_table_lock in any of the architectures; instead rely on pud_alloc,pmd_alloc,pte_alloc_kernel to take and drop it when allocating a new one, to check lest a racing task already did. Similarly no page_table_lock in vmalloc's map_vm_area. Some temporary ugliness in __pud_alloc and __pmd_alloc: since they also handle user mms, which are converted only by a later patch, for now they have to lock differently according to whether or not it's init_mm. If sources get muddled, there's a danger that an arch source taking init_mm.page_table_lock will be mixed with common source also taking it (or neither take it). So break the rules and make another change, which should break the build for such a mismatch: remove the redundant mm arg from pte_alloc_kernel (ppc64 scrapped its distinct ioremap_mm in 2.6.13). Exceptions: arm26 used pte_alloc_kernel on user mm, now pte_alloc_map; ia64 used pte_alloc_map on init_mm, now pte_alloc_kernel; parisc had bad args to pmd_alloc and pte_alloc_kernel in unused USE_HPPA_IOREMAP code; ppc64 map_io_page forgot to unlock on failure; ppc mmu_mapin_ram and ppc64 im_free took page_table_lock for no good reason. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ppc64/mm/imalloc.c | 5 ----- arch/ppc64/mm/init.c | 4 +--- 2 files changed, 1 insertion(+), 8 deletions(-) (limited to 'arch/ppc64/mm') diff --git a/arch/ppc64/mm/imalloc.c b/arch/ppc64/mm/imalloc.c index c65b87b92756..f4ca29cf5364 100644 --- a/arch/ppc64/mm/imalloc.c +++ b/arch/ppc64/mm/imalloc.c @@ -300,12 +300,7 @@ void im_free(void * addr) for (p = &imlist ; (tmp = *p) ; p = &tmp->next) { if (tmp->addr == addr) { *p = tmp->next; - - /* XXX: do we need the lock? */ - spin_lock(&init_mm.page_table_lock); unmap_vm_area(tmp); - spin_unlock(&init_mm.page_table_lock); - kfree(tmp); up(&imlist_sem); return; diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c index be64b157afce..a45584b3440c 100644 --- a/arch/ppc64/mm/init.c +++ b/arch/ppc64/mm/init.c @@ -155,7 +155,6 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags) unsigned long vsid; if (mem_init_done) { - spin_lock(&init_mm.page_table_lock); pgdp = pgd_offset_k(ea); pudp = pud_alloc(&init_mm, pgdp, ea); if (!pudp) @@ -163,12 +162,11 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags) pmdp = pmd_alloc(&init_mm, pudp, ea); if (!pmdp) return -ENOMEM; - ptep = pte_alloc_kernel(&init_mm, pmdp, ea); + ptep = pte_alloc_kernel(pmdp, ea); if (!ptep) return -ENOMEM; set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, __pgprot(flags))); - spin_unlock(&init_mm.page_table_lock); } else { unsigned long va, vpn, hash, hpteg; -- cgit v1.2.3 From 208d54e5513c0c02d85af0990901354c74364d5c Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sat, 29 Oct 2005 18:16:52 -0700 Subject: [PATCH] memory hotplug locking: node_size_lock pgdat->node_size_lock is basically only neeeded in one place in the normal code: show_mem(), which is the arch-specific sysrq-m printing function. Strictly speaking, the architectures not doing memory hotplug do no need this locking in show_mem(). However, they are all included for completeness. This should also make any future consolidation of all of the implementations a little more straightforward. This lock is also held in the sparsemem code during a memory removal, as sections are invalidated. This is the place there pfn_valid() is made false for a memory area that's being removed. The lock is only required when doing pfn_valid() operations on memory which the user does not already have a reference on the page, such as in show_mem(). Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/mm/numa.c | 3 +++ arch/i386/mm/pgtable.c | 3 +++ arch/ia64/mm/discontig.c | 7 ++++++- arch/m32r/mm/init.c | 9 ++++++++- arch/parisc/mm/init.c | 3 +++ arch/ppc64/mm/init.c | 6 ++++++ include/linux/memory_hotplug.h | 34 ++++++++++++++++++++++++++++++++++ include/linux/mmzone.h | 12 ++++++++++++ mm/page_alloc.c | 1 + 9 files changed, 76 insertions(+), 2 deletions(-) create mode 100644 include/linux/memory_hotplug.h (limited to 'arch/ppc64/mm') diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c index c7481d59b6df..6d5251254f68 100644 --- a/arch/alpha/mm/numa.c +++ b/arch/alpha/mm/numa.c @@ -371,6 +371,8 @@ show_mem(void) show_free_areas(); printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_online_node(nid) { + unsigned long flags; + pgdat_resize_lock(NODE_DATA(nid), &flags); i = node_spanned_pages(nid); while (i-- > 0) { struct page *page = nid_page_nr(nid, i); @@ -384,6 +386,7 @@ show_mem(void) else shared += page_count(page) - 1; } + pgdat_resize_unlock(NODE_DATA(nid), &flags); } printk("%ld pages of RAM\n",total); printk("%ld free pages\n",free); diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index 39c099f15b5f..9db3242103be 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -31,11 +31,13 @@ void show_mem(void) pg_data_t *pgdat; unsigned long i; struct page_state ps; + unsigned long flags; printk(KERN_INFO "Mem-info:\n"); show_free_areas(); printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { + pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; ++i) { page = pgdat_page_nr(pgdat, i); total++; @@ -48,6 +50,7 @@ void show_mem(void) else if (page_count(page)) shared += page_count(page) - 1; } + pgdat_resize_unlock(pgdat, &flags); } printk(KERN_INFO "%d pages of RAM\n", total); printk(KERN_INFO "%d pages of HIGHMEM\n", highmem); diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index a3788fb84809..a88cdb7232f8 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -555,9 +555,13 @@ void show_mem(void) show_free_areas(); printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { - unsigned long present = pgdat->node_present_pages; + unsigned long present; + unsigned long flags; int shared = 0, cached = 0, reserved = 0; + printk("Node ID: %d\n", pgdat->node_id); + pgdat_resize_lock(pgdat, &flags); + present = pgdat->node_present_pages; for(i = 0; i < pgdat->node_spanned_pages; i++) { struct page *page; if (pfn_valid(pgdat->node_start_pfn + i)) @@ -571,6 +575,7 @@ void show_mem(void) else if (page_count(page)) shared += page_count(page)-1; } + pgdat_resize_unlock(pgdat, &flags); total_present += present; total_reserved += reserved; total_cached += cached; diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c index d9a40b1fe8ba..6facf15b04f3 100644 --- a/arch/m32r/mm/init.c +++ b/arch/m32r/mm/init.c @@ -48,6 +48,8 @@ void show_mem(void) show_free_areas(); printk("Free swap: %6ldkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { + unsigned long flags; + pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; ++i) { page = pgdat_page_nr(pgdat, i); total++; @@ -60,6 +62,7 @@ void show_mem(void) else if (page_count(page)) shared += page_count(page) - 1; } + pgdat_resize_unlock(pgdat, &flags); } printk("%d pages of RAM\n", total); printk("%d pages of HIGHMEM\n",highmem); @@ -150,10 +153,14 @@ int __init reservedpages_count(void) int reservedpages, nid, i; reservedpages = 0; - for_each_online_node(nid) + for_each_online_node(nid) { + unsigned long flags; + pgdat_resize_lock(NODE_DATA(nid), &flags); for (i = 0 ; i < MAX_LOW_PFN(nid) - START_PFN(nid) ; i++) if (PageReserved(nid_page_nr(nid, i))) reservedpages++; + pgdat_resize_unlock(NODE_DATA(nid), &flags); + } return reservedpages; } diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 2886ad70db48..29b998e430e6 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -505,7 +505,9 @@ void show_mem(void) for (j = node_start_pfn(i); j < node_end_pfn(i); j++) { struct page *p; + unsigned long flags; + pgdat_resize_lock(NODE_DATA(i), &flags); p = nid_page_nr(i, j) - node_start_pfn(i); total++; @@ -517,6 +519,7 @@ void show_mem(void) free++; else shared += page_count(p) - 1; + pgdat_resize_unlock(NODE_DATA(i), &flags); } } #endif diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c index a45584b3440c..975b26de34d6 100644 --- a/arch/ppc64/mm/init.c +++ b/arch/ppc64/mm/init.c @@ -104,6 +104,8 @@ void show_mem(void) show_free_areas(); printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { + unsigned long flags; + pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; i++) { page = pgdat_page_nr(pgdat, i); total++; @@ -114,6 +116,7 @@ void show_mem(void) else if (page_count(page)) shared += page_count(page) - 1; } + pgdat_resize_unlock(pgdat, &flags); } printk("%ld pages of RAM\n", total); printk("%ld reserved pages\n", reserved); @@ -647,11 +650,14 @@ void __init mem_init(void) #endif for_each_pgdat(pgdat) { + unsigned long flags; + pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; i++) { page = pgdat_page_nr(pgdat, i); if (PageReserved(page)) reservedpages++; } + pgdat_resize_unlock(pgdat, &flags); } codesize = (unsigned long)&_etext - (unsigned long)&_stext; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h new file mode 100644 index 000000000000..e8103be9d528 --- /dev/null +++ b/include/linux/memory_hotplug.h @@ -0,0 +1,34 @@ +#ifndef __LINUX_MEMORY_HOTPLUG_H +#define __LINUX_MEMORY_HOTPLUG_H + +#include +#include + +#ifdef CONFIG_MEMORY_HOTPLUG +/* + * pgdat resizing functions + */ +static inline +void pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags) +{ + spin_lock_irqsave(&pgdat->node_size_lock, *flags); +} +static inline +void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags) +{ + spin_lock_irqrestore(&pgdat->node_size_lock, *flags); +} +static inline +void pgdat_resize_init(struct pglist_data *pgdat) +{ + spin_lock_init(&pgdat->node_size_lock); +} +#else /* ! CONFIG_MEMORY_HOTPLUG */ +/* + * Stub functions for when hotplug is off + */ +static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f) {} +static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f) {} +static inline void pgdat_resize_init(struct pglist_data *pgdat) {} +#endif +#endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4674145bb63d..e050d68963a1 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -273,6 +273,16 @@ typedef struct pglist_data { struct page *node_mem_map; #endif struct bootmem_data *bdata; +#ifdef CONFIG_MEMORY_HOTPLUG + /* + * Must be held any time you expect node_start_pfn, node_present_pages + * or node_spanned_pages stay constant. Holding this will also + * guarantee that any pfn_valid() stays that way. + * + * Nests above zone->lock and zone->size_seqlock. + */ + spinlock_t node_size_lock; +#endif unsigned long node_start_pfn; unsigned long node_present_pages; /* total number of physical pages */ unsigned long node_spanned_pages; /* total size of physical page @@ -293,6 +303,8 @@ typedef struct pglist_data { #endif #define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) +#include + extern struct pglist_data *pgdat_list; void __get_zone_counts(unsigned long *active, unsigned long *inactive, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a51ef94eec33..32fad6d23200 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1958,6 +1958,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat, int nid = pgdat->node_id; unsigned long zone_start_pfn = pgdat->node_start_pfn; + pgdat_resize_init(pgdat); pgdat->nr_zones = 0; init_waitqueue_head(&pgdat->kswapd_wait); pgdat->kswapd_max_order = 0; -- cgit v1.2.3 From bb7e7e032d2cb8e0e9a88a2be209de5e61033b39 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sat, 29 Oct 2005 18:16:58 -0700 Subject: [PATCH] memory hotplug: ppc64 specific hot-add functions Here is a set of ppc64 specific patches that at least allow compilation/booting with the following configurations: FLATMEM SPARSEMEN SPARSEMEM + MEMORY_HOTPLUG Signed-off-by: Mike Kravetz Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ppc64/mm/init.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) (limited to 'arch/ppc64/mm') diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c index 975b26de34d6..e2bd7776622f 100644 --- a/arch/ppc64/mm/init.c +++ b/arch/ppc64/mm/init.c @@ -871,3 +871,80 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr, return vma_prot; } EXPORT_SYMBOL(phys_mem_access_prot); + +#ifdef CONFIG_MEMORY_HOTPLUG + +void online_page(struct page *page) +{ + ClearPageReserved(page); + free_cold_page(page); + totalram_pages++; + num_physpages++; +} + +/* + * This works only for the non-NUMA case. Later, we'll need a lookup + * to convert from real physical addresses to nid, that doesn't use + * pfn_to_nid(). + */ +int __devinit add_memory(u64 start, u64 size) +{ + struct pglist_data *pgdata = NODE_DATA(0); + struct zone *zone; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + /* this should work for most non-highmem platforms */ + zone = pgdata->node_zones; + + return __add_pages(zone, start_pfn, nr_pages); + + return 0; +} + +/* + * First pass at this code will check to determine if the remove + * request is within the RMO. Do not allow removal within the RMO. + */ +int __devinit remove_memory(u64 start, u64 size) +{ + struct zone *zone; + unsigned long start_pfn, end_pfn, nr_pages; + + start_pfn = start >> PAGE_SHIFT; + nr_pages = size >> PAGE_SHIFT; + end_pfn = start_pfn + nr_pages; + + printk("%s(): Attempting to remove memoy in range " + "%lx to %lx\n", __func__, start, start+size); + /* + * check for range within RMO + */ + zone = page_zone(pfn_to_page(start_pfn)); + + printk("%s(): memory will be removed from " + "the %s zone\n", __func__, zone->name); + + /* + * not handling removing memory ranges that + * overlap multiple zones yet + */ + if (end_pfn > (zone->zone_start_pfn + zone->spanned_pages)) + goto overlap; + + /* make sure it is NOT in RMO */ + if ((start < lmb.rmo_size) || ((start+size) < lmb.rmo_size)) { + printk("%s(): range to be removed must NOT be in RMO!\n", + __func__); + goto in_rmo; + } + + return __remove_pages(zone, start_pfn, nr_pages); + +overlap: + printk("%s(): memory range to be removed overlaps " + "multiple zones!!!\n", __func__); +in_rmo: + return -1; +} +#endif /* CONFIG_MEMORY_HOTPLUG */ -- cgit v1.2.3