From e8c27ac9191ab9e6506ae5cbe70d87ac50f8e960 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 1 Jun 2008 13:15:22 -0700 Subject: x86, numa, 32-bit: print out debug info on all kvas also fix the print out of node_remap_end_vaddr Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- mm/page_alloc.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 63835579323a..502223c3c2c6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3486,6 +3486,11 @@ void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat, calculate_node_totalpages(pgdat, zones_size, zholes_size); alloc_node_mem_map(pgdat); +#ifdef CONFIG_FLAT_NODE_MEM_MAP + printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", + nid, (unsigned long)pgdat, + (unsigned long)pgdat->node_mem_map); +#endif free_area_init_core(pgdat, zones_size, zholes_size); } -- cgit v1.2.3 From cc1a9d86ce989083703c4bdc11b75a87e1cc404a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 8 Jun 2008 19:39:16 -0700 Subject: mm, x86: shrink_active_range() should check all Now we are using register_e820_active_regions() instead of add_active_range() directly. So end_pfn could be different between the value in early_node_map to node_end_pfn. So we need to make shrink_active_range() smarter. shrink_active_range() is a generic MM function in mm/page_alloc.c but it is only used on 32-bit x86. Should we move it back to some file in arch/x86? Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 2 +- include/linux/mm.h | 3 +-- mm/page_alloc.c | 44 ++++++++++++++++++++++++++++++++++---------- 3 files changed, 36 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index a89ccf3d4c14..489605bab85a 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -282,7 +282,7 @@ static unsigned long calculate_numa_remap_pages(void) node_end_pfn[nid] -= size; node_remap_start_pfn[nid] = node_end_pfn[nid]; - shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]); + shrink_active_range(nid, node_end_pfn[nid]); } printk("Reserving total of %ld pages for numa KVA remap\n", reserve_pages); diff --git a/include/linux/mm.h b/include/linux/mm.h index c31a9cd2a30e..7cbd949f2516 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -997,8 +997,7 @@ extern void free_area_init_node(int nid, pg_data_t *pgdat, extern void free_area_init_nodes(unsigned long *max_zone_pfn); extern void add_active_range(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); -extern void shrink_active_range(unsigned int nid, unsigned long old_end_pfn, - unsigned long new_end_pfn); +extern void shrink_active_range(unsigned int nid, unsigned long new_end_pfn); extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); extern void remove_all_active_ranges(void); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 502223c3c2c6..215408684076 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3579,25 +3579,49 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, /** * shrink_active_range - Shrink an existing registered range of PFNs * @nid: The node id the range is on that should be shrunk - * @old_end_pfn: The old end PFN of the range * @new_end_pfn: The new PFN of the range * * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node. - * The map is kept at the end physical page range that has already been - * registered with add_active_range(). This function allows an arch to shrink - * an existing registered range. + * The map is kept near the end physical page range that has already been + * registered. This function allows an arch to shrink an existing registered + * range. */ -void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn, - unsigned long new_end_pfn) +void __init shrink_active_range(unsigned int nid, unsigned long new_end_pfn) { - int i; + int i, j; + int removed = 0; /* Find the old active region end and shrink */ - for_each_active_range_index_in_nid(i, nid) - if (early_node_map[i].end_pfn == old_end_pfn) { + for_each_active_range_index_in_nid(i, nid) { + if (early_node_map[i].start_pfn >= new_end_pfn) { + /* clear it */ + early_node_map[i].end_pfn = 0; + removed = 1; + continue; + } + if (early_node_map[i].end_pfn > new_end_pfn) { early_node_map[i].end_pfn = new_end_pfn; - break; + continue; } + } + + if (!removed) + return; + + /* remove the blank ones */ + for (i = nr_nodemap_entries - 1; i > 0; i--) { + if (early_node_map[i].nid != nid) + continue; + if (early_node_map[i].end_pfn) + continue; + /* we found it, get rid of it */ + for (j = i; j < nr_nodemap_entries - 1; j++) + memcpy(&early_node_map[j], &early_node_map[j+1], + sizeof(early_node_map[j])); + j = nr_nodemap_entries - 1; + memset(&early_node_map[j], 0, sizeof(early_node_map[j])); + nr_nodemap_entries--; + } } /** -- cgit v1.2.3 From 1ea0704e0da65b2b46f9142ff1391163aac24060 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 16 Jun 2008 04:30:00 -0700 Subject: mm: add a ptep_modify_prot transaction abstraction This patch adds an API for doing read-modify-write updates to a pte's protection bits which may race against hardware updates to the pte. After reading the pte, the hardware may asynchonously set the accessed or dirty bits on a pte, which would be lost when writing back the modified pte value. The existing technique to handle this race is to use ptep_get_and_clear() atomically fetch the old pte value and clear it in memory. This has the effect of marking the pte as non-present, which will prevent the hardware from updating its state. When the new value is written back, the pte will be present again, and the hardware can resume updating the access/dirty flags. When running in a virtualized environment, pagetable updates are relatively expensive, since they generally involve some trap into the hypervisor. To mitigate the cost of these updates, we tend to batch them. However, because of the atomic nature of ptep_get_and_clear(), it is inherently non-batchable. This new interface allows batching by giving the underlying implementation enough information to open a transaction between the read and write phases: ptep_modify_prot_start() returns the current pte value, and puts the pte entry into a state where either the hardware will not update the pte, or if it does, the updates will be preserved on commit. ptep_modify_prot_commit() writes back the updated pte, makes sure that any hardware updates made since ptep_modify_prot_start() are preserved. ptep_modify_prot_start() and _commit() must be exactly paired, and used while holding the appropriate pte lock. They do not protect against other software updates of the pte in any way. The current implementations of ptep_modify_prot_start and _commit are functionally unchanged from before: _start() uses ptep_get_and_clear() fetch the pte and zero the entry, preventing any hardware updates. _commit() simply writes the new pte value back knowing that the hardware has not updated the pte in the meantime. The only current user of this interface is mprotect Signed-off-by: Jeremy Fitzhardinge Acked-by: Linus Torvalds Acked-by: Hugh Dickins Signed-off-by: Ingo Molnar --- include/asm-generic/pgtable.h | 57 +++++++++++++++++++++++++++++++++++++++++++ mm/mprotect.c | 10 +++----- 2 files changed, 61 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 44ef329531c3..4fce3db2cecc 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -197,6 +197,63 @@ static inline int pmd_none_or_clear_bad(pmd_t *pmd) } #endif /* CONFIG_MMU */ +static inline pte_t __ptep_modify_prot_start(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep) +{ + /* + * Get the current pte state, but zero it out to make it + * non-present, preventing the hardware from asynchronously + * updating it. + */ + return ptep_get_and_clear(mm, addr, ptep); +} + +static inline void __ptep_modify_prot_commit(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, pte_t pte) +{ + /* + * The pte is non-present, so there's no hardware state to + * preserve. + */ + set_pte_at(mm, addr, ptep, pte); +} + +#ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION +/* + * Start a pte protection read-modify-write transaction, which + * protects against asynchronous hardware modifications to the pte. + * The intention is not to prevent the hardware from making pte + * updates, but to prevent any updates it may make from being lost. + * + * This does not protect against other software modifications of the + * pte; the appropriate pte lock must be held over the transation. + * + * Note that this interface is intended to be batchable, meaning that + * ptep_modify_prot_commit may not actually update the pte, but merely + * queue the update to be done at some later time. The update must be + * actually committed before the pte lock is released, however. + */ +static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep) +{ + return __ptep_modify_prot_start(mm, addr, ptep); +} + +/* + * Commit an update to a pte, leaving any hardware-controlled bits in + * the PTE unmodified. + */ +static inline void ptep_modify_prot_commit(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, pte_t pte) +{ + __ptep_modify_prot_commit(mm, addr, ptep, pte); +} +#endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */ + /* * A facility to provide lazy MMU batching. This allows PTE updates and * page invalidations to be delayed until a call to leave lazy MMU mode diff --git a/mm/mprotect.c b/mm/mprotect.c index a5bf31c27375..acfe7c8d72fc 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -47,19 +47,17 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, if (pte_present(oldpte)) { pte_t ptent; - /* Avoid an SMP race with hardware updated dirty/clean - * bits by wiping the pte and then setting the new pte - * into place. - */ - ptent = ptep_get_and_clear(mm, addr, pte); + ptent = ptep_modify_prot_start(mm, addr, pte); ptent = pte_modify(ptent, newprot); + /* * Avoid taking write faults for pages we know to be * dirty. */ if (dirty_accountable && pte_dirty(ptent)) ptent = pte_mkwrite(ptent); - set_pte_at(mm, addr, pte, ptent); + + ptep_modify_prot_commit(mm, addr, pte, ptent); #ifdef CONFIG_MIGRATION } else if (!pte_file(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); -- cgit v1.2.3 From cc1050bafebfb1d7935331282e948b5016318192 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 13 Jun 2008 19:08:52 -0700 Subject: x86: replace shrink_active_range() with remove_active_range() in case we have kva before ramdisk on a node, we still need to use those ranges. v2: reserve_early kva ram area, in case there are holes in highmem, to avoid those area could be treat as free high pages. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 45 ++++++++++++++++++++++++--------------------- include/linux/mm.h | 3 ++- mm/page_alloc.c | 29 +++++++++++++++++++++++------ 3 files changed, 49 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index accc7c6c57fc..c3f119e99e0d 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -230,8 +230,8 @@ static unsigned long calculate_numa_remap_pages(void) unsigned long size, reserve_pages = 0; for_each_online_node(nid) { - u64 node_end_target; - u64 node_end_final; + u64 node_kva_target; + u64 node_kva_final; /* * The acpi/srat node info can show hot-add memroy zones @@ -254,42 +254,45 @@ static unsigned long calculate_numa_remap_pages(void) /* now the roundup is correct, convert to PAGE_SIZE pages */ size = size * PTRS_PER_PTE; - node_end_target = round_down(node_end_pfn[nid] - size, + node_kva_target = round_down(node_end_pfn[nid] - size, PTRS_PER_PTE); - node_end_target <<= PAGE_SHIFT; + node_kva_target <<= PAGE_SHIFT; do { - node_end_final = find_e820_area(node_end_target, + node_kva_final = find_e820_area(node_kva_target, ((u64)node_end_pfn[nid])<>PAGE_SHIFT) > (node_start_pfn[nid])); + node_kva_target -= LARGE_PAGE_BYTES; + } while (node_kva_final == -1ULL && + (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid])); - if (node_end_final == -1ULL) + if (node_kva_final == -1ULL) panic("Can not get kva ram\n"); - printk("Reserving %ld pages of KVA for lmem_map of node %d\n", - size, nid); node_remap_size[nid] = size; node_remap_offset[nid] = reserve_pages; reserve_pages += size; - printk("Shrinking node %d from %ld pages to %lld pages\n", - nid, node_end_pfn[nid], node_end_final>>PAGE_SHIFT); + printk("Reserving %ld pages of KVA for lmem_map of node %d at %llx\n", + size, nid, node_kva_final>>PAGE_SHIFT); /* * prevent kva address below max_low_pfn want it on system * with less memory later. * layout will be: KVA address , KVA RAM + * + * we are supposed to only record the one less then max_low_pfn + * but we could have some hole in high memory, and it will only + * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide + * to use it as free. + * So reserve_early here, hope we don't run out of that array */ - if ((node_end_final>>PAGE_SHIFT) < max_low_pfn) - reserve_early(node_end_final, - node_end_final+(((u64)size)<>PAGE_SHIFT; - node_remap_start_pfn[nid] = node_end_pfn[nid]; - shrink_active_range(nid, node_end_pfn[nid]); + reserve_early(node_kva_final, + node_kva_final+(((u64)size)<>PAGE_SHIFT; + remove_active_range(nid, node_remap_start_pfn[nid], + node_remap_start_pfn[nid] + size); } printk("Reserving total of %ld pages for numa KVA remap\n", reserve_pages); diff --git a/include/linux/mm.h b/include/linux/mm.h index ce8e397a61f6..034a3156d2f0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -998,7 +998,8 @@ extern void free_area_init_node(int nid, pg_data_t *pgdat, extern void free_area_init_nodes(unsigned long *max_zone_pfn); extern void add_active_range(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); -extern void shrink_active_range(unsigned int nid, unsigned long new_end_pfn); +extern void remove_active_range(unsigned int nid, unsigned long start_pfn, + unsigned long end_pfn); extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); extern void remove_all_active_ranges(void); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eee5ba7509c1..d80e1868e570 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3552,30 +3552,47 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, } /** - * shrink_active_range - Shrink an existing registered range of PFNs + * remove_active_range - Shrink an existing registered range of PFNs * @nid: The node id the range is on that should be shrunk - * @new_end_pfn: The new PFN of the range + * @start_pfn: The new PFN of the range + * @end_pfn: The new PFN of the range * * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node. * The map is kept near the end physical page range that has already been * registered. This function allows an arch to shrink an existing registered * range. */ -void __init shrink_active_range(unsigned int nid, unsigned long new_end_pfn) +void __init remove_active_range(unsigned int nid, unsigned long start_pfn, + unsigned long end_pfn) { int i, j; int removed = 0; + printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n", + nid, start_pfn, end_pfn); + /* Find the old active region end and shrink */ for_each_active_range_index_in_nid(i, nid) { - if (early_node_map[i].start_pfn >= new_end_pfn) { + if (early_node_map[i].start_pfn >= start_pfn && + early_node_map[i].end_pfn <= end_pfn) { /* clear it */ + early_node_map[i].start_pfn = 0; early_node_map[i].end_pfn = 0; removed = 1; continue; } - if (early_node_map[i].end_pfn > new_end_pfn) { - early_node_map[i].end_pfn = new_end_pfn; + if (early_node_map[i].start_pfn < start_pfn && + early_node_map[i].end_pfn > start_pfn) { + unsigned long temp_end_pfn = early_node_map[i].end_pfn; + early_node_map[i].end_pfn = start_pfn; + if (temp_end_pfn > end_pfn) + add_active_range(nid, end_pfn, temp_end_pfn); + continue; + } + if (early_node_map[i].start_pfn >= start_pfn && + early_node_map[i].end_pfn > end_pfn && + early_node_map[i].start_pfn < end_pfn) { + early_node_map[i].start_pfn = end_pfn; continue; } } -- cgit v1.2.3 From b5bc6c0e55000dab86b73f838f5ad02908b23755 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 14 Jun 2008 18:32:52 -0700 Subject: x86, mm: use add_highpages_with_active_regions() for high pages init v2 use early_node_map to init high pages, so we can remove page_is_ram() and page_is_reserved_early() in the big loop with add_one_highpage also remove page_is_reserved_early(), it is not needed anymore. v2: fix the build of other platforms Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 11 -------- arch/x86/mm/discontig_32.c | 19 ++++++-------- arch/x86/mm/init_32.c | 62 ++++++++++++++++++++++++++++++++++++++-------- include/asm-x86/e820.h | 1 - include/asm-x86/highmem.h | 3 +++ include/linux/mm.h | 2 ++ mm/page_alloc.c | 8 ++++++ 7 files changed, 71 insertions(+), 35 deletions(-) (limited to 'mm') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 5051ce744b4e..ed46b7a6bc13 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -612,17 +612,6 @@ void __init free_early(u64 start, u64 end) early_res[j - 1].end = 0; } -int __init page_is_reserved_early(unsigned long pagenr) -{ - u64 start = (u64)pagenr << PAGE_SHIFT; - int i; - struct early_res *r; - - i = find_overlapped_early(start, start + PAGE_SIZE); - r = &early_res[i]; - return (i < MAX_EARLY_RES && r->end); -} - void __init early_res_to_bootmem(u64 start, u64 end) { int i; diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index c3f119e99e0d..7c4d0255f8d8 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -100,7 +100,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, #endif extern unsigned long find_max_low_pfn(void); -extern void add_one_highpage_init(struct page *, int, int); extern unsigned long highend_pfn, highstart_pfn; #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) @@ -432,10 +431,10 @@ void __init set_highmem_pages_init(int bad_ppro) { #ifdef CONFIG_HIGHMEM struct zone *zone; - struct page *page; + int nid; for_each_zone(zone) { - unsigned long node_pfn, zone_start_pfn, zone_end_pfn; + unsigned long zone_start_pfn, zone_end_pfn; if (!is_highmem(zone)) continue; @@ -443,16 +442,12 @@ void __init set_highmem_pages_init(int bad_ppro) zone_start_pfn = zone->zone_start_pfn; zone_end_pfn = zone_start_pfn + zone->spanned_pages; + nid = zone_to_nid(zone); printk("Initializing %s for node %d (%08lx:%08lx)\n", - zone->name, zone_to_nid(zone), - zone_start_pfn, zone_end_pfn); - - for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) { - if (!pfn_valid(node_pfn)) - continue; - page = pfn_to_page(node_pfn); - add_one_highpage_init(page, node_pfn, bad_ppro); - } + zone->name, nid, zone_start_pfn, zone_end_pfn); + + add_highpages_with_active_regions(nid, zone_start_pfn, + zone_end_pfn, bad_ppro); } totalram_pages += totalhigh_pages; #endif diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index abadb1da70df..ba07a489230e 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -287,10 +287,10 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) pkmap_page_table = pte; } -void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) +static void __init +add_one_highpage_init(struct page *page, int pfn, int bad_ppro) { - if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn)) && - !page_is_reserved_early(pfn)) { + if (!(bad_ppro && page_kills_ppro(pfn))) { ClearPageReserved(page); init_page_count(page); __free_page(page); @@ -299,18 +299,58 @@ void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) SetPageReserved(page); } +struct add_highpages_data { + unsigned long start_pfn; + unsigned long end_pfn; + int bad_ppro; +}; + +static void __init add_highpages_work_fn(unsigned long start_pfn, + unsigned long end_pfn, void *datax) +{ + int node_pfn; + struct page *page; + unsigned long final_start_pfn, final_end_pfn; + struct add_highpages_data *data; + int bad_ppro; + + data = (struct add_highpages_data *)datax; + bad_ppro = data->bad_ppro; + + final_start_pfn = max(start_pfn, data->start_pfn); + final_end_pfn = min(end_pfn, data->end_pfn); + if (final_start_pfn >= final_end_pfn) + return; + + for (node_pfn = final_start_pfn; node_pfn < final_end_pfn; + node_pfn++) { + if (!pfn_valid(node_pfn)) + continue; + page = pfn_to_page(node_pfn); + add_one_highpage_init(page, node_pfn, bad_ppro); + } + +} + +void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn, + int bad_ppro) +{ + struct add_highpages_data data; + + data.start_pfn = start_pfn; + data.end_pfn = end_pfn; + data.bad_ppro = bad_ppro; + + work_with_active_regions(nid, add_highpages_work_fn, &data); +} + #ifndef CONFIG_NUMA static void __init set_highmem_pages_init(int bad_ppro) { - int pfn; + add_highpages_with_active_regions(0, highstart_pfn, highend_pfn, + bad_ppro); - for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) { - /* - * Holes under sparsemem might not have no mem_map[]: - */ - if (pfn_valid(pfn)) - add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); - } totalram_pages += totalhigh_pages; } #endif /* !CONFIG_NUMA */ diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 6b0ce745a60c..55d310596907 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -86,7 +86,6 @@ extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); extern void reserve_early(u64 start, u64 end, char *name); extern void free_early(u64 start, u64 end); extern void early_res_to_bootmem(u64 start, u64 end); -extern int page_is_reserved_early(unsigned long pagenr); extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); extern unsigned long e820_end_of_ram(void); diff --git a/include/asm-x86/highmem.h b/include/asm-x86/highmem.h index e153f3b44774..85c4fea41ff6 100644 --- a/include/asm-x86/highmem.h +++ b/include/asm-x86/highmem.h @@ -74,6 +74,9 @@ struct page *kmap_atomic_to_page(void *ptr); #define flush_cache_kmaps() do { } while (0) +extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn, int bad_ppro); + #endif /* __KERNEL__ */ #endif /* _ASM_HIGHMEM_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 034a3156d2f0..e4de460907c1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1011,6 +1011,8 @@ extern unsigned long find_min_pfn_with_active_regions(void); extern unsigned long find_max_pfn_with_active_regions(void); extern void free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn); +typedef void (*work_fn_t)(unsigned long, unsigned long, void *); +extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data); extern void sparse_memory_present_with_active_regions(int nid); #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID extern int early_pfn_to_nid(unsigned long pfn); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d80e1868e570..41c6e3aa059f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2929,6 +2929,14 @@ void __init free_bootmem_with_active_regions(int nid, } } +void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) +{ + int i; + + for_each_active_range_index_in_nid(i, nid) + work_fn(early_node_map[i].start_pfn, early_node_map[i].end_pfn, + data); +} /** * sparse_memory_present_with_active_regions - Call memory_present for each active range * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. -- cgit v1.2.3 From d52d53b8a5b258bfaab9223a5e7284fcfdd48577 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 16 Jun 2008 20:10:55 -0700 Subject: RFC x86: try to remove arch_get_ram_range want to remove arch_get_ram_range, and use early_node_map instead. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 6 ++++-- drivers/pci/intel-iommu.c | 51 ++++++++++++++++++++++++++++++++++------------- include/linux/mm.h | 2 +- mm/page_alloc.c | 10 +++++++--- 4 files changed, 49 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 65d55056b6e7..a0484adbf59d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -298,7 +298,7 @@ struct add_highpages_data { unsigned long end_pfn; }; -static void __init add_highpages_work_fn(unsigned long start_pfn, +static int __init add_highpages_work_fn(unsigned long start_pfn, unsigned long end_pfn, void *datax) { int node_pfn; @@ -311,7 +311,7 @@ static void __init add_highpages_work_fn(unsigned long start_pfn, final_start_pfn = max(start_pfn, data->start_pfn); final_end_pfn = min(end_pfn, data->end_pfn); if (final_start_pfn >= final_end_pfn) - return; + return 0; for (node_pfn = final_start_pfn; node_pfn < final_end_pfn; node_pfn++) { @@ -321,6 +321,8 @@ static void __init add_highpages_work_fn(unsigned long start_pfn, add_one_highpage_init(page, node_pfn); } + return 0; + } void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 66c0fd21894b..bb0642318a95 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -1637,12 +1637,43 @@ static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr, } #ifdef CONFIG_DMAR_GFX_WA -extern int arch_get_ram_range(int slot, u64 *addr, u64 *size); +struct iommu_prepare_data { + struct pci_dev *pdev; + int ret; +}; + +static int __init iommu_prepare_work_fn(unsigned long start_pfn, + unsigned long end_pfn, void *datax) +{ + struct iommu_prepare_data *data; + + data = (struct iommu_prepare_data *)datax; + + data->ret = iommu_prepare_identity_map(data->pdev, + start_pfn<ret; + +} + +static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev) +{ + int nid; + struct iommu_prepare_data data; + + data.pdev = pdev; + data.ret = 0; + + for_each_online_node(nid) { + work_with_active_regions(nid, iommu_prepare_work_fn, &data); + if (data.ret) + return data.ret; + } + return data.ret; +} + static void __init iommu_prepare_gfx_mapping(void) { struct pci_dev *pdev = NULL; - u64 base, size; - int slot; int ret; for_each_pci_dev(pdev) { @@ -1651,17 +1682,9 @@ static void __init iommu_prepare_gfx_mapping(void) continue; printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n", pci_name(pdev)); - slot = arch_get_ram_range(0, &base, &size); - while (slot >= 0) { - ret = iommu_prepare_identity_map(pdev, - base, base + size); - if (ret) - goto error; - slot = arch_get_ram_range(slot, &base, &size); - } - continue; -error: - printk(KERN_ERR "IOMMU: mapping reserved region failed\n"); + ret = iommu_prepare_with_active_regions(pdev); + if (ret) + printk(KERN_ERR "IOMMU: mapping reserved region failed\n"); } } #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 3d647b24041f..cf1cd3a2ed78 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1011,7 +1011,7 @@ extern unsigned long find_min_pfn_with_active_regions(void); extern unsigned long find_max_pfn_with_active_regions(void); extern void free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn); -typedef void (*work_fn_t)(unsigned long, unsigned long, void *); +typedef int (*work_fn_t)(unsigned long, unsigned long, void *); extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data); extern void sparse_memory_present_with_active_regions(int nid); #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 41c6e3aa059f..e25b6b24f844 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2932,10 +2932,14 @@ void __init free_bootmem_with_active_regions(int nid, void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) { int i; + int ret; - for_each_active_range_index_in_nid(i, nid) - work_fn(early_node_map[i].start_pfn, early_node_map[i].end_pfn, - data); + for_each_active_range_index_in_nid(i, nid) { + ret = work_fn(early_node_map[i].start_pfn, + early_node_map[i].end_pfn, data); + if (ret) + break; + } } /** * sparse_memory_present_with_active_regions - Call memory_present for each active range -- cgit v1.2.3 From e2fc252e0ce695b4c4abe27bb073c35bd0d73252 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 22 Jun 2008 07:22:12 -0700 Subject: x86 boot: show pfn addresses in hex not decimal in some kernel info printks Page frame numbers (the portion of physical addresses above the low order page offsets) are displayed in several kernel debug and info prints in decimal, not hex. Decimal addresse are unreadable. Use hex. Signed-off-by: Paul Jackson Cc: "Yinghai Lu" Cc: "Jack Steiner" Cc: "Mike Travis" Cc: "Huang Cc: Ying" Cc: "Andi Kleen" Cc: "Andrew Morton" Cc: Paul Jackson Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 2 +- mm/page_alloc.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 600c9de237a0..512f779fc6af 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -990,7 +990,7 @@ unsigned long __init e820_end_of_ram(void) if (last_pfn > end_user_pfn) last_pfn = end_user_pfn; - printk(KERN_INFO "last_pfn = %lu max_arch_pfn = %lu\n", + printk(KERN_INFO "last_pfn = 0x%lx max_arch_pfn = 0x%lx\n", last_pfn, max_arch_pfn); return last_pfn; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e25b6b24f844..c09c2c8d2c6a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3520,7 +3520,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, { int i; - printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) " + printk(KERN_DEBUG "Entering add_active_range(%d, 0x%lx, 0x%lx) " "%d entries of %d used\n", nid, start_pfn, end_pfn, nr_nodemap_entries, MAX_ACTIVE_REGIONS); @@ -3936,7 +3936,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) for (i = 0; i < MAX_NR_ZONES; i++) { if (i == ZONE_MOVABLE) continue; - printk(" %-8s %8lu -> %8lu\n", + printk(" %-8s 0x%8lx -> 0x%8lx\n", zone_names[i], arch_zone_lowest_possible_pfn[i], arch_zone_highest_possible_pfn[i]); @@ -3952,7 +3952,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) /* Print out the early_node_map[] */ printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); for (i = 0; i < nr_nodemap_entries; i++) - printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid, + printk(" %3d: 0x%8lx -> 0x%8lx\n", early_node_map[i].nid, early_node_map[i].start_pfn, early_node_map[i].end_pfn); -- cgit v1.2.3 From 2bc0d2615a15a93d344abbe8cb1b9056122bce9d Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 22 Jun 2008 07:22:17 -0700 Subject: x86 boot: more consistently use type int for node ids Everywhere I look, node id's are of type 'int', except in this one case, which has 'unsigned long'. Change this one to 'int' as well. There is nothing special about the way this variable 'nid' is used in this routine to justify using an unusual type here. Signed-off-by: Paul Jackson Cc: "Yinghai Lu" Cc: "Jack Steiner" Cc: "Mike Travis" Cc: "Huang Cc: Ying" Cc: "Andi Kleen" Cc: "Andrew Morton" Cc: Paul Jackson Signed-off-by: Ingo Molnar --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c09c2c8d2c6a..b604c64a0337 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3669,7 +3669,7 @@ static void __init sort_node_map(void) } /* Find the lowest pfn for a node */ -unsigned long __init find_min_pfn_for_node(unsigned long nid) +unsigned long __init find_min_pfn_for_node(int nid) { int i; unsigned long min_pfn = ULONG_MAX; @@ -3680,7 +3680,7 @@ unsigned long __init find_min_pfn_for_node(unsigned long nid) if (min_pfn == ULONG_MAX) { printk(KERN_WARNING - "Could not find start_pfn for node %lu\n", nid); + "Could not find start_pfn for node %d\n", nid); return 0; } -- cgit v1.2.3 From 5dab8ec139be215fbaba216fb4aea914d0f4dac5 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Wed, 25 Jun 2008 05:44:40 -0700 Subject: mm, generic, x86 boot: more tweaks to hex prints of some pfn addresses Fix some problems with (and applies on top of) a previous patch: x86 boot: show pfn addresses in hex not decimal in some kernel info printks Primarily change "0x%8lx" format, which displays with a right aligned space filled hex number (spaces between the "0x" prefix and the number), into "%0#10lx" format, which zero fills instead of space fills, and which uses the printf flag '#' to request the "0x" prefix instead of hard coding it. Also replace some other "0x%lx" formats with "%#lx", making use of the '#' printf flag again. Signed-off-by: Paul Jackson Cc: "Yinghai Lu" Cc: "Jack Steiner" Cc: "Mike Travis" Cc: "Huang Cc: Ying" Cc: "Andi Kleen" Cc: "Andrew Morton" Cc: Paul Jackson Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 2 +- mm/page_alloc.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 22cfd665224c..1dcb66533dfc 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1014,7 +1014,7 @@ unsigned long __init e820_end_of_ram(void) if (last_pfn > end_user_pfn) last_pfn = end_user_pfn; - printk(KERN_INFO "last_pfn = 0x%lx max_arch_pfn = 0x%lx\n", + printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n", last_pfn, max_arch_pfn); return last_pfn; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b604c64a0337..f024b9b3a2a6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3520,7 +3520,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, { int i; - printk(KERN_DEBUG "Entering add_active_range(%d, 0x%lx, 0x%lx) " + printk(KERN_DEBUG "Entering add_active_range(%d, %#lx, %#lx) " "%d entries of %d used\n", nid, start_pfn, end_pfn, nr_nodemap_entries, MAX_ACTIVE_REGIONS); @@ -3936,7 +3936,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) for (i = 0; i < MAX_NR_ZONES; i++) { if (i == ZONE_MOVABLE) continue; - printk(" %-8s 0x%8lx -> 0x%8lx\n", + printk(" %-8s %0#10lx -> %0#10lx\n", zone_names[i], arch_zone_lowest_possible_pfn[i], arch_zone_highest_possible_pfn[i]); @@ -3952,7 +3952,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) /* Print out the early_node_map[] */ printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); for (i = 0; i < nr_nodemap_entries; i++) - printk(" %3d: 0x%8lx -> 0x%8lx\n", early_node_map[i].nid, + printk(" %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid, early_node_map[i].start_pfn, early_node_map[i].end_pfn); -- cgit v1.2.3