From 479e2802d09f1e18a97262c4c6f8f17ae5884bd8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Oct 2012 14:16:28 +0200 Subject: mm: mempolicy: Make MPOL_LOCAL a real policy Make MPOL_LOCAL a real and exposed policy such that applications that relied on the previous default behaviour can explicitly request it. Requested-by: Christoph Lameter Reviewed-by: Rik van Riel Cc: Lee Schermerhorn Cc: Andrew Morton Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mel Gorman --- include/uapi/linux/mempolicy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 23e62e0537e2..3e835c9d847b 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -20,6 +20,7 @@ enum { MPOL_PREFERRED, MPOL_BIND, MPOL_INTERLEAVE, + MPOL_LOCAL, MPOL_MAX, /* always last member of enum */ }; -- cgit v1.2.3 From d3a710337b0590f43fd236d5e6518439afc7410a Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Thu, 25 Oct 2012 14:16:29 +0200 Subject: mm: mempolicy: Add MPOL_NOOP This patch augments the MPOL_MF_LAZY feature by adding a "NOOP" policy to mbind(). When the NOOP policy is used with the 'MOVE and 'LAZY flags, mbind() will map the pages PROT_NONE so that they will be migrated on the next touch. This allows an application to prepare for a new phase of operation where different regions of shared storage will be assigned to worker threads, w/o changing policy. Note that we could just use "default" policy in this case. However, this also allows an application to request that pages be migrated, only if necessary, to follow any arbitrary policy that might currently apply to a range of pages, without knowing the policy, or without specifying multiple mbind()s for ranges with different policies. [ Bug in early version of mpol_parse_str() reported by Fengguang Wu. ] Bug-Reported-by: Reported-by: Fengguang Wu Signed-off-by: Lee Schermerhorn Reviewed-by: Rik van Riel Cc: Andrew Morton Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mel Gorman --- include/uapi/linux/mempolicy.h | 1 + mm/mempolicy.c | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 3e835c9d847b..d23dca8367cc 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -21,6 +21,7 @@ enum { MPOL_BIND, MPOL_INTERLEAVE, MPOL_LOCAL, + MPOL_NOOP, /* retain existing policy for range */ MPOL_MAX, /* always last member of enum */ }; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 54bd3e5ed776..c21e91477c4f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -251,10 +251,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, pr_debug("setting mode %d flags %d nodes[0] %lx\n", mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); - if (mode == MPOL_DEFAULT) { + if (mode == MPOL_DEFAULT || mode == MPOL_NOOP) { if (nodes && !nodes_empty(*nodes)) return ERR_PTR(-EINVAL); - return NULL; /* simply delete any existing policy */ + return NULL; } VM_BUG_ON(!nodes); @@ -1147,7 +1147,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (start & ~PAGE_MASK) return -EINVAL; - if (mode == MPOL_DEFAULT) + if (mode == MPOL_DEFAULT || mode == MPOL_NOOP) flags &= ~MPOL_MF_STRICT; len = (len + PAGE_SIZE - 1) & PAGE_MASK; @@ -2409,7 +2409,8 @@ static const char * const policy_modes[] = [MPOL_PREFERRED] = "prefer", [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", - [MPOL_LOCAL] = "local" + [MPOL_LOCAL] = "local", + [MPOL_NOOP] = "noop", /* should not actually be used */ }; @@ -2460,7 +2461,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) break; } } - if (mode >= MPOL_MAX) + if (mode >= MPOL_MAX || mode == MPOL_NOOP) goto out; switch (mode) { -- cgit v1.2.3 From 771fb4d806a92bf6c988fcfbd286ae40a9374332 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Thu, 25 Oct 2012 14:16:30 +0200 Subject: mm: mempolicy: Check for misplaced page This patch provides a new function to test whether a page resides on a node that is appropriate for the mempolicy for the vma and address where the page is supposed to be mapped. This involves looking up the node where the page belongs. So, the function returns that node so that it may be used to allocated the page without consulting the policy again. A subsequent patch will call this function from the fault path. Because of this, I don't want to go ahead and allocate the page, e.g., via alloc_page_vma() only to have to free it if it has the correct policy. So, I just mimic the alloc_page_vma() node computation logic--sort of. Note: we could use this function to implement a MPOL_MF_STRICT behavior when migrating pages to match mbind() mempolicy--e.g., to ensure that pages in an interleaved range are reinterleaved rather than left where they are when they reside on any page in the interleave nodemask. Signed-off-by: Lee Schermerhorn Reviewed-by: Rik van Riel Cc: Andrew Morton Cc: Linus Torvalds [ Added MPOL_F_LAZY to trigger migrate-on-fault; simplified code now that we don't have to bother with special crap for interleaved ] Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mel Gorman --- include/linux/mempolicy.h | 8 +++++ include/uapi/linux/mempolicy.h | 1 + mm/mempolicy.c | 76 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+) (limited to 'include/uapi') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index e5ccb9ddd90e..c511e2523560 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -198,6 +198,8 @@ static inline int vma_migratable(struct vm_area_struct *vma) return 1; } +extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long); + #else struct mempolicy {}; @@ -323,5 +325,11 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, return 0; } +static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma, + unsigned long address) +{ + return -1; /* no node preference */ +} + #endif /* CONFIG_NUMA */ #endif diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index d23dca8367cc..472de8a5d37e 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -61,6 +61,7 @@ enum mpol_rebind_step { #define MPOL_F_SHARED (1 << 0) /* identify shared policies */ #define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */ #define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */ +#define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */ #endif /* _UAPI_LINUX_MEMPOLICY_H */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c21e91477c4f..df1466d3d2d8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2181,6 +2181,82 @@ static void sp_free(struct sp_node *n) kmem_cache_free(sn_cache, n); } +/** + * mpol_misplaced - check whether current page node is valid in policy + * + * @page - page to be checked + * @vma - vm area where page mapped + * @addr - virtual address where page mapped + * + * Lookup current policy node id for vma,addr and "compare to" page's + * node id. + * + * Returns: + * -1 - not misplaced, page is in the right node + * node - node id where the page should be + * + * Policy determination "mimics" alloc_page_vma(). + * Called from fault path where we know the vma and faulting address. + */ +int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) +{ + struct mempolicy *pol; + struct zone *zone; + int curnid = page_to_nid(page); + unsigned long pgoff; + int polnid = -1; + int ret = -1; + + BUG_ON(!vma); + + pol = get_vma_policy(current, vma, addr); + if (!(pol->flags & MPOL_F_MOF)) + goto out; + + switch (pol->mode) { + case MPOL_INTERLEAVE: + BUG_ON(addr >= vma->vm_end); + BUG_ON(addr < vma->vm_start); + + pgoff = vma->vm_pgoff; + pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; + polnid = offset_il_node(pol, vma, pgoff); + break; + + case MPOL_PREFERRED: + if (pol->flags & MPOL_F_LOCAL) + polnid = numa_node_id(); + else + polnid = pol->v.preferred_node; + break; + + case MPOL_BIND: + /* + * allows binding to multiple nodes. + * use current page if in policy nodemask, + * else select nearest allowed node, if any. + * If no allowed nodes, use current [!misplaced]. + */ + if (node_isset(curnid, pol->v.nodes)) + goto out; + (void)first_zones_zonelist( + node_zonelist(numa_node_id(), GFP_HIGHUSER), + gfp_zone(GFP_HIGHUSER), + &pol->v.nodes, &zone); + polnid = zone->node; + break; + + default: + BUG(); + } + if (curnid != polnid) + ret = polnid; +out: + mpol_cond_put(pol); + + return ret; +} + static void sp_delete(struct shared_policy *sp, struct sp_node *n) { pr_debug("deleting %lx-l%lx\n", n->start, n->end); -- cgit v1.2.3 From b24f53a0bea38b266d219ee651b22dba727c44ae Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Thu, 25 Oct 2012 14:16:32 +0200 Subject: mm: mempolicy: Add MPOL_MF_LAZY NOTE: Once again there is a lot of patch stealing and the end result is sufficiently different that I had to drop the signed-offs. Will re-add if the original authors are ok with that. This patch adds another mbind() flag to request "lazy migration". The flag, MPOL_MF_LAZY, modifies MPOL_MF_MOVE* such that the selected pages are marked PROT_NONE. The pages will be migrated in the fault path on "first touch", if the policy dictates at that time. "Lazy Migration" will allow testing of migrate-on-fault via mbind(). Also allows applications to specify that only subsequently touched pages be migrated to obey new policy, instead of all pages in range. This can be useful for multi-threaded applications working on a large shared data area that is initialized by an initial thread resulting in all pages on one [or a few, if overflowed] nodes. After PROT_NONE, the pages in regions assigned to the worker threads will be automatically migrated local to the threads on 1st touch. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel --- include/linux/mm.h | 5 ++ include/uapi/linux/mempolicy.h | 13 ++- mm/mempolicy.c | 185 +++++++++++++++++++++++++++++++++++++---- 3 files changed, 185 insertions(+), 18 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/mm.h b/include/linux/mm.h index fa1615211159..471185e29bab 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1551,6 +1551,11 @@ static inline pgprot_t vm_get_page_prot(unsigned long vm_flags) } #endif +#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE +void change_prot_numa(struct vm_area_struct *vma, + unsigned long start, unsigned long end); +#endif + struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 472de8a5d37e..6a1baae3775d 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -49,9 +49,16 @@ enum mpol_rebind_step { /* Flags for mbind */ #define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */ -#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to mapping */ -#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to mapping */ -#define MPOL_MF_INTERNAL (1<<3) /* Internal flags start here */ +#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform + to policy */ +#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to policy */ +#define MPOL_MF_LAZY (1<<3) /* Modifies '_MOVE: lazy migrate on fault */ +#define MPOL_MF_INTERNAL (1<<4) /* Internal flags start here */ + +#define MPOL_MF_VALID (MPOL_MF_STRICT | \ + MPOL_MF_MOVE | \ + MPOL_MF_MOVE_ALL | \ + MPOL_MF_LAZY) /* * Internal flags that share the struct mempolicy flags word with diff --git a/mm/mempolicy.c b/mm/mempolicy.c index df1466d3d2d8..51d3ebd8561e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -90,6 +90,7 @@ #include #include #include +#include #include #include @@ -565,6 +566,145 @@ static inline int check_pgd_range(struct vm_area_struct *vma, return 0; } +#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE +/* + * Here we search for not shared page mappings (mapcount == 1) and we + * set up the pmd/pte_numa on those mappings so the very next access + * will fire a NUMA hinting page fault. + */ +static int +change_prot_numa_range(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte, *_pte; + struct page *page; + unsigned long _address, end; + spinlock_t *ptl; + int ret = 0; + + VM_BUG_ON(address & ~PAGE_MASK); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + goto out; + + if (pmd_trans_huge_lock(pmd, vma) == 1) { + int page_nid; + ret = HPAGE_PMD_NR; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + if (pmd_numa(*pmd)) { + spin_unlock(&mm->page_table_lock); + goto out; + } + + page = pmd_page(*pmd); + + /* only check non-shared pages */ + if (page_mapcount(page) != 1) { + spin_unlock(&mm->page_table_lock); + goto out; + } + + page_nid = page_to_nid(page); + + if (pmd_numa(*pmd)) { + spin_unlock(&mm->page_table_lock); + goto out; + } + + set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd)); + ret += HPAGE_PMD_NR; + /* defer TLB flush to lower the overhead */ + spin_unlock(&mm->page_table_lock); + goto out; + } + + if (pmd_trans_unstable(pmd)) + goto out; + VM_BUG_ON(!pmd_present(*pmd)); + + end = min(vma->vm_end, (address + PMD_SIZE) & PMD_MASK); + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + for (_address = address, _pte = pte; _address < end; + _pte++, _address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (!pte_present(pteval)) + continue; + if (pte_numa(pteval)) + continue; + page = vm_normal_page(vma, _address, pteval); + if (unlikely(!page)) + continue; + /* only check non-shared pages */ + if (page_mapcount(page) != 1) + continue; + + set_pte_at(mm, _address, _pte, pte_mknuma(pteval)); + + /* defer TLB flush to lower the overhead */ + ret++; + } + pte_unmap_unlock(pte, ptl); + + if (ret && !pmd_numa(*pmd)) { + spin_lock(&mm->page_table_lock); + set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd)); + spin_unlock(&mm->page_table_lock); + /* defer TLB flush to lower the overhead */ + } + +out: + return ret; +} + +/* Assumes mmap_sem is held */ +void +change_prot_numa(struct vm_area_struct *vma, + unsigned long address, unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + int progress = 0; + + while (address < end) { + VM_BUG_ON(address < vma->vm_start || + address + PAGE_SIZE > vma->vm_end); + + progress += change_prot_numa_range(mm, vma, address); + address = (address + PMD_SIZE) & PMD_MASK; + } + + /* + * Flush the TLB for the mm to start the NUMA hinting + * page faults after we finish scanning this vma part + * if there were any PTE updates + */ + if (progress) { + mmu_notifier_invalidate_range_start(vma->vm_mm, address, end); + flush_tlb_range(vma, address, end); + mmu_notifier_invalidate_range_end(vma->vm_mm, address, end); + } +} +#else +static unsigned long change_prot_numa(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + return 0; +} +#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ + /* * Check if all pages in a range are on a set of nodes. * If pagelist != NULL then isolate pages from the LRU and @@ -583,22 +723,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, return ERR_PTR(-EFAULT); prev = NULL; for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { + unsigned long endvma = vma->vm_end; + + if (endvma > end) + endvma = end; + if (vma->vm_start > start) + start = vma->vm_start; + if (!(flags & MPOL_MF_DISCONTIG_OK)) { if (!vma->vm_next && vma->vm_end < end) return ERR_PTR(-EFAULT); if (prev && prev->vm_end < vma->vm_start) return ERR_PTR(-EFAULT); } - if (!is_vm_hugetlb_page(vma) && - ((flags & MPOL_MF_STRICT) || + + if (is_vm_hugetlb_page(vma)) + goto next; + + if (flags & MPOL_MF_LAZY) { + change_prot_numa(vma, start, endvma); + goto next; + } + + if ((flags & MPOL_MF_STRICT) || ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && - vma_migratable(vma)))) { - unsigned long endvma = vma->vm_end; + vma_migratable(vma))) { - if (endvma > end) - endvma = end; - if (vma->vm_start > start) - start = vma->vm_start; err = check_pgd_range(vma, start, endvma, nodes, flags, private); if (err) { @@ -606,6 +756,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, break; } } +next: prev = vma; } return first; @@ -1138,8 +1289,7 @@ static long do_mbind(unsigned long start, unsigned long len, int err; LIST_HEAD(pagelist); - if (flags & ~(unsigned long)(MPOL_MF_STRICT | - MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + if (flags & ~(unsigned long)MPOL_MF_VALID) return -EINVAL; if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) return -EPERM; @@ -1162,6 +1312,9 @@ static long do_mbind(unsigned long start, unsigned long len, if (IS_ERR(new)) return PTR_ERR(new); + if (flags & MPOL_MF_LAZY) + new->flags |= MPOL_F_MOF; + /* * If we are using the default policy then operation * on discontinuous address spaces is okay after all @@ -1198,13 +1351,15 @@ static long do_mbind(unsigned long start, unsigned long len, vma = check_range(mm, start, end, nmask, flags | MPOL_MF_INVERT, &pagelist); - err = PTR_ERR(vma); - if (!IS_ERR(vma)) { - int nr_failed = 0; - + err = PTR_ERR(vma); /* maybe ... */ + if (!IS_ERR(vma) && mode != MPOL_NOOP) err = mbind_range(mm, start, end, new); + if (!err) { + int nr_failed = 0; + if (!list_empty(&pagelist)) { + WARN_ON_ONCE(flags & MPOL_MF_LAZY); nr_failed = migrate_pages(&pagelist, new_vma_page, (unsigned long)vma, false, MIGRATE_SYNC, @@ -1213,7 +1368,7 @@ static long do_mbind(unsigned long start, unsigned long len, putback_lru_pages(&pagelist); } - if (!err && nr_failed && (flags & MPOL_MF_STRICT)) + if (nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; } else putback_lru_pages(&pagelist); -- cgit v1.2.3 From a720094ded8cbb303111035be91858011d2eac71 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 16 Nov 2012 09:37:58 +0000 Subject: mm: mempolicy: Hide MPOL_NOOP and MPOL_MF_LAZY from userspace for now The use of MPOL_NOOP and MPOL_MF_LAZY to allow an application to explicitly request lazy migration is a good idea but the actual API has not been well reviewed and once released we have to support it. For now this patch prevents an application using the services. This will need to be revisited. Signed-off-by: Mel Gorman --- include/uapi/linux/mempolicy.h | 4 +--- mm/mempolicy.c | 9 ++++----- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 6a1baae3775d..16fb4e6efbc4 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -21,7 +21,6 @@ enum { MPOL_BIND, MPOL_INTERLEAVE, MPOL_LOCAL, - MPOL_NOOP, /* retain existing policy for range */ MPOL_MAX, /* always last member of enum */ }; @@ -57,8 +56,7 @@ enum mpol_rebind_step { #define MPOL_MF_VALID (MPOL_MF_STRICT | \ MPOL_MF_MOVE | \ - MPOL_MF_MOVE_ALL | \ - MPOL_MF_LAZY) + MPOL_MF_MOVE_ALL) /* * Internal flags that share the struct mempolicy flags word with diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 75d4600a5e92..a7a62fe7c280 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -252,7 +252,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, pr_debug("setting mode %d flags %d nodes[0] %lx\n", mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); - if (mode == MPOL_DEFAULT || mode == MPOL_NOOP) { + if (mode == MPOL_DEFAULT) { if (nodes && !nodes_empty(*nodes)) return ERR_PTR(-EINVAL); return NULL; @@ -1186,7 +1186,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (start & ~PAGE_MASK) return -EINVAL; - if (mode == MPOL_DEFAULT || mode == MPOL_NOOP) + if (mode == MPOL_DEFAULT) flags &= ~MPOL_MF_STRICT; len = (len + PAGE_SIZE - 1) & PAGE_MASK; @@ -1241,7 +1241,7 @@ static long do_mbind(unsigned long start, unsigned long len, flags | MPOL_MF_INVERT, &pagelist); err = PTR_ERR(vma); /* maybe ... */ - if (!IS_ERR(vma) && mode != MPOL_NOOP) + if (!IS_ERR(vma)) err = mbind_range(mm, start, end, new); if (!err) { @@ -2530,7 +2530,6 @@ static const char * const policy_modes[] = [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", [MPOL_LOCAL] = "local", - [MPOL_NOOP] = "noop", /* should not actually be used */ }; @@ -2581,7 +2580,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) break; } } - if (mode >= MPOL_MAX || mode == MPOL_NOOP) + if (mode >= MPOL_MAX) goto out; switch (mode) { -- cgit v1.2.3 From 5606e3877ad8baea42f3a71ebde0a03622bbb551 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 2 Nov 2012 18:19:13 +0000 Subject: mm: numa: Migrate on reference policy This is the simplest possible policy that still does something of note. When a pte_numa is faulted, it is moved immediately. Any replacement policy must at least do better than this and in all likelihood this policy regresses normal workloads. Signed-off-by: Mel Gorman Acked-by: Rik van Riel --- include/uapi/linux/mempolicy.h | 1 + mm/mempolicy.c | 38 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 16fb4e6efbc4..0d11c3dcd3a1 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -67,6 +67,7 @@ enum mpol_rebind_step { #define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */ #define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */ #define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */ +#define MPOL_F_MORON (1 << 4) /* Migrate On pte_numa Reference On Node */ #endif /* _UAPI_LINUX_MEMPOLICY_H */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 516491fbfaa8..4c1c8d83ac6a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -118,6 +118,26 @@ static struct mempolicy default_policy = { .flags = MPOL_F_LOCAL, }; +static struct mempolicy preferred_node_policy[MAX_NUMNODES]; + +static struct mempolicy *get_task_policy(struct task_struct *p) +{ + struct mempolicy *pol = p->mempolicy; + int node; + + if (!pol) { + node = numa_node_id(); + if (node != -1) + pol = &preferred_node_policy[node]; + + /* preferred_node_policy is not initialised early in boot */ + if (!pol->mode) + pol = NULL; + } + + return pol; +} + static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); /* @@ -1598,7 +1618,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, struct mempolicy *get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) { - struct mempolicy *pol = task->mempolicy; + struct mempolicy *pol = get_task_policy(task); if (vma) { if (vma->vm_ops && vma->vm_ops->get_policy) { @@ -2021,7 +2041,7 @@ retry_cpuset: */ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { - struct mempolicy *pol = current->mempolicy; + struct mempolicy *pol = get_task_policy(current); struct page *page; unsigned int cpuset_mems_cookie; @@ -2295,6 +2315,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long default: BUG(); } + + /* Migrate the page towards the node whose CPU is referencing it */ + if (pol->flags & MPOL_F_MORON) + polnid = numa_node_id(); + if (curnid != polnid) ret = polnid; out: @@ -2483,6 +2508,15 @@ void __init numa_policy_init(void) sizeof(struct sp_node), 0, SLAB_PANIC, NULL); + for_each_node(nid) { + preferred_node_policy[nid] = (struct mempolicy) { + .refcnt = ATOMIC_INIT(1), + .mode = MPOL_PREFERRED, + .flags = MPOL_F_MOF | MPOL_F_MORON, + .v = { .preferred_node = nid, }, + }; + } + /* * Set interleaving policy for system init. Interleaving is only * enabled across suitably sized nodes (default is >= 16MB), or -- cgit v1.2.3