diff options
author | Lee Schermerhorn <lee.schermerhorn@hp.com> | 2012-10-25 14:16:32 +0200 |
---|---|---|
committer | Mel Gorman <mgorman@suse.de> | 2012-12-11 14:42:43 +0000 |
commit | b24f53a0bea38b266d219ee651b22dba727c44ae (patch) | |
tree | f85431707b44913a412efb5483dc366c310aab5e /mm | |
parent | 4daae3b4b9e49b7e0935499a352f1c59d90287d2 (diff) | |
download | lwn-b24f53a0bea38b266d219ee651b22dba727c44ae.tar.gz lwn-b24f53a0bea38b266d219ee651b22dba727c44ae.zip |
mm: mempolicy: Add MPOL_MF_LAZY
NOTE: Once again there is a lot of patch stealing and the end result
is sufficiently different that I had to drop the signed-offs.
Will re-add if the original authors are ok with that.
This patch adds another mbind() flag to request "lazy migration". The
flag, MPOL_MF_LAZY, modifies MPOL_MF_MOVE* such that the selected
pages are marked PROT_NONE. The pages will be migrated in the fault
path on "first touch", if the policy dictates at that time.
"Lazy Migration" will allow testing of migrate-on-fault via mbind().
Also allows applications to specify that only subsequently touched
pages be migrated to obey new policy, instead of all pages in range.
This can be useful for multi-threaded applications working on a
large shared data area that is initialized by an initial thread
resulting in all pages on one [or a few, if overflowed] nodes.
After PROT_NONE, the pages in regions assigned to the worker threads
will be automatically migrated local to the threads on 1st touch.
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/mempolicy.c | 185 |
1 files changed, 170 insertions, 15 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index df1466d3d2d8..51d3ebd8561e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -90,6 +90,7 @@ #include <linux/syscalls.h> #include <linux/ctype.h> #include <linux/mm_inline.h> +#include <linux/mmu_notifier.h> #include <asm/tlbflush.h> #include <asm/uaccess.h> @@ -565,6 +566,145 @@ static inline int check_pgd_range(struct vm_area_struct *vma, return 0; } +#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE +/* + * Here we search for not shared page mappings (mapcount == 1) and we + * set up the pmd/pte_numa on those mappings so the very next access + * will fire a NUMA hinting page fault. + */ +static int +change_prot_numa_range(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte, *_pte; + struct page *page; + unsigned long _address, end; + spinlock_t *ptl; + int ret = 0; + + VM_BUG_ON(address & ~PAGE_MASK); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + goto out; + + if (pmd_trans_huge_lock(pmd, vma) == 1) { + int page_nid; + ret = HPAGE_PMD_NR; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + if (pmd_numa(*pmd)) { + spin_unlock(&mm->page_table_lock); + goto out; + } + + page = pmd_page(*pmd); + + /* only check non-shared pages */ + if (page_mapcount(page) != 1) { + spin_unlock(&mm->page_table_lock); + goto out; + } + + page_nid = page_to_nid(page); + + if (pmd_numa(*pmd)) { + spin_unlock(&mm->page_table_lock); + goto out; + } + + set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd)); + ret += HPAGE_PMD_NR; + /* defer TLB flush to lower the overhead */ + spin_unlock(&mm->page_table_lock); + goto out; + } + + if (pmd_trans_unstable(pmd)) + goto out; + VM_BUG_ON(!pmd_present(*pmd)); + + end = min(vma->vm_end, (address + PMD_SIZE) & PMD_MASK); + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + for (_address = address, _pte = pte; _address < end; + _pte++, _address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (!pte_present(pteval)) + continue; + if (pte_numa(pteval)) + continue; + page = vm_normal_page(vma, _address, pteval); + if (unlikely(!page)) + continue; + /* only check non-shared pages */ + if (page_mapcount(page) != 1) + continue; + + set_pte_at(mm, _address, _pte, pte_mknuma(pteval)); + + /* defer TLB flush to lower the overhead */ + ret++; + } + pte_unmap_unlock(pte, ptl); + + if (ret && !pmd_numa(*pmd)) { + spin_lock(&mm->page_table_lock); + set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd)); + spin_unlock(&mm->page_table_lock); + /* defer TLB flush to lower the overhead */ + } + +out: + return ret; +} + +/* Assumes mmap_sem is held */ +void +change_prot_numa(struct vm_area_struct *vma, + unsigned long address, unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + int progress = 0; + + while (address < end) { + VM_BUG_ON(address < vma->vm_start || + address + PAGE_SIZE > vma->vm_end); + + progress += change_prot_numa_range(mm, vma, address); + address = (address + PMD_SIZE) & PMD_MASK; + } + + /* + * Flush the TLB for the mm to start the NUMA hinting + * page faults after we finish scanning this vma part + * if there were any PTE updates + */ + if (progress) { + mmu_notifier_invalidate_range_start(vma->vm_mm, address, end); + flush_tlb_range(vma, address, end); + mmu_notifier_invalidate_range_end(vma->vm_mm, address, end); + } +} +#else +static unsigned long change_prot_numa(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + return 0; +} +#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ + /* * Check if all pages in a range are on a set of nodes. * If pagelist != NULL then isolate pages from the LRU and @@ -583,22 +723,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, return ERR_PTR(-EFAULT); prev = NULL; for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { + unsigned long endvma = vma->vm_end; + + if (endvma > end) + endvma = end; + if (vma->vm_start > start) + start = vma->vm_start; + if (!(flags & MPOL_MF_DISCONTIG_OK)) { if (!vma->vm_next && vma->vm_end < end) return ERR_PTR(-EFAULT); if (prev && prev->vm_end < vma->vm_start) return ERR_PTR(-EFAULT); } - if (!is_vm_hugetlb_page(vma) && - ((flags & MPOL_MF_STRICT) || + + if (is_vm_hugetlb_page(vma)) + goto next; + + if (flags & MPOL_MF_LAZY) { + change_prot_numa(vma, start, endvma); + goto next; + } + + if ((flags & MPOL_MF_STRICT) || ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && - vma_migratable(vma)))) { - unsigned long endvma = vma->vm_end; + vma_migratable(vma))) { - if (endvma > end) - endvma = end; - if (vma->vm_start > start) - start = vma->vm_start; err = check_pgd_range(vma, start, endvma, nodes, flags, private); if (err) { @@ -606,6 +756,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, break; } } +next: prev = vma; } return first; @@ -1138,8 +1289,7 @@ static long do_mbind(unsigned long start, unsigned long len, int err; LIST_HEAD(pagelist); - if (flags & ~(unsigned long)(MPOL_MF_STRICT | - MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + if (flags & ~(unsigned long)MPOL_MF_VALID) return -EINVAL; if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) return -EPERM; @@ -1162,6 +1312,9 @@ static long do_mbind(unsigned long start, unsigned long len, if (IS_ERR(new)) return PTR_ERR(new); + if (flags & MPOL_MF_LAZY) + new->flags |= MPOL_F_MOF; + /* * If we are using the default policy then operation * on discontinuous address spaces is okay after all @@ -1198,13 +1351,15 @@ static long do_mbind(unsigned long start, unsigned long len, vma = check_range(mm, start, end, nmask, flags | MPOL_MF_INVERT, &pagelist); - err = PTR_ERR(vma); - if (!IS_ERR(vma)) { - int nr_failed = 0; - + err = PTR_ERR(vma); /* maybe ... */ + if (!IS_ERR(vma) && mode != MPOL_NOOP) err = mbind_range(mm, start, end, new); + if (!err) { + int nr_failed = 0; + if (!list_empty(&pagelist)) { + WARN_ON_ONCE(flags & MPOL_MF_LAZY); nr_failed = migrate_pages(&pagelist, new_vma_page, (unsigned long)vma, false, MIGRATE_SYNC, @@ -1213,7 +1368,7 @@ static long do_mbind(unsigned long start, unsigned long len, putback_lru_pages(&pagelist); } - if (!err && nr_failed && (flags & MPOL_MF_STRICT)) + if (nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; } else putback_lru_pages(&pagelist); |