summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/filemap.c10
-rw-r--r--mm/hugetlb.c177
-rw-r--r--mm/madvise.c103
-rw-r--r--mm/memory.c74
-rw-r--r--mm/mempolicy.c110
-rw-r--r--mm/mmap.c120
-rw-r--r--mm/mremap.c7
-rw-r--r--mm/msync.c2
-rw-r--r--mm/nommu.c5
-rw-r--r--mm/oom_kill.c7
-rw-r--r--mm/page_alloc.c423
-rw-r--r--mm/rmap.c29
-rw-r--r--mm/shmem.c143
-rw-r--r--mm/slab.c7
-rw-r--r--mm/swapfile.c57
-rw-r--r--mm/vmalloc.c33
-rw-r--r--mm/vmscan.c103
17 files changed, 1010 insertions, 400 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 47263ac3e4ea..4a2fee2cb62b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1004,7 +1004,7 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
if (pos < size) {
retval = generic_file_direct_IO(READ, iocb,
iov, pos, nr_segs);
- if (retval >= 0 && !is_sync_kiocb(iocb))
+ if (retval > 0 && !is_sync_kiocb(iocb))
retval = -EIOCBQUEUED;
if (retval > 0)
*ppos = pos + retval;
@@ -1968,6 +1968,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
do {
unsigned long index;
unsigned long offset;
+ unsigned long maxlen;
size_t copied;
offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
@@ -1982,7 +1983,10 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
* same page as we're writing to, without it being marked
* up-to-date.
*/
- fault_in_pages_readable(buf, bytes);
+ maxlen = cur_iov->iov_len - iov_base;
+ if (maxlen > bytes)
+ maxlen = bytes;
+ fault_in_pages_readable(buf, maxlen);
page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
if (!page) {
@@ -2024,6 +2028,8 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
filemap_set_next_iovec(&cur_iov,
&iov_base, status);
buf = cur_iov->iov_base + iov_base;
+ } else {
+ iov_base += status;
}
}
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4eb5ae3fbe10..fbd1111ea119 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7,10 +7,14 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
-#include <linux/hugetlb.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
+#include <linux/pagemap.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+
+#include <linux/hugetlb.h>
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages;
@@ -249,6 +253,72 @@ struct vm_operations_struct hugetlb_vm_ops = {
.nopage = hugetlb_nopage,
};
+static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
+{
+ pte_t entry;
+
+ if (vma->vm_flags & VM_WRITE) {
+ entry =
+ pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
+ } else {
+ entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
+ }
+ entry = pte_mkyoung(entry);
+ entry = pte_mkhuge(entry);
+
+ return entry;
+}
+
+int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
+ struct vm_area_struct *vma)
+{
+ pte_t *src_pte, *dst_pte, entry;
+ struct page *ptepage;
+ unsigned long addr = vma->vm_start;
+ unsigned long end = vma->vm_end;
+
+ while (addr < end) {
+ dst_pte = huge_pte_alloc(dst, addr);
+ if (!dst_pte)
+ goto nomem;
+ src_pte = huge_pte_offset(src, addr);
+ BUG_ON(!src_pte || pte_none(*src_pte)); /* prefaulted */
+ entry = *src_pte;
+ ptepage = pte_page(entry);
+ get_page(ptepage);
+ add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
+ set_huge_pte_at(dst, addr, dst_pte, entry);
+ addr += HPAGE_SIZE;
+ }
+ return 0;
+
+nomem:
+ return -ENOMEM;
+}
+
+void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long address;
+ pte_t pte;
+ struct page *page;
+
+ WARN_ON(!is_vm_hugetlb_page(vma));
+ BUG_ON(start & ~HPAGE_MASK);
+ BUG_ON(end & ~HPAGE_MASK);
+
+ for (address = start; address < end; address += HPAGE_SIZE) {
+ pte = huge_ptep_get_and_clear(mm, address, huge_pte_offset(mm, address));
+ if (pte_none(pte))
+ continue;
+ page = pte_page(pte);
+ put_page(page);
+ }
+ add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT));
+ flush_tlb_range(vma, start, end);
+}
+
void zap_hugepage_range(struct vm_area_struct *vma,
unsigned long start, unsigned long length)
{
@@ -258,3 +328,108 @@ void zap_hugepage_range(struct vm_area_struct *vma,
unmap_hugepage_range(vma, start, start + length);
spin_unlock(&mm->page_table_lock);
}
+
+int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long addr;
+ int ret = 0;
+
+ WARN_ON(!is_vm_hugetlb_page(vma));
+ BUG_ON(vma->vm_start & ~HPAGE_MASK);
+ BUG_ON(vma->vm_end & ~HPAGE_MASK);
+
+ hugetlb_prefault_arch_hook(mm);
+
+ spin_lock(&mm->page_table_lock);
+ for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
+ unsigned long idx;
+ pte_t *pte = huge_pte_alloc(mm, addr);
+ struct page *page;
+
+ if (!pte) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ if (! pte_none(*pte))
+ hugetlb_clean_stale_pgtable(pte);
+
+ idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
+ + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+ page = find_get_page(mapping, idx);
+ if (!page) {
+ /* charge the fs quota first */
+ if (hugetlb_get_quota(mapping)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ page = alloc_huge_page();
+ if (!page) {
+ hugetlb_put_quota(mapping);
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
+ if (! ret) {
+ unlock_page(page);
+ } else {
+ hugetlb_put_quota(mapping);
+ free_huge_page(page);
+ goto out;
+ }
+ }
+ add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
+ set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
+ }
+out:
+ spin_unlock(&mm->page_table_lock);
+ return ret;
+}
+
+int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct page **pages, struct vm_area_struct **vmas,
+ unsigned long *position, int *length, int i)
+{
+ unsigned long vpfn, vaddr = *position;
+ int remainder = *length;
+
+ BUG_ON(!is_vm_hugetlb_page(vma));
+
+ vpfn = vaddr/PAGE_SIZE;
+ while (vaddr < vma->vm_end && remainder) {
+
+ if (pages) {
+ pte_t *pte;
+ struct page *page;
+
+ /* Some archs (sparc64, sh*) have multiple
+ * pte_ts to each hugepage. We have to make
+ * sure we get the first, for the page
+ * indexing below to work. */
+ pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
+
+ /* hugetlb should be locked, and hence, prefaulted */
+ WARN_ON(!pte || pte_none(*pte));
+
+ page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
+
+ WARN_ON(!PageCompound(page));
+
+ get_page(page);
+ pages[i] = page;
+ }
+
+ if (vmas)
+ vmas[i] = vma;
+
+ vaddr += PAGE_SIZE;
+ ++vpfn;
+ --remainder;
+ ++i;
+ }
+
+ *length = remainder;
+ *position = vaddr;
+
+ return i;
+}
diff --git a/mm/madvise.c b/mm/madvise.c
index 944b5e52d812..e3108054733c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -8,17 +8,47 @@
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
+#include <linux/mempolicy.h>
#include <linux/hugetlb.h>
/*
* We can potentially split a vm area into separate
* areas, each area with its own behavior.
*/
-static long madvise_behavior(struct vm_area_struct * vma, unsigned long start,
- unsigned long end, int behavior)
+static long madvise_behavior(struct vm_area_struct * vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end, int behavior)
{
struct mm_struct * mm = vma->vm_mm;
int error = 0;
+ pgoff_t pgoff;
+ int new_flags = vma->vm_flags & ~VM_READHINTMASK;
+
+ switch (behavior) {
+ case MADV_SEQUENTIAL:
+ new_flags |= VM_SEQ_READ;
+ break;
+ case MADV_RANDOM:
+ new_flags |= VM_RAND_READ;
+ break;
+ default:
+ break;
+ }
+
+ if (new_flags == vma->vm_flags) {
+ *prev = vma;
+ goto success;
+ }
+
+ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+ *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
+ vma->vm_file, pgoff, vma_policy(vma));
+ if (*prev) {
+ vma = *prev;
+ goto success;
+ }
+
+ *prev = vma;
if (start != vma->vm_start) {
error = split_vma(mm, vma, start, 1);
@@ -36,21 +66,12 @@ static long madvise_behavior(struct vm_area_struct * vma, unsigned long start,
* vm_flags is protected by the mmap_sem held in write mode.
*/
VM_ClearReadHint(vma);
-
- switch (behavior) {
- case MADV_SEQUENTIAL:
- vma->vm_flags |= VM_SEQ_READ;
- break;
- case MADV_RANDOM:
- vma->vm_flags |= VM_RAND_READ;
- break;
- default:
- break;
- }
+ vma->vm_flags = new_flags;
out:
if (error == -ENOMEM)
error = -EAGAIN;
+success:
return error;
}
@@ -58,6 +79,7 @@ out:
* Schedule all required I/O operations. Do not wait for completion.
*/
static long madvise_willneed(struct vm_area_struct * vma,
+ struct vm_area_struct ** prev,
unsigned long start, unsigned long end)
{
struct file *file = vma->vm_file;
@@ -65,6 +87,7 @@ static long madvise_willneed(struct vm_area_struct * vma,
if (!file)
return -EBADF;
+ *prev = vma;
start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
if (end > vma->vm_end)
end = vma->vm_end;
@@ -95,8 +118,10 @@ static long madvise_willneed(struct vm_area_struct * vma,
* dirty pages is already available as msync(MS_INVALIDATE).
*/
static long madvise_dontneed(struct vm_area_struct * vma,
+ struct vm_area_struct ** prev,
unsigned long start, unsigned long end)
{
+ *prev = vma;
if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma))
return -EINVAL;
@@ -111,8 +136,8 @@ static long madvise_dontneed(struct vm_area_struct * vma,
return 0;
}
-static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
- unsigned long end, int behavior)
+static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
+ unsigned long start, unsigned long end, int behavior)
{
long error = -EBADF;
@@ -120,15 +145,15 @@ static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
case MADV_NORMAL:
case MADV_SEQUENTIAL:
case MADV_RANDOM:
- error = madvise_behavior(vma, start, end, behavior);
+ error = madvise_behavior(vma, prev, start, end, behavior);
break;
case MADV_WILLNEED:
- error = madvise_willneed(vma, start, end);
+ error = madvise_willneed(vma, prev, start, end);
break;
case MADV_DONTNEED:
- error = madvise_dontneed(vma, start, end);
+ error = madvise_dontneed(vma, prev, start, end);
break;
default:
@@ -175,8 +200,8 @@ static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
*/
asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
{
- unsigned long end;
- struct vm_area_struct * vma;
+ unsigned long end, tmp;
+ struct vm_area_struct * vma, *prev;
int unmapped_error = 0;
int error = -EINVAL;
size_t len;
@@ -202,40 +227,42 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
/*
* If the interval [start,end) covers some unmapped address
* ranges, just ignore them, but return -ENOMEM at the end.
+ * - different from the way of handling in mlock etc.
*/
- vma = find_vma(current->mm, start);
+ vma = find_vma_prev(current->mm, start, &prev);
+ if (!vma && prev)
+ vma = prev->vm_next;
for (;;) {
/* Still start < end. */
error = -ENOMEM;
if (!vma)
goto out;
- /* Here start < vma->vm_end. */
+ /* Here start < (end|vma->vm_end). */
if (start < vma->vm_start) {
unmapped_error = -ENOMEM;
start = vma->vm_start;
+ if (start >= end)
+ goto out;
}
- /* Here vma->vm_start <= start < vma->vm_end. */
- if (end <= vma->vm_end) {
- if (start < end) {
- error = madvise_vma(vma, start, end,
- behavior);
- if (error)
- goto out;
- }
- error = unmapped_error;
- goto out;
- }
+ /* Here vma->vm_start <= start < (end|vma->vm_end) */
+ tmp = vma->vm_end;
+ if (end < tmp)
+ tmp = end;
- /* Here vma->vm_start <= start < vma->vm_end < end. */
- error = madvise_vma(vma, start, vma->vm_end, behavior);
+ /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
+ error = madvise_vma(vma, &prev, start, tmp, behavior);
if (error)
goto out;
- start = vma->vm_end;
- vma = vma->vm_next;
+ start = tmp;
+ if (start < prev->vm_end)
+ start = prev->vm_end;
+ error = unmapped_error;
+ if (start >= end)
+ goto out;
+ vma = prev->vm_next;
}
-
out:
up_write(&current->mm->mmap_sem);
return error;
diff --git a/mm/memory.c b/mm/memory.c
index 6bad4c4064e7..da91b7bf9986 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -840,23 +840,8 @@ check_user_page_readable(struct mm_struct *mm, unsigned long address)
{
return __follow_page(mm, address, /*read*/1, /*write*/0) != NULL;
}
-
EXPORT_SYMBOL(check_user_page_readable);
-/*
- * Given a physical address, is there a useful struct page pointing to
- * it? This may become more complex in the future if we start dealing
- * with IO-aperture pages for direct-IO.
- */
-
-static inline struct page *get_page_map(struct page *page)
-{
- if (!pfn_valid(page_to_pfn(page)))
- return NULL;
- return page;
-}
-
-
static inline int
untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
unsigned long address)
@@ -887,7 +872,6 @@ untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
return 0;
}
-
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int write, int force,
struct page **pages, struct vm_area_struct **vmas)
@@ -951,21 +935,21 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
}
spin_lock(&mm->page_table_lock);
do {
- struct page *map;
+ struct page *page;
int lookup_write = write;
cond_resched_lock(&mm->page_table_lock);
- while (!(map = follow_page(mm, start, lookup_write))) {
+ while (!(page = follow_page(mm, start, lookup_write))) {
/*
* Shortcut for anonymous pages. We don't want
* to force the creation of pages tables for
- * insanly big anonymously mapped areas that
+ * insanely big anonymously mapped areas that
* nobody touched so far. This is important
* for doing a core dump for these mappings.
*/
if (!lookup_write &&
untouched_anonymous_page(mm,vma,start)) {
- map = ZERO_PAGE(start);
+ page = ZERO_PAGE(start);
break;
}
spin_unlock(&mm->page_table_lock);
@@ -994,30 +978,21 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
spin_lock(&mm->page_table_lock);
}
if (pages) {
- pages[i] = get_page_map(map);
- if (!pages[i]) {
- spin_unlock(&mm->page_table_lock);
- while (i--)
- page_cache_release(pages[i]);
- i = -EFAULT;
- goto out;
- }
- flush_dcache_page(pages[i]);
- if (!PageReserved(pages[i]))
- page_cache_get(pages[i]);
+ pages[i] = page;
+ flush_dcache_page(page);
+ if (!PageReserved(page))
+ page_cache_get(page);
}
if (vmas)
vmas[i] = vma;
i++;
start += PAGE_SIZE;
len--;
- } while(len && start < vma->vm_end);
+ } while (len && start < vma->vm_end);
spin_unlock(&mm->page_table_lock);
- } while(len);
-out:
+ } while (len);
return i;
}
-
EXPORT_SYMBOL(get_user_pages);
static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
@@ -1264,7 +1239,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
}
old_page = pfn_to_page(pfn);
- if (!TestSetPageLocked(old_page)) {
+ if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
int reuse = can_share_swap_page(old_page);
unlock_page(old_page);
if (reuse) {
@@ -1701,19 +1676,16 @@ static int do_swap_page(struct mm_struct * mm,
spin_lock(&mm->page_table_lock);
page_table = pte_offset_map(pmd, address);
if (unlikely(!pte_same(*page_table, orig_pte))) {
- pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
- unlock_page(page);
- page_cache_release(page);
ret = VM_FAULT_MINOR;
- goto out;
+ goto out_nomap;
+ }
+
+ if (unlikely(!PageUptodate(page))) {
+ ret = VM_FAULT_SIGBUS;
+ goto out_nomap;
}
/* The page isn't present yet, go ahead with the fault. */
-
- swap_free(entry);
- if (vm_swap_full())
- remove_exclusive_swap_page(page);
inc_mm_counter(mm, rss);
pte = mk_pte(page, vma->vm_page_prot);
@@ -1721,12 +1693,16 @@ static int do_swap_page(struct mm_struct * mm,
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
write_access = 0;
}
- unlock_page(page);
flush_icache_page(vma, page);
set_pte_at(mm, address, page_table, pte);
page_add_anon_rmap(page, vma, address);
+ swap_free(entry);
+ if (vm_swap_full())
+ remove_exclusive_swap_page(page);
+ unlock_page(page);
+
if (write_access) {
if (do_wp_page(mm, vma, address,
page_table, pmd, pte) == VM_FAULT_OOM)
@@ -1741,6 +1717,12 @@ static int do_swap_page(struct mm_struct * mm,
spin_unlock(&mm->page_table_lock);
out:
return ret;
+out_nomap:
+ pte_unmap(page_table);
+ spin_unlock(&mm->page_table_lock);
+ unlock_page(page);
+ page_cache_release(page);
+ goto out;
}
/*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 08c41da429cf..cb41c31e7c87 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -238,46 +238,80 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
}
/* Ensure all existing pages follow the policy. */
-static int
-verify_pages(struct mm_struct *mm,
- unsigned long addr, unsigned long end, unsigned long *nodes)
+static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, unsigned long end, unsigned long *nodes)
{
- while (addr < end) {
- struct page *p;
- pte_t *pte;
- pmd_t *pmd;
- pud_t *pud;
- pgd_t *pgd;
- pgd = pgd_offset(mm, addr);
- if (pgd_none(*pgd)) {
- unsigned long next = (addr + PGDIR_SIZE) & PGDIR_MASK;
- if (next > addr)
- break;
- addr = next;
+ pte_t *orig_pte;
+ pte_t *pte;
+
+ spin_lock(&mm->page_table_lock);
+ orig_pte = pte = pte_offset_map(pmd, addr);
+ do {
+ unsigned long pfn;
+ unsigned int nid;
+
+ if (!pte_present(*pte))
continue;
- }
- pud = pud_offset(pgd, addr);
- if (pud_none(*pud)) {
- addr = (addr + PUD_SIZE) & PUD_MASK;
+ pfn = pte_pfn(*pte);
+ if (!pfn_valid(pfn))
continue;
- }
- pmd = pmd_offset(pud, addr);
- if (pmd_none(*pmd)) {
- addr = (addr + PMD_SIZE) & PMD_MASK;
+ nid = pfn_to_nid(pfn);
+ if (!test_bit(nid, nodes))
+ break;
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+ pte_unmap(orig_pte);
+ spin_unlock(&mm->page_table_lock);
+ return addr != end;
+}
+
+static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
+ unsigned long addr, unsigned long end, unsigned long *nodes)
+{
+ pmd_t *pmd;
+ unsigned long next;
+
+ pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ if (pmd_none_or_clear_bad(pmd))
continue;
- }
- p = NULL;
- pte = pte_offset_map(pmd, addr);
- if (pte_present(*pte))
- p = pte_page(*pte);
- pte_unmap(pte);
- if (p) {
- unsigned nid = page_to_nid(p);
- if (!test_bit(nid, nodes))
- return -EIO;
- }
- addr += PAGE_SIZE;
- }
+ if (check_pte_range(mm, pmd, addr, next, nodes))
+ return -EIO;
+ } while (pmd++, addr = next, addr != end);
+ return 0;
+}
+
+static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
+ unsigned long addr, unsigned long end, unsigned long *nodes)
+{
+ pud_t *pud;
+ unsigned long next;
+
+ pud = pud_offset(pgd, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ if (pud_none_or_clear_bad(pud))
+ continue;
+ if (check_pmd_range(mm, pud, addr, next, nodes))
+ return -EIO;
+ } while (pud++, addr = next, addr != end);
+ return 0;
+}
+
+static inline int check_pgd_range(struct mm_struct *mm,
+ unsigned long addr, unsigned long end, unsigned long *nodes)
+{
+ pgd_t *pgd;
+ unsigned long next;
+
+ pgd = pgd_offset(mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(pgd))
+ continue;
+ if (check_pud_range(mm, pgd, addr, next, nodes))
+ return -EIO;
+ } while (pgd++, addr = next, addr != end);
return 0;
}
@@ -299,7 +333,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
if (prev && prev->vm_end < vma->vm_start)
return ERR_PTR(-EFAULT);
if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
- err = verify_pages(vma->vm_mm,
+ err = check_pgd_range(vma->vm_mm,
vma->vm_start, vma->vm_end, nodes);
if (err) {
first = ERR_PTR(err);
@@ -721,7 +755,7 @@ static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned or
zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
page = __alloc_pages(gfp, order, zl);
if (page && page_zone(page) == zl->zones[0]) {
- zl->zones[0]->pageset[get_cpu()].interleave_hit++;
+ zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
put_cpu();
}
return page;
diff --git a/mm/mmap.c b/mm/mmap.c
index 01f9793591f6..da3fa90a0aae 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1175,7 +1175,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
(!vma || addr + len <= vma->vm_start))
return addr;
}
- start_addr = addr = mm->free_area_cache;
+ if (len > mm->cached_hole_size) {
+ start_addr = addr = mm->free_area_cache;
+ } else {
+ start_addr = addr = TASK_UNMAPPED_BASE;
+ mm->cached_hole_size = 0;
+ }
full_search:
for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
@@ -1186,7 +1191,9 @@ full_search:
* some holes.
*/
if (start_addr != TASK_UNMAPPED_BASE) {
- start_addr = addr = TASK_UNMAPPED_BASE;
+ addr = TASK_UNMAPPED_BASE;
+ start_addr = addr;
+ mm->cached_hole_size = 0;
goto full_search;
}
return -ENOMEM;
@@ -1198,19 +1205,22 @@ full_search:
mm->free_area_cache = addr + len;
return addr;
}
+ if (addr + mm->cached_hole_size < vma->vm_start)
+ mm->cached_hole_size = vma->vm_start - addr;
addr = vma->vm_end;
}
}
#endif
-void arch_unmap_area(struct vm_area_struct *area)
+void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
{
/*
* Is this a new hole at the lowest possible address?
*/
- if (area->vm_start >= TASK_UNMAPPED_BASE &&
- area->vm_start < area->vm_mm->free_area_cache)
- area->vm_mm->free_area_cache = area->vm_start;
+ if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) {
+ mm->free_area_cache = addr;
+ mm->cached_hole_size = ~0UL;
+ }
}
/*
@@ -1240,17 +1250,26 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
return addr;
}
+ /* check if free_area_cache is useful for us */
+ if (len <= mm->cached_hole_size) {
+ mm->cached_hole_size = 0;
+ mm->free_area_cache = mm->mmap_base;
+ }
+
/* either no address requested or can't fit in requested address hole */
addr = mm->free_area_cache;
/* make sure it can fit in the remaining address space */
- if (addr >= len) {
+ if (addr > len) {
vma = find_vma(mm, addr-len);
if (!vma || addr <= vma->vm_start)
/* remember the address as a hint for next time */
return (mm->free_area_cache = addr-len);
}
+ if (mm->mmap_base < len)
+ goto bottomup;
+
addr = mm->mmap_base-len;
do {
@@ -1264,75 +1283,85 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
/* remember the address as a hint for next time */
return (mm->free_area_cache = addr);
+ /* remember the largest hole we saw so far */
+ if (addr + mm->cached_hole_size < vma->vm_start)
+ mm->cached_hole_size = vma->vm_start - addr;
+
/* try just below the current vma->vm_start */
addr = vma->vm_start-len;
- } while (len <= vma->vm_start);
+ } while (len < vma->vm_start);
+bottomup:
/*
* A failed mmap() very likely causes application failure,
* so fall back to the bottom-up function here. This scenario
* can happen with large stack limits and large mmap()
* allocations.
*/
- mm->free_area_cache = TASK_UNMAPPED_BASE;
+ mm->cached_hole_size = ~0UL;
+ mm->free_area_cache = TASK_UNMAPPED_BASE;
addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
/*
* Restore the topdown base:
*/
mm->free_area_cache = mm->mmap_base;
+ mm->cached_hole_size = ~0UL;
return addr;
}
#endif
-void arch_unmap_area_topdown(struct vm_area_struct *area)
+void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
{
/*
* Is this a new hole at the highest possible address?
*/
- if (area->vm_end > area->vm_mm->free_area_cache)
- area->vm_mm->free_area_cache = area->vm_end;
+ if (addr > mm->free_area_cache)
+ mm->free_area_cache = addr;
/* dont allow allocations above current base */
- if (area->vm_mm->free_area_cache > area->vm_mm->mmap_base)
- area->vm_mm->free_area_cache = area->vm_mm->mmap_base;
+ if (mm->free_area_cache > mm->mmap_base)
+ mm->free_area_cache = mm->mmap_base;
}
unsigned long
get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
- if (flags & MAP_FIXED) {
- unsigned long ret;
+ unsigned long ret;
- if (addr > TASK_SIZE - len)
- return -ENOMEM;
- if (addr & ~PAGE_MASK)
- return -EINVAL;
- if (file && is_file_hugepages(file)) {
- /*
- * Check if the given range is hugepage aligned, and
- * can be made suitable for hugepages.
- */
- ret = prepare_hugepage_range(addr, len);
- } else {
- /*
- * Ensure that a normal request is not falling in a
- * reserved hugepage range. For some archs like IA-64,
- * there is a separate region for hugepages.
- */
- ret = is_hugepage_only_range(current->mm, addr, len);
- }
- if (ret)
- return -EINVAL;
- return addr;
- }
+ if (!(flags & MAP_FIXED)) {
+ unsigned long (*get_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
- if (file && file->f_op && file->f_op->get_unmapped_area)
- return file->f_op->get_unmapped_area(file, addr, len,
- pgoff, flags);
+ get_area = current->mm->get_unmapped_area;
+ if (file && file->f_op && file->f_op->get_unmapped_area)
+ get_area = file->f_op->get_unmapped_area;
+ addr = get_area(file, addr, len, pgoff, flags);
+ if (IS_ERR_VALUE(addr))
+ return addr;
+ }
- return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
+ if (addr > TASK_SIZE - len)
+ return -ENOMEM;
+ if (addr & ~PAGE_MASK)
+ return -EINVAL;
+ if (file && is_file_hugepages(file)) {
+ /*
+ * Check if the given range is hugepage aligned, and
+ * can be made suitable for hugepages.
+ */
+ ret = prepare_hugepage_range(addr, len);
+ } else {
+ /*
+ * Ensure that a normal request is not falling in a
+ * reserved hugepage range. For some archs like IA-64,
+ * there is a separate region for hugepages.
+ */
+ ret = is_hugepage_only_range(current->mm, addr, len);
+ }
+ if (ret)
+ return -EINVAL;
+ return addr;
}
EXPORT_SYMBOL(get_unmapped_area);
@@ -1592,7 +1621,6 @@ static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
if (area->vm_flags & VM_LOCKED)
area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
vm_stat_unaccount(area);
- area->vm_mm->unmap_area(area);
remove_vm_struct(area);
}
@@ -1646,6 +1674,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct vm_area_struct **insertion_point;
struct vm_area_struct *tail_vma = NULL;
+ unsigned long addr;
insertion_point = (prev ? &prev->vm_next : &mm->mmap);
do {
@@ -1656,6 +1685,11 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
} while (vma && vma->vm_start < end);
*insertion_point = vma;
tail_vma->vm_next = NULL;
+ if (mm->unmap_area == arch_unmap_area)
+ addr = prev ? prev->vm_end : mm->mmap_base;
+ else
+ addr = vma ? vma->vm_start : mm->mmap_base;
+ mm->unmap_area(mm, addr);
mm->mmap_cache = NULL; /* Kill the cache. */
}
diff --git a/mm/mremap.c b/mm/mremap.c
index 0dd7ace94e51..ec7238a78f36 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -224,6 +224,12 @@ static unsigned long move_vma(struct vm_area_struct *vma,
split = 1;
}
+ /*
+ * if we failed to move page tables we still do total_vm increment
+ * since do_munmap() will decrement it by old_len == new_len
+ */
+ mm->total_vm += new_len >> PAGE_SHIFT;
+
if (do_munmap(mm, old_addr, old_len) < 0) {
/* OOM: unable to split vma, just get accounts right */
vm_unacct_memory(excess >> PAGE_SHIFT);
@@ -237,7 +243,6 @@ static unsigned long move_vma(struct vm_area_struct *vma,
vma->vm_next->vm_flags |= VM_ACCOUNT;
}
- mm->total_vm += new_len >> PAGE_SHIFT;
__vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
mm->locked_vm += new_len >> PAGE_SHIFT;
diff --git a/mm/msync.c b/mm/msync.c
index 090f426bca7d..d0f5a1bce7cb 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -34,6 +34,8 @@ static void sync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (!pte_present(*pte))
continue;
+ if (!pte_maybe_dirty(*pte))
+ continue;
pfn = pte_pfn(*pte);
if (!pfn_valid(pfn))
continue;
diff --git a/mm/nommu.c b/mm/nommu.c
index b293ec1cc4e6..ce74452c02d9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -150,7 +150,8 @@ void vfree(void *addr)
kfree(addr);
}
-void *__vmalloc(unsigned long size, int gfp_mask, pgprot_t prot)
+void *__vmalloc(unsigned long size, unsigned int __nocast gfp_mask,
+ pgprot_t prot)
{
/*
* kmalloc doesn't like __GFP_HIGHMEM for some reason
@@ -1066,7 +1067,7 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
return -ENOMEM;
}
-void arch_unmap_area(struct vm_area_struct *area)
+void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
{
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4bbb1cb10495..59666d905f19 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -258,6 +258,10 @@ void out_of_memory(unsigned int __nocast gfp_mask)
struct mm_struct *mm = NULL;
task_t * p;
+ printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
+ /* print memory stats */
+ show_mem();
+
read_lock(&tasklist_lock);
retry:
p = select_bad_process();
@@ -268,12 +272,9 @@ retry:
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p) {
read_unlock(&tasklist_lock);
- show_free_areas();
panic("Out of memory and no killable processes...\n");
}
- printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
- show_free_areas();
mm = oom_kill_process(p);
if (!mm)
goto retry;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b1061b1962f8..206920796f5f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -105,11 +105,13 @@ static void bad_page(const char *function, struct page *page)
printk(KERN_EMERG "Backtrace:\n");
dump_stack();
printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
- page->flags &= ~(1 << PG_private |
+ page->flags &= ~(1 << PG_lru |
+ 1 << PG_private |
1 << PG_locked |
- 1 << PG_lru |
1 << PG_active |
1 << PG_dirty |
+ 1 << PG_reclaim |
+ 1 << PG_slab |
1 << PG_swapcache |
1 << PG_writeback);
set_page_count(page, 0);
@@ -440,14 +442,17 @@ void set_page_refs(struct page *page, int order)
*/
static void prep_new_page(struct page *page, int order)
{
- if (page->mapping || page_mapcount(page) ||
- (page->flags & (
+ if ( page_mapcount(page) ||
+ page->mapping != NULL ||
+ page_count(page) != 0 ||
+ (page->flags & (
+ 1 << PG_lru |
1 << PG_private |
1 << PG_locked |
- 1 << PG_lru |
1 << PG_active |
1 << PG_dirty |
1 << PG_reclaim |
+ 1 << PG_slab |
1 << PG_swapcache |
1 << PG_writeback )))
bad_page(__FUNCTION__, page);
@@ -511,6 +516,36 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
return allocated;
}
+#ifdef CONFIG_NUMA
+/* Called from the slab reaper to drain remote pagesets */
+void drain_remote_pages(void)
+{
+ struct zone *zone;
+ int i;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ for_each_zone(zone) {
+ struct per_cpu_pageset *pset;
+
+ /* Do not drain local pagesets */
+ if (zone->zone_pgdat->node_id == numa_node_id())
+ continue;
+
+ pset = zone->pageset[smp_processor_id()];
+ for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
+ struct per_cpu_pages *pcp;
+
+ pcp = &pset->pcp[i];
+ if (pcp->count)
+ pcp->count -= free_pages_bulk(zone, pcp->count,
+ &pcp->list, 0);
+ }
+ }
+ local_irq_restore(flags);
+}
+#endif
+
#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
static void __drain_pages(unsigned int cpu)
{
@@ -520,7 +555,7 @@ static void __drain_pages(unsigned int cpu)
for_each_zone(zone) {
struct per_cpu_pageset *pset;
- pset = &zone->pageset[cpu];
+ pset = zone_pcp(zone, cpu);
for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
struct per_cpu_pages *pcp;
@@ -583,12 +618,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
local_irq_save(flags);
cpu = smp_processor_id();
- p = &z->pageset[cpu];
+ p = zone_pcp(z,cpu);
if (pg == orig) {
- z->pageset[cpu].numa_hit++;
+ p->numa_hit++;
} else {
p->numa_miss++;
- zonelist->zones[0]->pageset[cpu].numa_foreign++;
+ zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
}
if (pg == NODE_DATA(numa_node_id()))
p->local_node++;
@@ -615,12 +650,12 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
if (PageAnon(page))
page->mapping = NULL;
free_pages_check(__FUNCTION__, page);
- pcp = &zone->pageset[get_cpu()].pcp[cold];
+ pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
local_irq_save(flags);
- if (pcp->count >= pcp->high)
- pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
list_add(&page->lru, &pcp->list);
pcp->count++;
+ if (pcp->count >= pcp->high)
+ pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
local_irq_restore(flags);
put_cpu();
}
@@ -659,7 +694,7 @@ buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
if (order == 0) {
struct per_cpu_pages *pcp;
- pcp = &zone->pageset[get_cpu()].pcp[cold];
+ pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
local_irq_save(flags);
if (pcp->count <= pcp->low)
pcp->count += rmqueue_bulk(zone, 0,
@@ -724,6 +759,16 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
return 1;
}
+static inline int
+should_reclaim_zone(struct zone *z, unsigned int gfp_mask)
+{
+ if (!z->reclaim_pages)
+ return 0;
+ if (gfp_mask & __GFP_NORECLAIM)
+ return 0;
+ return 1;
+}
+
/*
* This is the 'heart' of the zoned buddy allocator.
*/
@@ -760,17 +805,32 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
classzone_idx = zone_idx(zones[0]);
- restart:
+restart:
/* Go through the zonelist once, looking for a zone with enough free */
for (i = 0; (z = zones[i]) != NULL; i++) {
-
- if (!zone_watermark_ok(z, order, z->pages_low,
- classzone_idx, 0, 0))
- continue;
+ int do_reclaim = should_reclaim_zone(z, gfp_mask);
if (!cpuset_zone_allowed(z))
continue;
+ /*
+ * If the zone is to attempt early page reclaim then this loop
+ * will try to reclaim pages and check the watermark a second
+ * time before giving up and falling back to the next zone.
+ */
+zone_reclaim_retry:
+ if (!zone_watermark_ok(z, order, z->pages_low,
+ classzone_idx, 0, 0)) {
+ if (!do_reclaim)
+ continue;
+ else {
+ zone_reclaim(z, gfp_mask, order);
+ /* Only try reclaim once */
+ do_reclaim = 0;
+ goto zone_reclaim_retry;
+ }
+ }
+
page = buffered_rmqueue(z, order, gfp_mask);
if (page)
goto got_pg;
@@ -829,7 +889,7 @@ rebalance:
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
- did_some_progress = try_to_free_pages(zones, gfp_mask, order);
+ did_some_progress = try_to_free_pages(zones, gfp_mask);
p->reclaim_state = NULL;
p->flags &= ~PF_MEMALLOC;
@@ -905,6 +965,7 @@ nopage:
" order:%d, mode:0x%x\n",
p->comm, order, gfp_mask);
dump_stack();
+ show_mem();
}
return NULL;
got_pg:
@@ -1114,7 +1175,7 @@ void get_full_page_state(struct page_state *ret)
__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long));
}
-unsigned long __read_page_state(unsigned offset)
+unsigned long __read_page_state(unsigned long offset)
{
unsigned long ret = 0;
int cpu;
@@ -1128,7 +1189,7 @@ unsigned long __read_page_state(unsigned offset)
return ret;
}
-void __mod_page_state(unsigned offset, unsigned long delta)
+void __mod_page_state(unsigned long offset, unsigned long delta)
{
unsigned long flags;
void* ptr;
@@ -1237,22 +1298,23 @@ void show_free_areas(void)
if (!cpu_possible(cpu))
continue;
- pageset = zone->pageset + cpu;
+ pageset = zone_pcp(zone, cpu);
for (temperature = 0; temperature < 2; temperature++)
- printk("cpu %d %s: low %d, high %d, batch %d\n",
+ printk("cpu %d %s: low %d, high %d, batch %d used:%d\n",
cpu,
temperature ? "cold" : "hot",
pageset->pcp[temperature].low,
pageset->pcp[temperature].high,
- pageset->pcp[temperature].batch);
+ pageset->pcp[temperature].batch,
+ pageset->pcp[temperature].count);
}
}
get_page_state(&ps);
get_zone_counts(&active, &inactive, &free);
- printk("\nFree pages: %11ukB (%ukB HighMem)\n",
+ printk("Free pages: %11ukB (%ukB HighMem)\n",
K(nr_free_pages()),
K(nr_free_highpages()));
@@ -1620,6 +1682,155 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
memmap_init_zone((size), (nid), (zone), (start_pfn))
#endif
+static int __devinit zone_batchsize(struct zone *zone)
+{
+ int batch;
+
+ /*
+ * The per-cpu-pages pools are set to around 1000th of the
+ * size of the zone. But no more than 1/4 of a meg - there's
+ * no point in going beyond the size of L2 cache.
+ *
+ * OK, so we don't know how big the cache is. So guess.
+ */
+ batch = zone->present_pages / 1024;
+ if (batch * PAGE_SIZE > 256 * 1024)
+ batch = (256 * 1024) / PAGE_SIZE;
+ batch /= 4; /* We effectively *= 4 below */
+ if (batch < 1)
+ batch = 1;
+
+ /*
+ * Clamp the batch to a 2^n - 1 value. Having a power
+ * of 2 value was found to be more likely to have
+ * suboptimal cache aliasing properties in some cases.
+ *
+ * For example if 2 tasks are alternately allocating
+ * batches of pages, one task can end up with a lot
+ * of pages of one half of the possible page colors
+ * and the other with pages of the other colors.
+ */
+ batch = (1 << fls(batch + batch/2)) - 1;
+ return batch;
+}
+
+inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+{
+ struct per_cpu_pages *pcp;
+
+ pcp = &p->pcp[0]; /* hot */
+ pcp->count = 0;
+ pcp->low = 2 * batch;
+ pcp->high = 6 * batch;
+ pcp->batch = max(1UL, 1 * batch);
+ INIT_LIST_HEAD(&pcp->list);
+
+ pcp = &p->pcp[1]; /* cold*/
+ pcp->count = 0;
+ pcp->low = 0;
+ pcp->high = 2 * batch;
+ pcp->batch = max(1UL, 1 * batch);
+ INIT_LIST_HEAD(&pcp->list);
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Boot pageset table. One per cpu which is going to be used for all
+ * zones and all nodes. The parameters will be set in such a way
+ * that an item put on a list will immediately be handed over to
+ * the buddy list. This is safe since pageset manipulation is done
+ * with interrupts disabled.
+ *
+ * Some NUMA counter updates may also be caught by the boot pagesets.
+ * These will be discarded when bootup is complete.
+ */
+static struct per_cpu_pageset
+ boot_pageset[NR_CPUS] __initdata;
+
+/*
+ * Dynamically allocate memory for the
+ * per cpu pageset array in struct zone.
+ */
+static int __devinit process_zones(int cpu)
+{
+ struct zone *zone, *dzone;
+
+ for_each_zone(zone) {
+
+ zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!zone->pageset[cpu])
+ goto bad;
+
+ setup_pageset(zone->pageset[cpu], zone_batchsize(zone));
+ }
+
+ return 0;
+bad:
+ for_each_zone(dzone) {
+ if (dzone == zone)
+ break;
+ kfree(dzone->pageset[cpu]);
+ dzone->pageset[cpu] = NULL;
+ }
+ return -ENOMEM;
+}
+
+static inline void free_zone_pagesets(int cpu)
+{
+#ifdef CONFIG_NUMA
+ struct zone *zone;
+
+ for_each_zone(zone) {
+ struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
+
+ zone_pcp(zone, cpu) = NULL;
+ kfree(pset);
+ }
+#endif
+}
+
+static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ int cpu = (long)hcpu;
+ int ret = NOTIFY_OK;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ if (process_zones(cpu))
+ ret = NOTIFY_BAD;
+ break;
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DEAD:
+ free_zone_pagesets(cpu);
+ break;
+#endif
+ default:
+ break;
+ }
+ return ret;
+}
+
+static struct notifier_block pageset_notifier =
+ { &pageset_cpuup_callback, NULL, 0 };
+
+void __init setup_per_cpu_pageset()
+{
+ int err;
+
+ /* Initialize per_cpu_pageset for cpu 0.
+ * A cpuup callback will do this for every cpu
+ * as it comes online
+ */
+ err = process_zones(smp_processor_id());
+ BUG_ON(err);
+ register_cpu_notifier(&pageset_notifier);
+}
+
+#endif
+
/*
* Set up the zone data structures:
* - mark all pages reserved
@@ -1662,48 +1873,16 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
- /*
- * The per-cpu-pages pools are set to around 1000th of the
- * size of the zone. But no more than 1/4 of a meg - there's
- * no point in going beyond the size of L2 cache.
- *
- * OK, so we don't know how big the cache is. So guess.
- */
- batch = zone->present_pages / 1024;
- if (batch * PAGE_SIZE > 256 * 1024)
- batch = (256 * 1024) / PAGE_SIZE;
- batch /= 4; /* We effectively *= 4 below */
- if (batch < 1)
- batch = 1;
-
- /*
- * Clamp the batch to a 2^n - 1 value. Having a power
- * of 2 value was found to be more likely to have
- * suboptimal cache aliasing properties in some cases.
- *
- * For example if 2 tasks are alternately allocating
- * batches of pages, one task can end up with a lot
- * of pages of one half of the possible page colors
- * and the other with pages of the other colors.
- */
- batch = (1 << fls(batch + batch/2)) - 1;
+ batch = zone_batchsize(zone);
for (cpu = 0; cpu < NR_CPUS; cpu++) {
- struct per_cpu_pages *pcp;
-
- pcp = &zone->pageset[cpu].pcp[0]; /* hot */
- pcp->count = 0;
- pcp->low = 2 * batch;
- pcp->high = 6 * batch;
- pcp->batch = 1 * batch;
- INIT_LIST_HEAD(&pcp->list);
-
- pcp = &zone->pageset[cpu].pcp[1]; /* cold */
- pcp->count = 0;
- pcp->low = 0;
- pcp->high = 2 * batch;
- pcp->batch = 1 * batch;
- INIT_LIST_HEAD(&pcp->list);
+#ifdef CONFIG_NUMA
+ /* Early boot. Slab allocator not functional yet */
+ zone->pageset[cpu] = &boot_pageset[cpu];
+ setup_pageset(&boot_pageset[cpu],0);
+#else
+ setup_pageset(zone_pcp(zone,cpu), batch);
+#endif
}
printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
zone_names[j], realsize, batch);
@@ -1713,6 +1892,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
zone->nr_scan_inactive = 0;
zone->nr_active = 0;
zone->nr_inactive = 0;
+ atomic_set(&zone->reclaim_in_progress, -1);
if (!size)
continue;
@@ -1853,6 +2033,115 @@ struct seq_operations fragmentation_op = {
.show = frag_show,
};
+/*
+ * Output information about zones in @pgdat.
+ */
+static int zoneinfo_show(struct seq_file *m, void *arg)
+{
+ pg_data_t *pgdat = arg;
+ struct zone *zone;
+ struct zone *node_zones = pgdat->node_zones;
+ unsigned long flags;
+
+ for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
+ int i;
+
+ if (!zone->present_pages)
+ continue;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
+ seq_printf(m,
+ "\n pages free %lu"
+ "\n min %lu"
+ "\n low %lu"
+ "\n high %lu"
+ "\n active %lu"
+ "\n inactive %lu"
+ "\n scanned %lu (a: %lu i: %lu)"
+ "\n spanned %lu"
+ "\n present %lu",
+ zone->free_pages,
+ zone->pages_min,
+ zone->pages_low,
+ zone->pages_high,
+ zone->nr_active,
+ zone->nr_inactive,
+ zone->pages_scanned,
+ zone->nr_scan_active, zone->nr_scan_inactive,
+ zone->spanned_pages,
+ zone->present_pages);
+ seq_printf(m,
+ "\n protection: (%lu",
+ zone->lowmem_reserve[0]);
+ for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
+ seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
+ seq_printf(m,
+ ")"
+ "\n pagesets");
+ for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) {
+ struct per_cpu_pageset *pageset;
+ int j;
+
+ pageset = zone_pcp(zone, i);
+ for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
+ if (pageset->pcp[j].count)
+ break;
+ }
+ if (j == ARRAY_SIZE(pageset->pcp))
+ continue;
+ for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
+ seq_printf(m,
+ "\n cpu: %i pcp: %i"
+ "\n count: %i"
+ "\n low: %i"
+ "\n high: %i"
+ "\n batch: %i",
+ i, j,
+ pageset->pcp[j].count,
+ pageset->pcp[j].low,
+ pageset->pcp[j].high,
+ pageset->pcp[j].batch);
+ }
+#ifdef CONFIG_NUMA
+ seq_printf(m,
+ "\n numa_hit: %lu"
+ "\n numa_miss: %lu"
+ "\n numa_foreign: %lu"
+ "\n interleave_hit: %lu"
+ "\n local_node: %lu"
+ "\n other_node: %lu",
+ pageset->numa_hit,
+ pageset->numa_miss,
+ pageset->numa_foreign,
+ pageset->interleave_hit,
+ pageset->local_node,
+ pageset->other_node);
+#endif
+ }
+ seq_printf(m,
+ "\n all_unreclaimable: %u"
+ "\n prev_priority: %i"
+ "\n temp_priority: %i"
+ "\n start_pfn: %lu",
+ zone->all_unreclaimable,
+ zone->prev_priority,
+ zone->temp_priority,
+ zone->zone_start_pfn);
+ spin_unlock_irqrestore(&zone->lock, flags);
+ seq_putc(m, '\n');
+ }
+ return 0;
+}
+
+struct seq_operations zoneinfo_op = {
+ .start = frag_start, /* iterate over all zones. The same as in
+ * fragmentation. */
+ .next = frag_next,
+ .stop = frag_stop,
+ .show = zoneinfo_show,
+};
+
static char *vmstat_text[] = {
"nr_dirty",
"nr_writeback",
@@ -2058,10 +2347,10 @@ static void setup_per_zone_pages_min(void)
min_pages = 128;
zone->pages_min = min_pages;
} else {
- /* if it's a lowmem zone, reserve a number of pages
+ /* if it's a lowmem zone, reserve a number of pages
* proportionate to the zone's size.
*/
- zone->pages_min = (pages_min * zone->present_pages) /
+ zone->pages_min = (pages_min * zone->present_pages) /
lowmem_pages;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 378de234c12b..89770bd25f31 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -539,27 +539,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
goto out_unmap;
}
- /*
- * Don't pull an anonymous page out from under get_user_pages.
- * GUP carefully breaks COW and raises page count (while holding
- * page_table_lock, as we have here) to make sure that the page
- * cannot be freed. If we unmap that page here, a user write
- * access to the virtual address will bring back the page, but
- * its raised count will (ironically) be taken to mean it's not
- * an exclusive swap page, do_wp_page will replace it by a copy
- * page, and the user never get to see the data GUP was holding
- * the original page for.
- *
- * This test is also useful for when swapoff (unuse_process) has
- * to drop page lock: its reference to the page stops existing
- * ptes from being unmapped, so swapoff can make progress.
- */
- if (PageSwapCache(page) &&
- page_count(page) != page_mapcount(page) + 2) {
- ret = SWAP_FAIL;
- goto out_unmap;
- }
-
/* Nuke the page table entry. */
flush_cache_page(vma, address, page_to_pfn(page));
pteval = ptep_clear_flush(vma, address, pte);
@@ -586,7 +565,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
dec_mm_counter(mm, anon_rss);
}
- inc_mm_counter(mm, rss);
+ dec_mm_counter(mm, rss);
page_remove_rmap(page);
page_cache_release(page);
@@ -626,7 +605,7 @@ static void try_to_unmap_cluster(unsigned long cursor,
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
- pte_t *pte;
+ pte_t *pte, *original_pte;
pte_t pteval;
struct page *page;
unsigned long address;
@@ -658,7 +637,7 @@ static void try_to_unmap_cluster(unsigned long cursor,
if (!pmd_present(*pmd))
goto out_unlock;
- for (pte = pte_offset_map(pmd, address);
+ for (original_pte = pte = pte_offset_map(pmd, address);
address < end; pte++, address += PAGE_SIZE) {
if (!pte_present(*pte))
@@ -694,7 +673,7 @@ static void try_to_unmap_cluster(unsigned long cursor,
(*mapcount)--;
}
- pte_unmap(pte);
+ pte_unmap(original_pte);
out_unlock:
spin_unlock(&mm->page_table_lock);
}
diff --git a/mm/shmem.c b/mm/shmem.c
index 61574b81d979..e64fa726a790 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -6,8 +6,8 @@
* 2000-2001 Christoph Rohland
* 2000-2001 SAP AG
* 2002 Red Hat Inc.
- * Copyright (C) 2002-2004 Hugh Dickins.
- * Copyright (C) 2002-2004 VERITAS Software Corporation.
+ * Copyright (C) 2002-2005 Hugh Dickins.
+ * Copyright (C) 2002-2005 VERITAS Software Corporation.
* Copyright (C) 2004 Andi Kleen, SuSE Labs
*
* Extended attribute support for tmpfs:
@@ -194,7 +194,7 @@ static DEFINE_SPINLOCK(shmem_swaplist_lock);
static void shmem_free_blocks(struct inode *inode, long pages)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- if (sbinfo) {
+ if (sbinfo->max_blocks) {
spin_lock(&sbinfo->stat_lock);
sbinfo->free_blocks += pages;
inode->i_blocks -= pages*BLOCKS_PER_PAGE;
@@ -357,7 +357,7 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
* page (and perhaps indirect index pages) yet to allocate:
* a waste to allocate index if we cannot allocate data.
*/
- if (sbinfo) {
+ if (sbinfo->max_blocks) {
spin_lock(&sbinfo->stat_lock);
if (sbinfo->free_blocks <= 1) {
spin_unlock(&sbinfo->stat_lock);
@@ -677,8 +677,8 @@ static void shmem_delete_inode(struct inode *inode)
spin_unlock(&shmem_swaplist_lock);
}
}
- if (sbinfo) {
- BUG_ON(inode->i_blocks);
+ BUG_ON(inode->i_blocks);
+ if (sbinfo->max_inodes) {
spin_lock(&sbinfo->stat_lock);
sbinfo->free_inodes++;
spin_unlock(&sbinfo->stat_lock);
@@ -1080,7 +1080,7 @@ repeat:
} else {
shmem_swp_unmap(entry);
sbinfo = SHMEM_SB(inode->i_sb);
- if (sbinfo) {
+ if (sbinfo->max_blocks) {
spin_lock(&sbinfo->stat_lock);
if (sbinfo->free_blocks == 0 ||
shmem_acct_block(info->flags)) {
@@ -1269,7 +1269,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
struct shmem_inode_info *info;
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
- if (sbinfo) {
+ if (sbinfo->max_inodes) {
spin_lock(&sbinfo->stat_lock);
if (!sbinfo->free_inodes) {
spin_unlock(&sbinfo->stat_lock);
@@ -1319,7 +1319,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
mpol_shared_policy_init(&info->policy);
break;
}
- } else if (sbinfo) {
+ } else if (sbinfo->max_inodes) {
spin_lock(&sbinfo->stat_lock);
sbinfo->free_inodes++;
spin_unlock(&sbinfo->stat_lock);
@@ -1328,31 +1328,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
}
#ifdef CONFIG_TMPFS
-
-static int shmem_set_size(struct shmem_sb_info *sbinfo,
- unsigned long max_blocks, unsigned long max_inodes)
-{
- int error;
- unsigned long blocks, inodes;
-
- spin_lock(&sbinfo->stat_lock);
- blocks = sbinfo->max_blocks - sbinfo->free_blocks;
- inodes = sbinfo->max_inodes - sbinfo->free_inodes;
- error = -EINVAL;
- if (max_blocks < blocks)
- goto out;
- if (max_inodes < inodes)
- goto out;
- error = 0;
- sbinfo->max_blocks = max_blocks;
- sbinfo->free_blocks = max_blocks - blocks;
- sbinfo->max_inodes = max_inodes;
- sbinfo->free_inodes = max_inodes - inodes;
-out:
- spin_unlock(&sbinfo->stat_lock);
- return error;
-}
-
static struct inode_operations shmem_symlink_inode_operations;
static struct inode_operations shmem_symlink_inline_operations;
@@ -1607,15 +1582,17 @@ static int shmem_statfs(struct super_block *sb, struct kstatfs *buf)
buf->f_type = TMPFS_MAGIC;
buf->f_bsize = PAGE_CACHE_SIZE;
buf->f_namelen = NAME_MAX;
- if (sbinfo) {
- spin_lock(&sbinfo->stat_lock);
+ spin_lock(&sbinfo->stat_lock);
+ if (sbinfo->max_blocks) {
buf->f_blocks = sbinfo->max_blocks;
buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
+ }
+ if (sbinfo->max_inodes) {
buf->f_files = sbinfo->max_inodes;
buf->f_ffree = sbinfo->free_inodes;
- spin_unlock(&sbinfo->stat_lock);
}
/* else leave those fields 0 like simple_statfs */
+ spin_unlock(&sbinfo->stat_lock);
return 0;
}
@@ -1672,7 +1649,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
* but each new link needs a new dentry, pinning lowmem, and
* tmpfs dentries cannot be pruned until they are unlinked.
*/
- if (sbinfo) {
+ if (sbinfo->max_inodes) {
spin_lock(&sbinfo->stat_lock);
if (!sbinfo->free_inodes) {
spin_unlock(&sbinfo->stat_lock);
@@ -1697,7 +1674,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) {
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- if (sbinfo) {
+ if (sbinfo->max_inodes) {
spin_lock(&sbinfo->stat_lock);
sbinfo->free_inodes++;
spin_unlock(&sbinfo->stat_lock);
@@ -1921,22 +1898,42 @@ bad_val:
static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
- unsigned long max_blocks = 0;
- unsigned long max_inodes = 0;
+ unsigned long max_blocks = sbinfo->max_blocks;
+ unsigned long max_inodes = sbinfo->max_inodes;
+ unsigned long blocks;
+ unsigned long inodes;
+ int error = -EINVAL;
+
+ if (shmem_parse_options(data, NULL, NULL, NULL,
+ &max_blocks, &max_inodes))
+ return error;
- if (sbinfo) {
- max_blocks = sbinfo->max_blocks;
- max_inodes = sbinfo->max_inodes;
- }
- if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks, &max_inodes))
- return -EINVAL;
- /* Keep it simple: disallow limited <-> unlimited remount */
- if ((max_blocks || max_inodes) == !sbinfo)
- return -EINVAL;
- /* But allow the pointless unlimited -> unlimited remount */
- if (!sbinfo)
- return 0;
- return shmem_set_size(sbinfo, max_blocks, max_inodes);
+ spin_lock(&sbinfo->stat_lock);
+ blocks = sbinfo->max_blocks - sbinfo->free_blocks;
+ inodes = sbinfo->max_inodes - sbinfo->free_inodes;
+ if (max_blocks < blocks)
+ goto out;
+ if (max_inodes < inodes)
+ goto out;
+ /*
+ * Those tests also disallow limited->unlimited while any are in
+ * use, so i_blocks will always be zero when max_blocks is zero;
+ * but we must separately disallow unlimited->limited, because
+ * in that case we have no record of how much is already in use.
+ */
+ if (max_blocks && !sbinfo->max_blocks)
+ goto out;
+ if (max_inodes && !sbinfo->max_inodes)
+ goto out;
+
+ error = 0;
+ sbinfo->max_blocks = max_blocks;
+ sbinfo->free_blocks = max_blocks - blocks;
+ sbinfo->max_inodes = max_inodes;
+ sbinfo->free_inodes = max_inodes - inodes;
+out:
+ spin_unlock(&sbinfo->stat_lock);
+ return error;
}
#endif
@@ -1961,11 +1958,11 @@ static int shmem_fill_super(struct super_block *sb,
uid_t uid = current->fsuid;
gid_t gid = current->fsgid;
int err = -ENOMEM;
-
-#ifdef CONFIG_TMPFS
+ struct shmem_sb_info *sbinfo;
unsigned long blocks = 0;
unsigned long inodes = 0;
+#ifdef CONFIG_TMPFS
/*
* Per default we only allow half of the physical ram per
* tmpfs instance, limiting inodes to one per page of lowmem;
@@ -1976,34 +1973,34 @@ static int shmem_fill_super(struct super_block *sb,
inodes = totalram_pages - totalhigh_pages;
if (inodes > blocks)
inodes = blocks;
-
- if (shmem_parse_options(data, &mode,
- &uid, &gid, &blocks, &inodes))
+ if (shmem_parse_options(data, &mode, &uid, &gid,
+ &blocks, &inodes))
return -EINVAL;
}
-
- if (blocks || inodes) {
- struct shmem_sb_info *sbinfo;
- sbinfo = kmalloc(sizeof(struct shmem_sb_info), GFP_KERNEL);
- if (!sbinfo)
- return -ENOMEM;
- sb->s_fs_info = sbinfo;
- spin_lock_init(&sbinfo->stat_lock);
- sbinfo->max_blocks = blocks;
- sbinfo->free_blocks = blocks;
- sbinfo->max_inodes = inodes;
- sbinfo->free_inodes = inodes;
- }
- sb->s_xattr = shmem_xattr_handlers;
#else
sb->s_flags |= MS_NOUSER;
#endif
+ /* Round up to L1_CACHE_BYTES to resist false sharing */
+ sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info),
+ L1_CACHE_BYTES), GFP_KERNEL);
+ if (!sbinfo)
+ return -ENOMEM;
+
+ spin_lock_init(&sbinfo->stat_lock);
+ sbinfo->max_blocks = blocks;
+ sbinfo->free_blocks = blocks;
+ sbinfo->max_inodes = inodes;
+ sbinfo->free_inodes = inodes;
+
+ sb->s_fs_info = sbinfo;
sb->s_maxbytes = SHMEM_MAX_BYTES;
sb->s_blocksize = PAGE_CACHE_SIZE;
sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
sb->s_magic = TMPFS_MAGIC;
sb->s_op = &shmem_ops;
+ sb->s_xattr = shmem_xattr_handlers;
+
inode = shmem_get_inode(sb, S_IFDIR | mode, 0);
if (!inode)
goto failed;
diff --git a/mm/slab.c b/mm/slab.c
index 840742641152..93cbbbb39f42 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2620,6 +2620,12 @@ unsigned int kmem_cache_size(kmem_cache_t *cachep)
}
EXPORT_SYMBOL(kmem_cache_size);
+const char *kmem_cache_name(kmem_cache_t *cachep)
+{
+ return cachep->name;
+}
+EXPORT_SYMBOL_GPL(kmem_cache_name);
+
struct ccupdate_struct {
kmem_cache_t *cachep;
struct array_cache *new[NR_CPUS];
@@ -2845,6 +2851,7 @@ next:
}
check_irq_on();
up(&cache_chain_sem);
+ drain_remote_pages();
/* Setup the next iteration */
schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id());
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index a60e0075d55b..60cd24a55204 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -79,7 +79,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
WARN_ON(page_count(page) <= 1);
bdi = bdev->bd_inode->i_mapping->backing_dev_info;
- bdi->unplug_io_fn(bdi, page);
+ blk_run_backing_dev(bdi, page);
}
up_read(&swap_unplug_sem);
}
@@ -276,61 +276,37 @@ void swap_free(swp_entry_t entry)
}
/*
- * Check if we're the only user of a swap page,
- * when the page is locked.
+ * How many references to page are currently swapped out?
*/
-static int exclusive_swap_page(struct page *page)
+static inline int page_swapcount(struct page *page)
{
- int retval = 0;
- struct swap_info_struct * p;
+ int count = 0;
+ struct swap_info_struct *p;
swp_entry_t entry;
entry.val = page->private;
p = swap_info_get(entry);
if (p) {
- /* Is the only swap cache user the cache itself? */
- if (p->swap_map[swp_offset(entry)] == 1) {
- /* Recheck the page count with the swapcache lock held.. */
- write_lock_irq(&swapper_space.tree_lock);
- if (page_count(page) == 2)
- retval = 1;
- write_unlock_irq(&swapper_space.tree_lock);
- }
+ /* Subtract the 1 for the swap cache itself */
+ count = p->swap_map[swp_offset(entry)] - 1;
swap_info_put(p);
}
- return retval;
+ return count;
}
/*
* We can use this swap cache entry directly
* if there are no other references to it.
- *
- * Here "exclusive_swap_page()" does the real
- * work, but we opportunistically check whether
- * we need to get all the locks first..
*/
int can_share_swap_page(struct page *page)
{
- int retval = 0;
+ int count;
- if (!PageLocked(page))
- BUG();
- switch (page_count(page)) {
- case 3:
- if (!PagePrivate(page))
- break;
- /* Fallthrough */
- case 2:
- if (!PageSwapCache(page))
- break;
- retval = exclusive_swap_page(page);
- break;
- case 1:
- if (PageReserved(page))
- break;
- retval = 1;
- }
- return retval;
+ BUG_ON(!PageLocked(page));
+ count = page_mapcount(page);
+ if (count <= 1 && PageSwapCache(page))
+ count += page_swapcount(page);
+ return count == 1;
}
/*
@@ -529,9 +505,10 @@ static int unuse_mm(struct mm_struct *mm,
if (!down_read_trylock(&mm->mmap_sem)) {
/*
- * Our reference to the page stops try_to_unmap_one from
- * unmapping its ptes, so swapoff can make progress.
+ * Activate page so shrink_cache is unlikely to unmap its
+ * ptes while lock is dropped, so swapoff can make progress.
*/
+ activate_page(page);
unlock_page(page);
down_read(&mm->mmap_sem);
lock_page(page);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2bd83e5c2bbf..8ff16a1eee6a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -248,31 +248,20 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END);
}
-/**
- * remove_vm_area - find and remove a contingous kernel virtual area
- *
- * @addr: base address
- *
- * Search for the kernel VM area starting at @addr, and remove it.
- * This function returns the found VM area, but using it is NOT safe
- * on SMP machines.
- */
-struct vm_struct *remove_vm_area(void *addr)
+/* Caller must hold vmlist_lock */
+struct vm_struct *__remove_vm_area(void *addr)
{
struct vm_struct **p, *tmp;
- write_lock(&vmlist_lock);
for (p = &vmlist ; (tmp = *p) != NULL ;p = &tmp->next) {
if (tmp->addr == addr)
goto found;
}
- write_unlock(&vmlist_lock);
return NULL;
found:
unmap_vm_area(tmp);
*p = tmp->next;
- write_unlock(&vmlist_lock);
/*
* Remove the guard page.
@@ -281,6 +270,24 @@ found:
return tmp;
}
+/**
+ * remove_vm_area - find and remove a contingous kernel virtual area
+ *
+ * @addr: base address
+ *
+ * Search for the kernel VM area starting at @addr, and remove it.
+ * This function returns the found VM area, but using it is NOT safe
+ * on SMP machines, except for its size or flags.
+ */
+struct vm_struct *remove_vm_area(void *addr)
+{
+ struct vm_struct *v;
+ write_lock(&vmlist_lock);
+ v = __remove_vm_area(addr);
+ write_unlock(&vmlist_lock);
+ return v;
+}
+
void __vunmap(void *addr, int deallocate_pages)
{
struct vm_struct *area;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 269eded9b459..4b8e62a19370 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -74,6 +74,9 @@ struct scan_control {
int may_writepage;
+ /* Can pages be swapped as part of reclaim? */
+ int may_swap;
+
/* This context's SWAP_CLUSTER_MAX. If freeing memory for
* suspend, we effectively ignore SWAP_CLUSTER_MAX.
* In this context, it doesn't matter that we scan the
@@ -180,17 +183,20 @@ EXPORT_SYMBOL(remove_shrinker);
* `lru_pages' represents the number of on-LRU pages in all the zones which
* are eligible for the caller's allocation attempt. It is used for balancing
* slab reclaim versus page reclaim.
+ *
+ * Returns the number of slab objects which we shrunk.
*/
static int shrink_slab(unsigned long scanned, unsigned int gfp_mask,
unsigned long lru_pages)
{
struct shrinker *shrinker;
+ int ret = 0;
if (scanned == 0)
scanned = SWAP_CLUSTER_MAX;
if (!down_read_trylock(&shrinker_rwsem))
- return 0;
+ return 1; /* Assume we'll be able to shrink next time */
list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta;
@@ -209,10 +215,14 @@ static int shrink_slab(unsigned long scanned, unsigned int gfp_mask,
while (total_scan >= SHRINK_BATCH) {
long this_scan = SHRINK_BATCH;
int shrink_ret;
+ int nr_before;
+ nr_before = (*shrinker->shrinker)(0, gfp_mask);
shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
if (shrink_ret == -1)
break;
+ if (shrink_ret < nr_before)
+ ret += nr_before - shrink_ret;
mod_page_state(slabs_scanned, this_scan);
total_scan -= this_scan;
@@ -222,7 +232,7 @@ static int shrink_slab(unsigned long scanned, unsigned int gfp_mask,
shrinker->nr += total_scan;
}
up_read(&shrinker_rwsem);
- return 0;
+ return ret;
}
/* Called without lock on whether page is mapped, so answer is unstable */
@@ -407,7 +417,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
*/
- if (PageAnon(page) && !PageSwapCache(page)) {
+ if (PageAnon(page) && !PageSwapCache(page) && sc->may_swap) {
if (!add_to_swap(page))
goto activate_locked;
}
@@ -890,7 +900,9 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
+ atomic_inc(&zone->reclaim_in_progress);
shrink_zone(zone, sc);
+ atomic_dec(&zone->reclaim_in_progress);
}
}
@@ -907,8 +919,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
* holds filesystem locks which prevent writeout this might not work, and the
* allocation attempt will fail.
*/
-int try_to_free_pages(struct zone **zones,
- unsigned int gfp_mask, unsigned int order)
+int try_to_free_pages(struct zone **zones, unsigned int gfp_mask)
{
int priority;
int ret = 0;
@@ -920,6 +931,7 @@ int try_to_free_pages(struct zone **zones,
sc.gfp_mask = gfp_mask;
sc.may_writepage = 0;
+ sc.may_swap = 1;
inc_page_state(allocstall);
@@ -1020,6 +1032,7 @@ loop_again:
total_reclaimed = 0;
sc.gfp_mask = GFP_KERNEL;
sc.may_writepage = 0;
+ sc.may_swap = 1;
sc.nr_mapped = read_page_state(nr_mapped);
inc_page_state(pageoutrun);
@@ -1079,6 +1092,7 @@ scan:
*/
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
+ int nr_slab;
if (zone->present_pages == 0)
continue;
@@ -1098,16 +1112,19 @@ scan:
sc.nr_reclaimed = 0;
sc.priority = priority;
sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
+ atomic_inc(&zone->reclaim_in_progress);
shrink_zone(zone, &sc);
+ atomic_dec(&zone->reclaim_in_progress);
reclaim_state->reclaimed_slab = 0;
- shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages);
+ nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
+ lru_pages);
sc.nr_reclaimed += reclaim_state->reclaimed_slab;
total_reclaimed += sc.nr_reclaimed;
total_scanned += sc.nr_scanned;
if (zone->all_unreclaimable)
continue;
- if (zone->pages_scanned >= (zone->nr_active +
- zone->nr_inactive) * 4)
+ if (nr_slab == 0 && zone->pages_scanned >=
+ (zone->nr_active + zone->nr_inactive) * 4)
zone->all_unreclaimable = 1;
/*
* If we've done a decent amount of scanning and
@@ -1309,3 +1326,73 @@ static int __init kswapd_init(void)
}
module_init(kswapd_init)
+
+
+/*
+ * Try to free up some pages from this zone through reclaim.
+ */
+int zone_reclaim(struct zone *zone, unsigned int gfp_mask, unsigned int order)
+{
+ struct scan_control sc;
+ int nr_pages = 1 << order;
+ int total_reclaimed = 0;
+
+ /* The reclaim may sleep, so don't do it if sleep isn't allowed */
+ if (!(gfp_mask & __GFP_WAIT))
+ return 0;
+ if (zone->all_unreclaimable)
+ return 0;
+
+ sc.gfp_mask = gfp_mask;
+ sc.may_writepage = 0;
+ sc.may_swap = 0;
+ sc.nr_mapped = read_page_state(nr_mapped);
+ sc.nr_scanned = 0;
+ sc.nr_reclaimed = 0;
+ /* scan at the highest priority */
+ sc.priority = 0;
+
+ if (nr_pages > SWAP_CLUSTER_MAX)
+ sc.swap_cluster_max = nr_pages;
+ else
+ sc.swap_cluster_max = SWAP_CLUSTER_MAX;
+
+ /* Don't reclaim the zone if there are other reclaimers active */
+ if (!atomic_inc_and_test(&zone->reclaim_in_progress))
+ goto out;
+
+ shrink_zone(zone, &sc);
+ total_reclaimed = sc.nr_reclaimed;
+
+ out:
+ atomic_dec(&zone->reclaim_in_progress);
+ return total_reclaimed;
+}
+
+asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone,
+ unsigned int state)
+{
+ struct zone *z;
+ int i;
+
+ if (node >= MAX_NUMNODES || !node_online(node))
+ return -EINVAL;
+
+ /* This will break if we ever add more zones */
+ if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM)))
+ return -EINVAL;
+
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ if (!(zone & 1<<i))
+ continue;
+
+ z = &NODE_DATA(node)->node_zones[i];
+
+ if (state)
+ z->reclaim_pages = 1;
+ else
+ z->reclaim_pages = 0;
+ }
+
+ return 0;
+}