summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-11-05 23:10:54 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2015-11-05 23:10:54 -0800
commit2e3078af2c67730c479f1d183af5b367f5d95337 (patch)
treeb7881c6c9c479aadac345df7e18e3c0e10f0811e /mm
parentea5c58e70c3a148ada0d3061a8f529589bb766ba (diff)
parentb3b0d09c7a2330759ac293f5269bd932439ea0ff (diff)
downloadlwn-2e3078af2c67730c479f1d183af5b367f5d95337.tar.gz
lwn-2e3078af2c67730c479f1d183af5b367f5d95337.zip
Merge branch 'akpm' (patches from Andrew)
Merge patch-bomb from Andrew Morton: - inotify tweaks - some ocfs2 updates (many more are awaiting review) - various misc bits - kernel/watchdog.c updates - Some of mm. I have a huge number of MM patches this time and quite a lot of it is quite difficult and much will be held over to next time. * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (162 commits) selftests: vm: add tests for lock on fault mm: mlock: add mlock flags to enable VM_LOCKONFAULT usage mm: introduce VM_LOCKONFAULT mm: mlock: add new mlock system call mm: mlock: refactor mlock, munlock, and munlockall code kasan: always taint kernel on report mm, slub, kasan: enable user tracking by default with KASAN=y kasan: use IS_ALIGNED in memory_is_poisoned_8() kasan: Fix a type conversion error lib: test_kasan: add some testcases kasan: update reference to kasan prototype repo kasan: move KASAN_SANITIZE in arch/x86/boot/Makefile kasan: various fixes in documentation kasan: update log messages kasan: accurately determine the type of the bad access kasan: update reported bug types for kernel memory accesses kasan: update reported bug types for not user nor kernel memory accesses mm/kasan: prevent deadlock in kasan reporting mm/kasan: don't use kasan shadow pointer in generic functions mm/kasan: MODULE_VADDR is not available on all archs ...
Diffstat (limited to 'mm')
-rw-r--r--mm/balloon_compaction.c10
-rw-r--r--mm/cma.c6
-rw-r--r--mm/compaction.c46
-rw-r--r--mm/debug.c1
-rw-r--r--mm/early_ioremap.c6
-rw-r--r--mm/filemap.c77
-rw-r--r--mm/frame_vector.c2
-rw-r--r--mm/gup.c10
-rw-r--r--mm/huge_memory.c2
-rw-r--r--mm/hugetlb.c139
-rw-r--r--mm/hugetlb_cgroup.c3
-rw-r--r--mm/internal.h9
-rw-r--r--mm/kasan/kasan.c38
-rw-r--r--mm/kasan/kasan.h5
-rw-r--r--mm/kasan/report.c113
-rw-r--r--mm/kmemleak.c2
-rw-r--r--mm/ksm.c49
-rw-r--r--mm/list_lru.c44
-rw-r--r--mm/maccess.c7
-rw-r--r--mm/memblock.c2
-rw-r--r--mm/memcontrol.c295
-rw-r--r--mm/memory-failure.c34
-rw-r--r--mm/memory_hotplug.c4
-rw-r--r--mm/migrate.c247
-rw-r--r--mm/mincore.c2
-rw-r--r--mm/mlock.c100
-rw-r--r--mm/mmap.c61
-rw-r--r--mm/mremap.c12
-rw-r--r--mm/msync.c2
-rw-r--r--mm/nommu.c18
-rw-r--r--mm/oom_kill.c59
-rw-r--r--mm/page_alloc.c41
-rw-r--r--mm/page_counter.c14
-rw-r--r--mm/percpu.c10
-rw-r--r--mm/readahead.c14
-rw-r--r--mm/rmap.c107
-rw-r--r--mm/shmem.c24
-rw-r--r--mm/slab.c17
-rw-r--r--mm/slab.h30
-rw-r--r--mm/slab_common.c142
-rw-r--r--mm/slub.c25
-rw-r--r--mm/util.c2
-rw-r--r--mm/vmacache.c2
-rw-r--r--mm/vmalloc.c12
-rw-r--r--mm/vmscan.c27
-rw-r--r--mm/vmstat.c22
46 files changed, 1097 insertions, 797 deletions
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index fcad8322ef36..d3116be5a00f 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -199,23 +199,17 @@ int balloon_page_migrate(struct page *newpage,
struct balloon_dev_info *balloon = balloon_page_device(page);
int rc = -EAGAIN;
- /*
- * Block others from accessing the 'newpage' when we get around to
- * establishing additional references. We should be the only one
- * holding a reference to the 'newpage' at this point.
- */
- BUG_ON(!trylock_page(newpage));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
if (WARN_ON(!__is_movable_balloon_page(page))) {
dump_page(page, "not movable balloon page");
- unlock_page(newpage);
return rc;
}
if (balloon && balloon->migratepage)
rc = balloon->migratepage(balloon, newpage, page, mode);
- unlock_page(newpage);
return rc;
}
#endif /* CONFIG_BALLOON_COMPACTION */
diff --git a/mm/cma.c b/mm/cma.c
index 4eb56badf37e..ea506eb18cd6 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -363,7 +363,9 @@ err:
*/
struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align)
{
- unsigned long mask, offset, pfn, start = 0;
+ unsigned long mask, offset;
+ unsigned long pfn = -1;
+ unsigned long start = 0;
unsigned long bitmap_maxno, bitmap_no, bitmap_count;
struct page *page = NULL;
int ret;
@@ -418,7 +420,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align)
start = bitmap_no + mask + 1;
}
- trace_cma_alloc(page ? pfn : -1UL, page, count, align);
+ trace_cma_alloc(pfn, page, count, align);
pr_debug("%s(): returned %p\n", __func__, page);
return page;
diff --git a/mm/compaction.c b/mm/compaction.c
index c5c627aae996..de3e1e71cd9f 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -35,17 +35,6 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
#endif
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
-#ifdef CONFIG_TRACEPOINTS
-static const char *const compaction_status_string[] = {
- "deferred",
- "skipped",
- "continue",
- "partial",
- "complete",
- "no_suitable_page",
- "not_suitable_zone",
-};
-#endif
#define CREATE_TRACE_POINTS
#include <trace/events/compaction.h>
@@ -1197,6 +1186,15 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}
+/*
+ * order == -1 is expected when compacting via
+ * /proc/sys/vm/compact_memory
+ */
+static inline bool is_via_compact_memory(int order)
+{
+ return order == -1;
+}
+
static int __compact_finished(struct zone *zone, struct compact_control *cc,
const int migratetype)
{
@@ -1204,7 +1202,7 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
unsigned long watermark;
if (cc->contended || fatal_signal_pending(current))
- return COMPACT_PARTIAL;
+ return COMPACT_CONTENDED;
/* Compaction run completes if the migrate and free scanner meet */
if (compact_scanners_met(cc)) {
@@ -1223,11 +1221,7 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
return COMPACT_COMPLETE;
}
- /*
- * order == -1 is expected when compacting via
- * /proc/sys/vm/compact_memory
- */
- if (cc->order == -1)
+ if (is_via_compact_memory(cc->order))
return COMPACT_CONTINUE;
/* Compaction run is not finished if the watermark is not met */
@@ -1290,11 +1284,7 @@ static unsigned long __compaction_suitable(struct zone *zone, int order,
int fragindex;
unsigned long watermark;
- /*
- * order == -1 is expected when compacting via
- * /proc/sys/vm/compact_memory
- */
- if (order == -1)
+ if (is_via_compact_memory(order))
return COMPACT_CONTINUE;
watermark = low_wmark_pages(zone);
@@ -1403,7 +1393,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
switch (isolate_migratepages(zone, cc)) {
case ISOLATE_ABORT:
- ret = COMPACT_PARTIAL;
+ ret = COMPACT_CONTENDED;
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
goto out;
@@ -1434,7 +1424,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
* and we want compact_finished() to detect it
*/
if (err == -ENOMEM && !compact_scanners_met(cc)) {
- ret = COMPACT_PARTIAL;
+ ret = COMPACT_CONTENDED;
goto out;
}
}
@@ -1487,6 +1477,9 @@ out:
trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
cc->free_pfn, end_pfn, sync, ret);
+ if (ret == COMPACT_CONTENDED)
+ ret = COMPACT_PARTIAL;
+
return ret;
}
@@ -1658,10 +1651,11 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
* this makes sure we compact the whole zone regardless of
* cached scanner positions.
*/
- if (cc->order == -1)
+ if (is_via_compact_memory(cc->order))
__reset_isolation_suitable(zone);
- if (cc->order == -1 || !compaction_deferred(zone, cc->order))
+ if (is_via_compact_memory(cc->order) ||
+ !compaction_deferred(zone, cc->order))
compact_zone(zone, cc);
if (cc->order > 0) {
diff --git a/mm/debug.c b/mm/debug.c
index 6c1b3ea61bfd..e784110fb51d 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -125,6 +125,7 @@ static const struct trace_print_flags vmaflags_names[] = {
{VM_GROWSDOWN, "growsdown" },
{VM_PFNMAP, "pfnmap" },
{VM_DENYWRITE, "denywrite" },
+ {VM_LOCKONFAULT, "lockonfault" },
{VM_LOCKED, "locked" },
{VM_IO, "io" },
{VM_SEQ_READ, "seqread" },
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
index 17ae14b5aefa..6d5717bd7197 100644
--- a/mm/early_ioremap.c
+++ b/mm/early_ioremap.c
@@ -126,7 +126,7 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
/*
* Mappings have to be page-aligned
*/
- offset = phys_addr & ~PAGE_MASK;
+ offset = offset_in_page(phys_addr);
phys_addr &= PAGE_MASK;
size = PAGE_ALIGN(last_addr + 1) - phys_addr;
@@ -189,7 +189,7 @@ void __init early_iounmap(void __iomem *addr, unsigned long size)
if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)))
return;
- offset = virt_addr & ~PAGE_MASK;
+ offset = offset_in_page(virt_addr);
nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT;
idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
@@ -234,7 +234,7 @@ void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size)
char *p;
while (size) {
- slop = src & ~PAGE_MASK;
+ slop = offset_in_page(src);
clen = size;
if (clen > MAX_MAP_CHUNK - slop)
clen = MAX_MAP_CHUNK - slop;
diff --git a/mm/filemap.c b/mm/filemap.c
index 327910c2400c..58e04e26f996 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -331,23 +331,14 @@ int filemap_flush(struct address_space *mapping)
}
EXPORT_SYMBOL(filemap_flush);
-/**
- * filemap_fdatawait_range - wait for writeback to complete
- * @mapping: address space structure to wait for
- * @start_byte: offset in bytes where the range starts
- * @end_byte: offset in bytes where the range ends (inclusive)
- *
- * Walk the list of under-writeback pages of the given address space
- * in the given range and wait for all of them.
- */
-int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
- loff_t end_byte)
+static int __filemap_fdatawait_range(struct address_space *mapping,
+ loff_t start_byte, loff_t end_byte)
{
pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
struct pagevec pvec;
int nr_pages;
- int ret2, ret = 0;
+ int ret = 0;
if (end_byte < start_byte)
goto out;
@@ -374,6 +365,29 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
cond_resched();
}
out:
+ return ret;
+}
+
+/**
+ * filemap_fdatawait_range - wait for writeback to complete
+ * @mapping: address space structure to wait for
+ * @start_byte: offset in bytes where the range starts
+ * @end_byte: offset in bytes where the range ends (inclusive)
+ *
+ * Walk the list of under-writeback pages of the given address space
+ * in the given range and wait for all of them. Check error status of
+ * the address space and return it.
+ *
+ * Since the error status of the address space is cleared by this function,
+ * callers are responsible for checking the return value and handling and/or
+ * reporting the error.
+ */
+int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
+ loff_t end_byte)
+{
+ int ret, ret2;
+
+ ret = __filemap_fdatawait_range(mapping, start_byte, end_byte);
ret2 = filemap_check_errors(mapping);
if (!ret)
ret = ret2;
@@ -383,11 +397,38 @@ out:
EXPORT_SYMBOL(filemap_fdatawait_range);
/**
+ * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
+ * @mapping: address space structure to wait for
+ *
+ * Walk the list of under-writeback pages of the given address space
+ * and wait for all of them. Unlike filemap_fdatawait(), this function
+ * does not clear error status of the address space.
+ *
+ * Use this function if callers don't handle errors themselves. Expected
+ * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
+ * fsfreeze(8)
+ */
+void filemap_fdatawait_keep_errors(struct address_space *mapping)
+{
+ loff_t i_size = i_size_read(mapping->host);
+
+ if (i_size == 0)
+ return;
+
+ __filemap_fdatawait_range(mapping, 0, i_size - 1);
+}
+
+/**
* filemap_fdatawait - wait for all under-writeback pages to complete
* @mapping: address space structure to wait for
*
* Walk the list of under-writeback pages of the given address space
- * and wait for all of them.
+ * and wait for all of them. Check error status of the address space
+ * and return it.
+ *
+ * Since the error status of the address space is cleared by this function,
+ * callers are responsible for checking the return value and handling and/or
+ * reporting the error.
*/
int filemap_fdatawait(struct address_space *mapping)
{
@@ -510,7 +551,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
__inc_zone_page_state(new, NR_SHMEM);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
mem_cgroup_end_page_stat(memcg);
- mem_cgroup_migrate(old, new, true);
+ mem_cgroup_replace_page(old, new);
radix_tree_preload_end();
if (freepage)
freepage(old);
@@ -1807,7 +1848,6 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
struct file *file,
pgoff_t offset)
{
- unsigned long ra_pages;
struct address_space *mapping = file->f_mapping;
/* If we don't want any read-ahead, don't bother */
@@ -1836,10 +1876,9 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
/*
* mmap read-around
*/
- ra_pages = max_sane_readahead(ra->ra_pages);
- ra->start = max_t(long, 0, offset - ra_pages / 2);
- ra->size = ra_pages;
- ra->async_size = ra_pages / 4;
+ ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
+ ra->size = ra->ra_pages;
+ ra->async_size = ra->ra_pages / 4;
ra_submit(ra, mapping, file);
}
diff --git a/mm/frame_vector.c b/mm/frame_vector.c
index cdabcb93c6a6..7cf2b7163222 100644
--- a/mm/frame_vector.c
+++ b/mm/frame_vector.c
@@ -7,7 +7,7 @@
#include <linux/pagemap.h>
#include <linux/sched.h>
-/*
+/**
* get_vaddr_frames() - map virtual addresses to pfns
* @start: starting user address
* @nr_frames: number of pages / pfns from start to map
diff --git a/mm/gup.c b/mm/gup.c
index a798293fc648..deafa2c91b36 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -129,7 +129,7 @@ retry:
*/
mark_page_accessed(page);
}
- if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
+ if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
/*
* The preliminary mapping check is mainly to avoid the
* pointless overhead of lock_page on the ZERO_PAGE
@@ -299,6 +299,9 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
unsigned int fault_flags = 0;
int ret;
+ /* mlock all present pages, but do not fault in new pages */
+ if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
+ return -ENOENT;
/* For mm_populate(), just skip the stack guard page. */
if ((*flags & FOLL_POPULATE) &&
(stack_guard_page_start(vma, address) ||
@@ -890,7 +893,10 @@ long populate_vma_page_range(struct vm_area_struct *vma,
VM_BUG_ON_VMA(end > vma->vm_end, vma);
VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
- gup_flags = FOLL_TOUCH | FOLL_POPULATE;
+ gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
+ if (vma->vm_flags & VM_LOCKONFAULT)
+ gup_flags &= ~FOLL_POPULATE;
+
/*
* We want to touch writable mappings with a write fault in order
* to break COW, except for shared mappings because these don't COW
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3fd0311c3ba7..f5c08b46fef8 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1307,7 +1307,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
pmd, _pmd, 1))
update_mmu_cache_pmd(vma, addr, pmd);
}
- if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
+ if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
if (page->mapping && trylock_page(page)) {
lru_add_drain();
if (page->mapping)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9cc773483624..74ef0c6a25dd 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1437,7 +1437,82 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
dissolve_free_huge_page(pfn_to_page(pfn));
}
-static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
+/*
+ * There are 3 ways this can get called:
+ * 1. With vma+addr: we use the VMA's memory policy
+ * 2. With !vma, but nid=NUMA_NO_NODE: We try to allocate a huge
+ * page from any node, and let the buddy allocator itself figure
+ * it out.
+ * 3. With !vma, but nid!=NUMA_NO_NODE. We allocate a huge page
+ * strictly from 'nid'
+ */
+static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr, int nid)
+{
+ int order = huge_page_order(h);
+ gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN;
+ unsigned int cpuset_mems_cookie;
+
+ /*
+ * We need a VMA to get a memory policy. If we do not
+ * have one, we use the 'nid' argument.
+ *
+ * The mempolicy stuff below has some non-inlined bits
+ * and calls ->vm_ops. That makes it hard to optimize at
+ * compile-time, even when NUMA is off and it does
+ * nothing. This helps the compiler optimize it out.
+ */
+ if (!IS_ENABLED(CONFIG_NUMA) || !vma) {
+ /*
+ * If a specific node is requested, make sure to
+ * get memory from there, but only when a node
+ * is explicitly specified.
+ */
+ if (nid != NUMA_NO_NODE)
+ gfp |= __GFP_THISNODE;
+ /*
+ * Make sure to call something that can handle
+ * nid=NUMA_NO_NODE
+ */
+ return alloc_pages_node(nid, gfp, order);
+ }
+
+ /*
+ * OK, so we have a VMA. Fetch the mempolicy and try to
+ * allocate a huge page with it. We will only reach this
+ * when CONFIG_NUMA=y.
+ */
+ do {
+ struct page *page;
+ struct mempolicy *mpol;
+ struct zonelist *zl;
+ nodemask_t *nodemask;
+
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask);
+ mpol_cond_put(mpol);
+ page = __alloc_pages_nodemask(gfp, order, zl, nodemask);
+ if (page)
+ return page;
+ } while (read_mems_allowed_retry(cpuset_mems_cookie));
+
+ return NULL;
+}
+
+/*
+ * There are two ways to allocate a huge page:
+ * 1. When you have a VMA and an address (like a fault)
+ * 2. When you have no VMA (like when setting /proc/.../nr_hugepages)
+ *
+ * 'vma' and 'addr' are only for (1). 'nid' is always NUMA_NO_NODE in
+ * this case which signifies that the allocation should be done with
+ * respect for the VMA's memory policy.
+ *
+ * For (2), we ignore 'vma' and 'addr' and use 'nid' exclusively. This
+ * implies that memory policies will not be taken in to account.
+ */
+static struct page *__alloc_buddy_huge_page(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr, int nid)
{
struct page *page;
unsigned int r_nid;
@@ -1446,6 +1521,15 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
return NULL;
/*
+ * Make sure that anyone specifying 'nid' is not also specifying a VMA.
+ * This makes sure the caller is picking _one_ of the modes with which
+ * we can call this function, not both.
+ */
+ if (vma || (addr != -1)) {
+ VM_WARN_ON_ONCE(addr == -1);
+ VM_WARN_ON_ONCE(nid != NUMA_NO_NODE);
+ }
+ /*
* Assume we will successfully allocate the surplus page to
* prevent racing processes from causing the surplus to exceed
* overcommit
@@ -1478,14 +1562,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
}
spin_unlock(&hugetlb_lock);
- if (nid == NUMA_NO_NODE)
- page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP|
- __GFP_REPEAT|__GFP_NOWARN,
- huge_page_order(h));
- else
- page = __alloc_pages_node(nid,
- htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
- __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
+ page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid);
spin_lock(&hugetlb_lock);
if (page) {
@@ -1510,6 +1587,29 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
}
/*
+ * Allocate a huge page from 'nid'. Note, 'nid' may be
+ * NUMA_NO_NODE, which means that it may be allocated
+ * anywhere.
+ */
+static
+struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid)
+{
+ unsigned long addr = -1;
+
+ return __alloc_buddy_huge_page(h, NULL, addr, nid);
+}
+
+/*
+ * Use the VMA's mpolicy to allocate a huge page from the buddy.
+ */
+static
+struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ return __alloc_buddy_huge_page(h, vma, addr, NUMA_NO_NODE);
+}
+
+/*
* This allocation function is useful in the context where vma is irrelevant.
* E.g. soft-offlining uses this function because it only cares physical
* address of error page.
@@ -1524,7 +1624,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
spin_unlock(&hugetlb_lock);
if (!page)
- page = alloc_buddy_huge_page(h, nid);
+ page = __alloc_buddy_huge_page_no_mpol(h, nid);
return page;
}
@@ -1554,7 +1654,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
retry:
spin_unlock(&hugetlb_lock);
for (i = 0; i < needed; i++) {
- page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+ page = __alloc_buddy_huge_page_no_mpol(h, NUMA_NO_NODE);
if (!page) {
alloc_ok = false;
break;
@@ -1787,7 +1887,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
if (!page) {
spin_unlock(&hugetlb_lock);
- page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+ page = __alloc_buddy_huge_page_with_mpol(h, vma, addr);
if (!page)
goto out_uncharge_cgroup;
@@ -2376,7 +2476,7 @@ struct node_hstate {
struct kobject *hugepages_kobj;
struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
};
-struct node_hstate node_hstates[MAX_NUMNODES];
+static struct node_hstate node_hstates[MAX_NUMNODES];
/*
* A subset of global hstate attributes for node devices
@@ -2790,6 +2890,12 @@ void hugetlb_show_meminfo(void)
1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
}
+void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
+{
+ seq_printf(m, "HugetlbPages:\t%8lu kB\n",
+ atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10));
+}
+
/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
@@ -3025,6 +3131,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
get_page(ptepage);
page_dup_rmap(ptepage);
set_huge_pte_at(dst, addr, dst_pte, entry);
+ hugetlb_count_add(pages_per_huge_page(h), dst);
}
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
@@ -3105,6 +3212,7 @@ again:
if (huge_pte_dirty(pte))
set_page_dirty(page);
+ hugetlb_count_sub(pages_per_huge_page(h), mm);
page_remove_rmap(page);
force_flush = !__tlb_remove_page(tlb, page);
if (force_flush) {
@@ -3509,6 +3617,7 @@ retry:
&& (vma->vm_flags & VM_SHARED)));
set_huge_pte_at(mm, address, ptep, new_pte);
+ hugetlb_count_add(pages_per_huge_page(h), mm);
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
@@ -4028,8 +4137,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
unsigned long s_end = sbase + PUD_SIZE;
/* Allow segments to share if only one is marked locked */
- unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
- unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
+ unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
+ unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
/*
* match the virtual addresses, permission and the alignment of the
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 6e0057439a46..33d59abe91f1 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -186,7 +186,8 @@ again:
}
rcu_read_unlock();
- ret = page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter);
+ if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter))
+ ret = -ENOMEM;
css_put(&h_cg->css);
done:
*ptr = h_cg;
diff --git a/mm/internal.h b/mm/internal.h
index bc0fa9a69e46..d4b807d6c963 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -271,20 +271,19 @@ extern unsigned int munlock_vma_page(struct page *page);
extern void clear_page_mlock(struct page *page);
/*
- * mlock_migrate_page - called only from migrate_page_copy() to
- * migrate the Mlocked page flag; update statistics.
+ * mlock_migrate_page - called only from migrate_misplaced_transhuge_page()
+ * (because that does not go through the full procedure of migration ptes):
+ * to migrate the Mlocked page flag; update statistics.
*/
static inline void mlock_migrate_page(struct page *newpage, struct page *page)
{
if (TestClearPageMlocked(page)) {
- unsigned long flags;
int nr_pages = hpage_nr_pages(page);
- local_irq_save(flags);
+ /* Holding pmd lock, no change in irq context: __mod is safe */
__mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
SetPageMlocked(newpage);
__mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
- local_irq_restore(flags);
}
}
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 8da211411b57..d41b21bce6a0 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -4,7 +4,7 @@
* Copyright (c) 2014 Samsung Electronics Co., Ltd.
* Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
*
- * Some of code borrowed from https://github.com/xairy/linux by
+ * Some code borrowed from https://github.com/xairy/kasan-prototype by
* Andrey Konovalov <adech.fo@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
@@ -86,6 +86,11 @@ static __always_inline bool memory_is_poisoned_2(unsigned long addr)
if (memory_is_poisoned_1(addr + 1))
return true;
+ /*
+ * If single shadow byte covers 2-byte access, we don't
+ * need to do anything more. Otherwise, test the first
+ * shadow byte.
+ */
if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0))
return false;
@@ -103,6 +108,11 @@ static __always_inline bool memory_is_poisoned_4(unsigned long addr)
if (memory_is_poisoned_1(addr + 3))
return true;
+ /*
+ * If single shadow byte covers 4-byte access, we don't
+ * need to do anything more. Otherwise, test the first
+ * shadow byte.
+ */
if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3))
return false;
@@ -120,7 +130,12 @@ static __always_inline bool memory_is_poisoned_8(unsigned long addr)
if (memory_is_poisoned_1(addr + 7))
return true;
- if (likely(((addr + 7) & KASAN_SHADOW_MASK) >= 7))
+ /*
+ * If single shadow byte covers 8-byte access, we don't
+ * need to do anything more. Otherwise, test the first
+ * shadow byte.
+ */
+ if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
return false;
return unlikely(*(u8 *)shadow_addr);
@@ -139,7 +154,12 @@ static __always_inline bool memory_is_poisoned_16(unsigned long addr)
if (unlikely(shadow_first_bytes))
return true;
- if (likely(IS_ALIGNED(addr, 8)))
+ /*
+ * If two shadow bytes covers 16-byte access, we don't
+ * need to do anything more. Otherwise, test the last
+ * shadow byte.
+ */
+ if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
return false;
return memory_is_poisoned_1(addr + 15);
@@ -203,7 +223,7 @@ static __always_inline bool memory_is_poisoned_n(unsigned long addr,
s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte);
if (unlikely(ret != (unsigned long)last_shadow ||
- ((last_byte & KASAN_SHADOW_MASK) >= *last_shadow)))
+ ((long)(last_byte & KASAN_SHADOW_MASK) >= *last_shadow)))
return true;
}
return false;
@@ -235,18 +255,12 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
static __always_inline void check_memory_region(unsigned long addr,
size_t size, bool write)
{
- struct kasan_access_info info;
-
if (unlikely(size == 0))
return;
if (unlikely((void *)addr <
kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
- info.access_addr = (void *)addr;
- info.access_size = size;
- info.is_write = write;
- info.ip = _RET_IP_;
- kasan_report_user_access(&info);
+ kasan_report(addr, size, write, _RET_IP_);
return;
}
@@ -524,7 +538,7 @@ static int kasan_mem_notifier(struct notifier_block *nb,
static int __init kasan_memhotplug_init(void)
{
- pr_err("WARNING: KASan doesn't support memory hot-add\n");
+ pr_err("WARNING: KASAN doesn't support memory hot-add\n");
pr_err("Memory hot-add will be disabled\n");
hotplug_memory_notifier(kasan_mem_notifier, 0);
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index c242adf6bc85..4f6c62e5c21e 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -54,16 +54,13 @@ struct kasan_global {
#endif
};
-void kasan_report_error(struct kasan_access_info *info);
-void kasan_report_user_access(struct kasan_access_info *info);
-
static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
{
return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET)
<< KASAN_SHADOW_SCALE_SHIFT);
}
-static inline bool kasan_enabled(void)
+static inline bool kasan_report_enabled(void)
{
return !current->kasan_depth;
}
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index e07c94fbd0ac..12f222d0224b 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -4,7 +4,7 @@
* Copyright (c) 2014 Samsung Electronics Co., Ltd.
* Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
*
- * Some of code borrowed from https://github.com/xairy/linux by
+ * Some code borrowed from https://github.com/xairy/kasan-prototype by
* Andrey Konovalov <adech.fo@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
@@ -22,6 +22,7 @@
#include <linux/string.h>
#include <linux/types.h>
#include <linux/kasan.h>
+#include <linux/module.h>
#include <asm/sections.h>
@@ -48,34 +49,49 @@ static const void *find_first_bad_addr(const void *addr, size_t size)
static void print_error_description(struct kasan_access_info *info)
{
- const char *bug_type = "unknown crash";
- u8 shadow_val;
+ const char *bug_type = "unknown-crash";
+ u8 *shadow_addr;
info->first_bad_addr = find_first_bad_addr(info->access_addr,
info->access_size);
- shadow_val = *(u8 *)kasan_mem_to_shadow(info->first_bad_addr);
+ shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr);
- switch (shadow_val) {
- case KASAN_FREE_PAGE:
- case KASAN_KMALLOC_FREE:
- bug_type = "use after free";
+ /*
+ * If shadow byte value is in [0, KASAN_SHADOW_SCALE_SIZE) we can look
+ * at the next shadow byte to determine the type of the bad access.
+ */
+ if (*shadow_addr > 0 && *shadow_addr <= KASAN_SHADOW_SCALE_SIZE - 1)
+ shadow_addr++;
+
+ switch (*shadow_addr) {
+ case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
+ /*
+ * In theory it's still possible to see these shadow values
+ * due to a data race in the kernel code.
+ */
+ bug_type = "out-of-bounds";
break;
case KASAN_PAGE_REDZONE:
case KASAN_KMALLOC_REDZONE:
+ bug_type = "slab-out-of-bounds";
+ break;
case KASAN_GLOBAL_REDZONE:
- case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
- bug_type = "out of bounds access";
+ bug_type = "global-out-of-bounds";
break;
case KASAN_STACK_LEFT:
case KASAN_STACK_MID:
case KASAN_STACK_RIGHT:
case KASAN_STACK_PARTIAL:
- bug_type = "out of bounds on stack";
+ bug_type = "stack-out-of-bounds";
+ break;
+ case KASAN_FREE_PAGE:
+ case KASAN_KMALLOC_FREE:
+ bug_type = "use-after-free";
break;
}
- pr_err("BUG: KASan: %s in %pS at addr %p\n",
+ pr_err("BUG: KASAN: %s in %pS at addr %p\n",
bug_type, (void *)info->ip,
info->access_addr);
pr_err("%s of size %zu by task %s/%d\n",
@@ -85,9 +101,11 @@ static void print_error_description(struct kasan_access_info *info)
static inline bool kernel_or_module_addr(const void *addr)
{
- return (addr >= (void *)_stext && addr < (void *)_end)
- || (addr >= (void *)MODULES_VADDR
- && addr < (void *)MODULES_END);
+ if (addr >= (void *)_stext && addr < (void *)_end)
+ return true;
+ if (is_module_address((unsigned long)addr))
+ return true;
+ return false;
}
static inline bool init_task_stack_addr(const void *addr)
@@ -161,15 +179,19 @@ static void print_shadow_for_address(const void *addr)
for (i = -SHADOW_ROWS_AROUND_ADDR; i <= SHADOW_ROWS_AROUND_ADDR; i++) {
const void *kaddr = kasan_shadow_to_mem(shadow_row);
char buffer[4 + (BITS_PER_LONG/8)*2];
+ char shadow_buf[SHADOW_BYTES_PER_ROW];
snprintf(buffer, sizeof(buffer),
(i == 0) ? ">%p: " : " %p: ", kaddr);
-
- kasan_disable_current();
+ /*
+ * We should not pass a shadow pointer to generic
+ * function, because generic functions may try to
+ * access kasan mapping for the passed address.
+ */
+ memcpy(shadow_buf, shadow_row, SHADOW_BYTES_PER_ROW);
print_hex_dump(KERN_ERR, buffer,
DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1,
- shadow_row, SHADOW_BYTES_PER_ROW, 0);
- kasan_enable_current();
+ shadow_buf, SHADOW_BYTES_PER_ROW, 0);
if (row_is_guilty(shadow_row, shadow))
pr_err("%*c\n",
@@ -182,37 +204,43 @@ static void print_shadow_for_address(const void *addr)
static DEFINE_SPINLOCK(report_lock);
-void kasan_report_error(struct kasan_access_info *info)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&report_lock, flags);
- pr_err("================================="
- "=================================\n");
- print_error_description(info);
- print_address_description(info);
- print_shadow_for_address(info->first_bad_addr);
- pr_err("================================="
- "=================================\n");
- spin_unlock_irqrestore(&report_lock, flags);
-}
-
-void kasan_report_user_access(struct kasan_access_info *info)
+static void kasan_report_error(struct kasan_access_info *info)
{
unsigned long flags;
+ const char *bug_type;
+ /*
+ * Make sure we don't end up in loop.
+ */
+ kasan_disable_current();
spin_lock_irqsave(&report_lock, flags);
pr_err("================================="
"=================================\n");
- pr_err("BUG: KASan: user-memory-access on address %p\n",
- info->access_addr);
- pr_err("%s of size %zu by task %s/%d\n",
- info->is_write ? "Write" : "Read",
- info->access_size, current->comm, task_pid_nr(current));
- dump_stack();
+ if (info->access_addr <
+ kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) {
+ if ((unsigned long)info->access_addr < PAGE_SIZE)
+ bug_type = "null-ptr-deref";
+ else if ((unsigned long)info->access_addr < TASK_SIZE)
+ bug_type = "user-memory-access";
+ else
+ bug_type = "wild-memory-access";
+ pr_err("BUG: KASAN: %s on address %p\n",
+ bug_type, info->access_addr);
+ pr_err("%s of size %zu by task %s/%d\n",
+ info->is_write ? "Write" : "Read",
+ info->access_size, current->comm,
+ task_pid_nr(current));
+ dump_stack();
+ } else {
+ print_error_description(info);
+ print_address_description(info);
+ print_shadow_for_address(info->first_bad_addr);
+ }
pr_err("================================="
"=================================\n");
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
spin_unlock_irqrestore(&report_lock, flags);
+ kasan_enable_current();
}
void kasan_report(unsigned long addr, size_t size,
@@ -220,13 +248,14 @@ void kasan_report(unsigned long addr, size_t size,
{
struct kasan_access_info info;
- if (likely(!kasan_enabled()))
+ if (likely(!kasan_report_enabled()))
return;
info.access_addr = (void *)addr;
info.access_size = size;
info.is_write = is_write;
info.ip = ip;
+
kasan_report_error(&info);
}
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 77191eccdc6f..19423a45d7d7 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -479,7 +479,7 @@ static void put_object(struct kmemleak_object *object)
static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
{
unsigned long flags;
- struct kmemleak_object *object = NULL;
+ struct kmemleak_object *object;
rcu_read_lock();
read_lock_irqsave(&kmemleak_lock, flags);
diff --git a/mm/ksm.c b/mm/ksm.c
index 7ee101eaacdf..b5cd647daa52 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -475,7 +475,8 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
flush_dcache_page(page);
} else {
put_page(page);
-out: page = NULL;
+out:
+ page = NULL;
}
up_read(&mm->mmap_sem);
return page;
@@ -625,7 +626,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
unlock_page(page);
put_page(page);
- if (stable_node->hlist.first)
+ if (!hlist_empty(&stable_node->hlist))
ksm_pages_sharing--;
else
ksm_pages_shared--;
@@ -1021,8 +1022,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
if (page == kpage) /* ksm page forked */
return 0;
- if (!(vma->vm_flags & VM_MERGEABLE))
- goto out;
if (PageTransCompound(page) && page_trans_compound_anon_split(page))
goto out;
BUG_ON(PageTransCompound(page));
@@ -1087,10 +1086,8 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
int err = -EFAULT;
down_read(&mm->mmap_sem);
- if (ksm_test_exit(mm))
- goto out;
- vma = find_vma(mm, rmap_item->address);
- if (!vma || vma->vm_start > rmap_item->address)
+ vma = find_mergeable_vma(mm, rmap_item->address);
+ if (!vma)
goto out;
err = try_to_merge_one_page(vma, page, kpage);
@@ -1177,8 +1174,18 @@ again:
cond_resched();
stable_node = rb_entry(*new, struct stable_node, node);
tree_page = get_ksm_page(stable_node, false);
- if (!tree_page)
- return NULL;
+ if (!tree_page) {
+ /*
+ * If we walked over a stale stable_node,
+ * get_ksm_page() will call rb_erase() and it
+ * may rebalance the tree from under us. So
+ * restart the search from scratch. Returning
+ * NULL would be safe too, but we'd generate
+ * false negative insertions just because some
+ * stable_node was stale.
+ */
+ goto again;
+ }
ret = memcmp_pages(page, tree_page);
put_page(tree_page);
@@ -1254,12 +1261,14 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
unsigned long kpfn;
struct rb_root *root;
struct rb_node **new;
- struct rb_node *parent = NULL;
+ struct rb_node *parent;
struct stable_node *stable_node;
kpfn = page_to_pfn(kpage);
nid = get_kpfn_nid(kpfn);
root = root_stable_tree + nid;
+again:
+ parent = NULL;
new = &root->rb_node;
while (*new) {
@@ -1269,8 +1278,18 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
cond_resched();
stable_node = rb_entry(*new, struct stable_node, node);
tree_page = get_ksm_page(stable_node, false);
- if (!tree_page)
- return NULL;
+ if (!tree_page) {
+ /*
+ * If we walked over a stale stable_node,
+ * get_ksm_page() will call rb_erase() and it
+ * may rebalance the tree from under us. So
+ * restart the search from scratch. Returning
+ * NULL would be safe too, but we'd generate
+ * false negative insertions just because some
+ * stable_node was stale.
+ */
+ goto again;
+ }
ret = memcmp_pages(kpage, tree_page);
put_page(tree_page);
@@ -1340,7 +1359,7 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
cond_resched();
tree_rmap_item = rb_entry(*new, struct rmap_item, node);
tree_page = get_mergeable_page(tree_rmap_item);
- if (IS_ERR_OR_NULL(tree_page))
+ if (!tree_page)
return NULL;
/*
@@ -1914,9 +1933,11 @@ again:
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
+ cond_resched();
anon_vma_lock_read(anon_vma);
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
0, ULONG_MAX) {
+ cond_resched();
vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
rmap_item->address >= vma->vm_end)
diff --git a/mm/list_lru.c b/mm/list_lru.c
index e1da19fac1b3..afc71ea9a381 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -42,6 +42,10 @@ static void list_lru_unregister(struct list_lru *lru)
#ifdef CONFIG_MEMCG_KMEM
static inline bool list_lru_memcg_aware(struct list_lru *lru)
{
+ /*
+ * This needs node 0 to be always present, even
+ * in the systems supporting sparse numa ids.
+ */
return !!lru->node[0].memcg_lrus;
}
@@ -59,6 +63,16 @@ list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
return &nlru->lru;
}
+static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
+{
+ struct page *page;
+
+ if (!memcg_kmem_enabled())
+ return NULL;
+ page = virt_to_head_page(ptr);
+ return page->mem_cgroup;
+}
+
static inline struct list_lru_one *
list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
{
@@ -377,16 +391,20 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
{
int i;
- for (i = 0; i < nr_node_ids; i++) {
- if (!memcg_aware)
- lru->node[i].memcg_lrus = NULL;
- else if (memcg_init_list_lru_node(&lru->node[i]))
+ if (!memcg_aware)
+ return 0;
+
+ for_each_node(i) {
+ if (memcg_init_list_lru_node(&lru->node[i]))
goto fail;
}
return 0;
fail:
- for (i = i - 1; i >= 0; i--)
+ for (i = i - 1; i >= 0; i--) {
+ if (!lru->node[i].memcg_lrus)
+ continue;
memcg_destroy_list_lru_node(&lru->node[i]);
+ }
return -ENOMEM;
}
@@ -397,7 +415,7 @@ static void memcg_destroy_list_lru(struct list_lru *lru)
if (!list_lru_memcg_aware(lru))
return;
- for (i = 0; i < nr_node_ids; i++)
+ for_each_node(i)
memcg_destroy_list_lru_node(&lru->node[i]);
}
@@ -409,16 +427,20 @@ static int memcg_update_list_lru(struct list_lru *lru,
if (!list_lru_memcg_aware(lru))
return 0;
- for (i = 0; i < nr_node_ids; i++) {
+ for_each_node(i) {
if (memcg_update_list_lru_node(&lru->node[i],
old_size, new_size))
goto fail;
}
return 0;
fail:
- for (i = i - 1; i >= 0; i--)
+ for (i = i - 1; i >= 0; i--) {
+ if (!lru->node[i].memcg_lrus)
+ continue;
+
memcg_cancel_update_list_lru_node(&lru->node[i],
old_size, new_size);
+ }
return -ENOMEM;
}
@@ -430,7 +452,7 @@ static void memcg_cancel_update_list_lru(struct list_lru *lru,
if (!list_lru_memcg_aware(lru))
return;
- for (i = 0; i < nr_node_ids; i++)
+ for_each_node(i)
memcg_cancel_update_list_lru_node(&lru->node[i],
old_size, new_size);
}
@@ -485,7 +507,7 @@ static void memcg_drain_list_lru(struct list_lru *lru,
if (!list_lru_memcg_aware(lru))
return;
- for (i = 0; i < nr_node_ids; i++)
+ for_each_node(i)
memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx);
}
@@ -522,7 +544,7 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
if (!lru->node)
goto out;
- for (i = 0; i < nr_node_ids; i++) {
+ for_each_node(i) {
spin_lock_init(&lru->node[i].lock);
if (key)
lockdep_set_class(&lru->node[i].lock, key);
diff --git a/mm/maccess.c b/mm/maccess.c
index 34fe24759ed1..d159b1c96e48 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -13,6 +13,11 @@
*
* Safely read from address @src to the buffer at @dst. If a kernel fault
* happens, handle that and return -EFAULT.
+ *
+ * We ensure that the copy_from_user is executed in atomic context so that
+ * do_page_fault() doesn't attempt to take mmap_sem. This makes
+ * probe_kernel_read() suitable for use within regions where the caller
+ * already holds mmap_sem, or other locks which nest inside mmap_sem.
*/
long __weak probe_kernel_read(void *dst, const void *src, size_t size)
@@ -99,5 +104,5 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
pagefault_enable();
set_fs(old_fs);
- return ret < 0 ? ret : src - unsafe_addr;
+ return ret ? -EFAULT : src - unsafe_addr;
}
diff --git a/mm/memblock.c b/mm/memblock.c
index 1c7b647e5897..d300f1329814 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -706,7 +706,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
return 0;
}
-int __init_memblock memblock_remove_range(struct memblock_type *type,
+static int __init_memblock memblock_remove_range(struct memblock_type *type,
phys_addr_t base, phys_addr_t size)
{
int start_rgn, end_rgn;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b732edfddb76..bc502e590366 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -62,6 +62,7 @@
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
+#include <linux/tracehook.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
@@ -1661,7 +1662,7 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
- if (!current->memcg_oom.may_oom)
+ if (!current->memcg_may_oom)
return;
/*
* We are in the middle of the charge context here, so we
@@ -1678,9 +1679,9 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
* and when we know whether the fault was overall successful.
*/
css_get(&memcg->css);
- current->memcg_oom.memcg = memcg;
- current->memcg_oom.gfp_mask = mask;
- current->memcg_oom.order = order;
+ current->memcg_in_oom = memcg;
+ current->memcg_oom_gfp_mask = mask;
+ current->memcg_oom_order = order;
}
/**
@@ -1702,7 +1703,7 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
*/
bool mem_cgroup_oom_synchronize(bool handle)
{
- struct mem_cgroup *memcg = current->memcg_oom.memcg;
+ struct mem_cgroup *memcg = current->memcg_in_oom;
struct oom_wait_info owait;
bool locked;
@@ -1730,8 +1731,8 @@ bool mem_cgroup_oom_synchronize(bool handle)
if (locked && !memcg->oom_kill_disable) {
mem_cgroup_unmark_under_oom(memcg);
finish_wait(&memcg_oom_waitq, &owait.wait);
- mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
- current->memcg_oom.order);
+ mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
+ current->memcg_oom_order);
} else {
schedule();
mem_cgroup_unmark_under_oom(memcg);
@@ -1748,7 +1749,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
memcg_oom_recover(memcg);
}
cleanup:
- current->memcg_oom.memcg = NULL;
+ current->memcg_in_oom = NULL;
css_put(&memcg->css);
return true;
}
@@ -1972,6 +1973,31 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
return NOTIFY_OK;
}
+/*
+ * Scheduled by try_charge() to be executed from the userland return path
+ * and reclaims memory over the high limit.
+ */
+void mem_cgroup_handle_over_high(void)
+{
+ unsigned int nr_pages = current->memcg_nr_pages_over_high;
+ struct mem_cgroup *memcg, *pos;
+
+ if (likely(!nr_pages))
+ return;
+
+ pos = memcg = get_mem_cgroup_from_mm(current->mm);
+
+ do {
+ if (page_counter_read(&pos->memory) <= pos->high)
+ continue;
+ mem_cgroup_events(pos, MEMCG_HIGH, 1);
+ try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true);
+ } while ((pos = parent_mem_cgroup(pos)));
+
+ css_put(&memcg->css);
+ current->memcg_nr_pages_over_high = 0;
+}
+
static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int nr_pages)
{
@@ -1982,17 +2008,16 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned long nr_reclaimed;
bool may_swap = true;
bool drained = false;
- int ret = 0;
if (mem_cgroup_is_root(memcg))
- goto done;
+ return 0;
retry:
if (consume_stock(memcg, nr_pages))
- goto done;
+ return 0;
if (!do_swap_account ||
- !page_counter_try_charge(&memcg->memsw, batch, &counter)) {
- if (!page_counter_try_charge(&memcg->memory, batch, &counter))
+ page_counter_try_charge(&memcg->memsw, batch, &counter)) {
+ if (page_counter_try_charge(&memcg->memory, batch, &counter))
goto done_restock;
if (do_swap_account)
page_counter_uncharge(&memcg->memsw, batch);
@@ -2016,7 +2041,7 @@ retry:
if (unlikely(test_thread_flag(TIF_MEMDIE) ||
fatal_signal_pending(current) ||
current->flags & PF_EXITING))
- goto bypass;
+ goto force;
if (unlikely(task_in_memcg_oom(current)))
goto nomem;
@@ -2062,38 +2087,54 @@ retry:
goto retry;
if (gfp_mask & __GFP_NOFAIL)
- goto bypass;
+ goto force;
if (fatal_signal_pending(current))
- goto bypass;
+ goto force;
mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);
- mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));
+ mem_cgroup_oom(mem_over_limit, gfp_mask,
+ get_order(nr_pages * PAGE_SIZE));
nomem:
if (!(gfp_mask & __GFP_NOFAIL))
return -ENOMEM;
-bypass:
- return -EINTR;
+force:
+ /*
+ * The allocation either can't fail or will lead to more memory
+ * being freed very soon. Allow memory usage go over the limit
+ * temporarily by force charging it.
+ */
+ page_counter_charge(&memcg->memory, nr_pages);
+ if (do_swap_account)
+ page_counter_charge(&memcg->memsw, nr_pages);
+ css_get_many(&memcg->css, nr_pages);
+
+ return 0;
done_restock:
css_get_many(&memcg->css, batch);
if (batch > nr_pages)
refill_stock(memcg, batch - nr_pages);
- if (!(gfp_mask & __GFP_WAIT))
- goto done;
+
/*
- * If the hierarchy is above the normal consumption range,
- * make the charging task trim their excess contribution.
+ * If the hierarchy is above the normal consumption range, schedule
+ * reclaim on returning to userland. We can perform reclaim here
+ * if __GFP_WAIT but let's always punt for simplicity and so that
+ * GFP_KERNEL can consistently be used during reclaim. @memcg is
+ * not recorded as it most likely matches current's and won't
+ * change in the meantime. As high limit is checked again before
+ * reclaim, the cost of mismatch is negligible.
*/
do {
- if (page_counter_read(&memcg->memory) <= memcg->high)
- continue;
- mem_cgroup_events(memcg, MEMCG_HIGH, 1);
- try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
+ if (page_counter_read(&memcg->memory) > memcg->high) {
+ current->memcg_nr_pages_over_high += nr_pages;
+ set_notify_resume(current);
+ break;
+ }
} while ((memcg = parent_mem_cgroup(memcg)));
-done:
- return ret;
+
+ return 0;
}
static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
@@ -2174,55 +2215,6 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
}
#ifdef CONFIG_MEMCG_KMEM
-int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
- unsigned long nr_pages)
-{
- struct page_counter *counter;
- int ret = 0;
-
- ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
- if (ret < 0)
- return ret;
-
- ret = try_charge(memcg, gfp, nr_pages);
- if (ret == -EINTR) {
- /*
- * try_charge() chose to bypass to root due to OOM kill or
- * fatal signal. Since our only options are to either fail
- * the allocation or charge it to this cgroup, do it as a
- * temporary condition. But we can't fail. From a kmem/slab
- * perspective, the cache has already been selected, by
- * mem_cgroup_kmem_get_cache(), so it is too late to change
- * our minds.
- *
- * This condition will only trigger if the task entered
- * memcg_charge_kmem in a sane state, but was OOM-killed
- * during try_charge() above. Tasks that were already dying
- * when the allocation triggers should have been already
- * directed to the root cgroup in memcontrol.h
- */
- page_counter_charge(&memcg->memory, nr_pages);
- if (do_swap_account)
- page_counter_charge(&memcg->memsw, nr_pages);
- css_get_many(&memcg->css, nr_pages);
- ret = 0;
- } else if (ret)
- page_counter_uncharge(&memcg->kmem, nr_pages);
-
- return ret;
-}
-
-void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
-{
- page_counter_uncharge(&memcg->memory, nr_pages);
- if (do_swap_account)
- page_counter_uncharge(&memcg->memsw, nr_pages);
-
- page_counter_uncharge(&memcg->kmem, nr_pages);
-
- css_put_many(&memcg->css, nr_pages);
-}
-
static int memcg_alloc_cache_id(void)
{
int id, size;
@@ -2384,85 +2376,58 @@ void __memcg_kmem_put_cache(struct kmem_cache *cachep)
css_put(&cachep->memcg_params.memcg->css);
}
-/*
- * We need to verify if the allocation against current->mm->owner's memcg is
- * possible for the given order. But the page is not allocated yet, so we'll
- * need a further commit step to do the final arrangements.
- *
- * It is possible for the task to switch cgroups in this mean time, so at
- * commit time, we can't rely on task conversion any longer. We'll then use
- * the handle argument to return to the caller which cgroup we should commit
- * against. We could also return the memcg directly and avoid the pointer
- * passing, but a boolean return value gives better semantics considering
- * the compiled-out case as well.
- *
- * Returning true means the allocation is possible.
- */
-bool
-__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
+int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
+ struct mem_cgroup *memcg)
{
- struct mem_cgroup *memcg;
+ unsigned int nr_pages = 1 << order;
+ struct page_counter *counter;
int ret;
- *_memcg = NULL;
+ if (!memcg_kmem_is_active(memcg))
+ return 0;
- memcg = get_mem_cgroup_from_mm(current->mm);
+ if (!page_counter_try_charge(&memcg->kmem, nr_pages, &counter))
+ return -ENOMEM;
- if (!memcg_kmem_is_active(memcg)) {
- css_put(&memcg->css);
- return true;
+ ret = try_charge(memcg, gfp, nr_pages);
+ if (ret) {
+ page_counter_uncharge(&memcg->kmem, nr_pages);
+ return ret;
}
- ret = memcg_charge_kmem(memcg, gfp, 1 << order);
- if (!ret)
- *_memcg = memcg;
+ page->mem_cgroup = memcg;
- css_put(&memcg->css);
- return (ret == 0);
+ return 0;
}
-void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
- int order)
+int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
{
- VM_BUG_ON(mem_cgroup_is_root(memcg));
+ struct mem_cgroup *memcg;
+ int ret;
- /* The page allocation failed. Revert */
- if (!page) {
- memcg_uncharge_kmem(memcg, 1 << order);
- return;
- }
- page->mem_cgroup = memcg;
+ memcg = get_mem_cgroup_from_mm(current->mm);
+ ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
+ css_put(&memcg->css);
+ return ret;
}
-void __memcg_kmem_uncharge_pages(struct page *page, int order)
+void __memcg_kmem_uncharge(struct page *page, int order)
{
struct mem_cgroup *memcg = page->mem_cgroup;
+ unsigned int nr_pages = 1 << order;
if (!memcg)
return;
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
- memcg_uncharge_kmem(memcg, 1 << order);
- page->mem_cgroup = NULL;
-}
-
-struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
-{
- struct mem_cgroup *memcg = NULL;
- struct kmem_cache *cachep;
- struct page *page;
-
- page = virt_to_head_page(ptr);
- if (PageSlab(page)) {
- cachep = page->slab_cache;
- if (!is_root_cache(cachep))
- memcg = cachep->memcg_params.memcg;
- } else
- /* page allocated by alloc_kmem_pages */
- memcg = page->mem_cgroup;
+ page_counter_uncharge(&memcg->kmem, nr_pages);
+ page_counter_uncharge(&memcg->memory, nr_pages);
+ if (do_swap_account)
+ page_counter_uncharge(&memcg->memsw, nr_pages);
- return memcg;
+ page->mem_cgroup = NULL;
+ css_put_many(&memcg->css, nr_pages);
}
#endif /* CONFIG_MEMCG_KMEM */
@@ -2836,9 +2801,9 @@ static unsigned long tree_stat(struct mem_cgroup *memcg,
return val;
}
-static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
+static inline unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
- u64 val;
+ unsigned long val;
if (mem_cgroup_is_root(memcg)) {
val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
@@ -2851,7 +2816,7 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
else
val = page_counter_read(&memcg->memsw);
}
- return val << PAGE_SHIFT;
+ return val;
}
enum {
@@ -2885,9 +2850,9 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
switch (MEMFILE_ATTR(cft->private)) {
case RES_USAGE:
if (counter == &memcg->memory)
- return mem_cgroup_usage(memcg, false);
+ return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
if (counter == &memcg->memsw)
- return mem_cgroup_usage(memcg, true);
+ return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
return (u64)page_counter_read(counter) * PAGE_SIZE;
case RES_LIMIT:
return (u64)counter->limit * PAGE_SIZE;
@@ -3387,7 +3352,6 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
ret = page_counter_memparse(args, "-1", &threshold);
if (ret)
return ret;
- threshold <<= PAGE_SHIFT;
mutex_lock(&memcg->thresholds_lock);
@@ -4406,22 +4370,10 @@ static int mem_cgroup_do_precharge(unsigned long count)
mc.precharge += count;
return ret;
}
- if (ret == -EINTR) {
- cancel_charge(root_mem_cgroup, count);
- return ret;
- }
/* Try charges one by one with reclaim */
while (count--) {
ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
- /*
- * In case of failure, any residual charges against
- * mc.to will be dropped by mem_cgroup_clear_mc()
- * later on. However, cancel any charges that are
- * bypassed to root right away or they'll be lost.
- */
- if (ret == -EINTR)
- cancel_charge(root_mem_cgroup, 1);
if (ret)
return ret;
mc.precharge++;
@@ -4576,9 +4528,8 @@ static int mem_cgroup_move_account(struct page *page,
goto out;
/*
- * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
- * of its source page while we change it: page migration takes
- * both pages off the LRU, but page cache replacement doesn't.
+ * Prevent mem_cgroup_replace_page() from looking at
+ * page->mem_cgroup of its source page while we change it.
*/
if (!trylock_page(page))
goto out;
@@ -5085,7 +5036,9 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
static u64 memory_current_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
- return mem_cgroup_usage(mem_cgroup_from_css(css), false);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
}
static int memory_low_show(struct seq_file *m, void *v)
@@ -5197,6 +5150,7 @@ static int memory_events_show(struct seq_file *m, void *v)
static struct cftype memory_files[] = {
{
.name = "current",
+ .flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = memory_current_read,
},
{
@@ -5340,11 +5294,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
ret = try_charge(memcg, gfp_mask, nr_pages);
css_put(&memcg->css);
-
- if (ret == -EINTR) {
- memcg = root_mem_cgroup;
- ret = 0;
- }
out:
*memcgp = memcg;
return ret;
@@ -5559,7 +5508,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
}
/**
- * mem_cgroup_migrate - migrate a charge to another page
+ * mem_cgroup_replace_page - migrate a charge to another page
* @oldpage: currently charged page
* @newpage: page to transfer the charge to
* @lrucare: either or both pages might be on the LRU already
@@ -5568,16 +5517,13 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
*
* Both pages must be locked, @newpage->mapping must be set up.
*/
-void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
- bool lrucare)
+void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
{
struct mem_cgroup *memcg;
int isolated;
VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
- VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage);
- VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage);
VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
newpage);
@@ -5589,25 +5535,16 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
if (newpage->mem_cgroup)
return;
- /*
- * Swapcache readahead pages can get migrated before being
- * charged, and migration from compaction can happen to an
- * uncharged page when the PFN walker finds a page that
- * reclaim just put back on the LRU but has not released yet.
- */
+ /* Swapcache readahead pages can get replaced before being charged */
memcg = oldpage->mem_cgroup;
if (!memcg)
return;
- if (lrucare)
- lock_page_lru(oldpage, &isolated);
-
+ lock_page_lru(oldpage, &isolated);
oldpage->mem_cgroup = NULL;
+ unlock_page_lru(oldpage, isolated);
- if (lrucare)
- unlock_page_lru(oldpage, isolated);
-
- commit_charge(newpage, memcg, lrucare);
+ commit_charge(newpage, memcg, true);
}
/*
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 95882692e747..16a0ec385320 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -56,6 +56,7 @@
#include <linux/memory_hotplug.h>
#include <linux/mm_inline.h>
#include <linux/kfifo.h>
+#include <linux/ratelimit.h>
#include "internal.h"
#include "ras/ras_event.h"
@@ -1403,6 +1404,12 @@ static int __init memory_failure_init(void)
}
core_initcall(memory_failure_init);
+#define unpoison_pr_info(fmt, pfn, rs) \
+({ \
+ if (__ratelimit(rs)) \
+ pr_info(fmt, pfn); \
+})
+
/**
* unpoison_memory - Unpoison a previously poisoned page
* @pfn: Page number of the to be unpoisoned page
@@ -1421,6 +1428,8 @@ int unpoison_memory(unsigned long pfn)
struct page *p;
int freeit = 0;
unsigned int nr_pages;
+ static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
if (!pfn_valid(pfn))
return -ENXIO;
@@ -1429,23 +1438,26 @@ int unpoison_memory(unsigned long pfn)
page = compound_head(p);
if (!PageHWPoison(p)) {
- pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
+ unpoison_pr_info("MCE: Page was already unpoisoned %#lx\n",
+ pfn, &unpoison_rs);
return 0;
}
if (page_count(page) > 1) {
- pr_info("MCE: Someone grabs the hwpoison page %#lx\n", pfn);
+ unpoison_pr_info("MCE: Someone grabs the hwpoison page %#lx\n",
+ pfn, &unpoison_rs);
return 0;
}
if (page_mapped(page)) {
- pr_info("MCE: Someone maps the hwpoison page %#lx\n", pfn);
+ unpoison_pr_info("MCE: Someone maps the hwpoison page %#lx\n",
+ pfn, &unpoison_rs);
return 0;
}
if (page_mapping(page)) {
- pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n",
- pfn);
+ unpoison_pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n",
+ pfn, &unpoison_rs);
return 0;
}
@@ -1455,7 +1467,8 @@ int unpoison_memory(unsigned long pfn)
* In such case, we yield to memory_failure() and make unpoison fail.
*/
if (!PageHuge(page) && PageTransHuge(page)) {
- pr_info("MCE: Memory failure is now running on %#lx\n", pfn);
+ unpoison_pr_info("MCE: Memory failure is now running on %#lx\n",
+ pfn, &unpoison_rs);
return 0;
}
@@ -1469,12 +1482,14 @@ int unpoison_memory(unsigned long pfn)
* to the end.
*/
if (PageHuge(page)) {
- pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
+ unpoison_pr_info("MCE: Memory failure is now running on free hugepage %#lx\n",
+ pfn, &unpoison_rs);
return 0;
}
if (TestClearPageHWPoison(p))
num_poisoned_pages_dec();
- pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
+ unpoison_pr_info("MCE: Software-unpoisoned free page %#lx\n",
+ pfn, &unpoison_rs);
return 0;
}
@@ -1486,7 +1501,8 @@ int unpoison_memory(unsigned long pfn)
* the free buddy page pool.
*/
if (TestClearPageHWPoison(page)) {
- pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
+ unpoison_pr_info("MCE: Software-unpoisoned page %#lx\n",
+ pfn, &unpoison_rs);
num_poisoned_pages_sub(nr_pages);
freeit = 1;
if (PageHuge(page))
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0780d118d26e..67d488ab495e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -339,8 +339,8 @@ static int __ref ensure_zone_is_initialized(struct zone *zone,
unsigned long start_pfn, unsigned long num_pages)
{
if (!zone_is_initialized(zone))
- return init_currently_empty_zone(zone, start_pfn, num_pages,
- MEMMAP_HOTPLUG);
+ return init_currently_empty_zone(zone, start_pfn, num_pages);
+
return 0;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index 842ecd7aaf7f..2834faba719a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1,5 +1,5 @@
/*
- * Memory Migration functionality - linux/mm/migration.c
+ * Memory Migration functionality - linux/mm/migrate.c
*
* Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
*
@@ -30,7 +30,7 @@
#include <linux/mempolicy.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
-#include <linux/memcontrol.h>
+#include <linux/backing-dev.h>
#include <linux/syscalls.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
@@ -171,6 +171,9 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
else
page_add_file_rmap(new);
+ if (vma->vm_flags & VM_LOCKED)
+ mlock_vma_page(new);
+
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, addr, ptep);
unlock:
@@ -311,6 +314,8 @@ int migrate_page_move_mapping(struct address_space *mapping,
struct buffer_head *head, enum migrate_mode mode,
int extra_count)
{
+ struct zone *oldzone, *newzone;
+ int dirty;
int expected_count = 1 + extra_count;
void **pslot;
@@ -318,9 +323,20 @@ int migrate_page_move_mapping(struct address_space *mapping,
/* Anonymous page without mapping */
if (page_count(page) != expected_count)
return -EAGAIN;
+
+ /* No turning back from here */
+ set_page_memcg(newpage, page_memcg(page));
+ newpage->index = page->index;
+ newpage->mapping = page->mapping;
+ if (PageSwapBacked(page))
+ SetPageSwapBacked(newpage);
+
return MIGRATEPAGE_SUCCESS;
}
+ oldzone = page_zone(page);
+ newzone = page_zone(newpage);
+
spin_lock_irq(&mapping->tree_lock);
pslot = radix_tree_lookup_slot(&mapping->page_tree,
@@ -353,14 +369,28 @@ int migrate_page_move_mapping(struct address_space *mapping,
}
/*
- * Now we know that no one else is looking at the page.
+ * Now we know that no one else is looking at the page:
+ * no turning back from here.
*/
+ set_page_memcg(newpage, page_memcg(page));
+ newpage->index = page->index;
+ newpage->mapping = page->mapping;
+ if (PageSwapBacked(page))
+ SetPageSwapBacked(newpage);
+
get_page(newpage); /* add cache reference */
if (PageSwapCache(page)) {
SetPageSwapCache(newpage);
set_page_private(newpage, page_private(page));
}
+ /* Move dirty while page refs frozen and newpage not yet exposed */
+ dirty = PageDirty(page);
+ if (dirty) {
+ ClearPageDirty(page);
+ SetPageDirty(newpage);
+ }
+
radix_tree_replace_slot(pslot, newpage);
/*
@@ -370,6 +400,9 @@ int migrate_page_move_mapping(struct address_space *mapping,
*/
page_unfreeze_refs(page, expected_count - 1);
+ spin_unlock(&mapping->tree_lock);
+ /* Leave irq disabled to prevent preemption while updating stats */
+
/*
* If moved to a different zone then also account
* the page for that zone. Other VM counters will be
@@ -380,13 +413,19 @@ int migrate_page_move_mapping(struct address_space *mapping,
* via NR_FILE_PAGES and NR_ANON_PAGES if they
* are mapped to swap space.
*/
- __dec_zone_page_state(page, NR_FILE_PAGES);
- __inc_zone_page_state(newpage, NR_FILE_PAGES);
- if (!PageSwapCache(page) && PageSwapBacked(page)) {
- __dec_zone_page_state(page, NR_SHMEM);
- __inc_zone_page_state(newpage, NR_SHMEM);
+ if (newzone != oldzone) {
+ __dec_zone_state(oldzone, NR_FILE_PAGES);
+ __inc_zone_state(newzone, NR_FILE_PAGES);
+ if (PageSwapBacked(page) && !PageSwapCache(page)) {
+ __dec_zone_state(oldzone, NR_SHMEM);
+ __inc_zone_state(newzone, NR_SHMEM);
+ }
+ if (dirty && mapping_cap_account_dirty(mapping)) {
+ __dec_zone_state(oldzone, NR_FILE_DIRTY);
+ __inc_zone_state(newzone, NR_FILE_DIRTY);
+ }
}
- spin_unlock_irq(&mapping->tree_lock);
+ local_irq_enable();
return MIGRATEPAGE_SUCCESS;
}
@@ -401,12 +440,6 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
int expected_count;
void **pslot;
- if (!mapping) {
- if (page_count(page) != 1)
- return -EAGAIN;
- return MIGRATEPAGE_SUCCESS;
- }
-
spin_lock_irq(&mapping->tree_lock);
pslot = radix_tree_lookup_slot(&mapping->page_tree,
@@ -424,6 +457,9 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
return -EAGAIN;
}
+ set_page_memcg(newpage, page_memcg(page));
+ newpage->index = page->index;
+ newpage->mapping = page->mapping;
get_page(newpage);
radix_tree_replace_slot(pslot, newpage);
@@ -510,20 +546,9 @@ void migrate_page_copy(struct page *newpage, struct page *page)
if (PageMappedToDisk(page))
SetPageMappedToDisk(newpage);
- if (PageDirty(page)) {
- clear_page_dirty_for_io(page);
- /*
- * Want to mark the page and the radix tree as dirty, and
- * redo the accounting that clear_page_dirty_for_io undid,
- * but we can't use set_page_dirty because that function
- * is actually a signal that all of the page has become dirty.
- * Whereas only part of our page may be dirty.
- */
- if (PageSwapBacked(page))
- SetPageDirty(newpage);
- else
- __set_page_dirty_nobuffers(newpage);
- }
+ /* Move dirty on pages not done by migrate_page_move_mapping() */
+ if (PageDirty(page))
+ SetPageDirty(newpage);
if (page_is_young(page))
set_page_young(newpage);
@@ -537,7 +562,6 @@ void migrate_page_copy(struct page *newpage, struct page *page)
cpupid = page_cpupid_xchg_last(page, -1);
page_cpupid_xchg_last(newpage, cpupid);
- mlock_migrate_page(newpage, page);
ksm_migrate_page(newpage, page);
/*
* Please do not reorder this without considering how mm/ksm.c's
@@ -721,33 +745,13 @@ static int fallback_migrate_page(struct address_space *mapping,
* MIGRATEPAGE_SUCCESS - success
*/
static int move_to_new_page(struct page *newpage, struct page *page,
- int page_was_mapped, enum migrate_mode mode)
+ enum migrate_mode mode)
{
struct address_space *mapping;
int rc;
- /*
- * Block others from accessing the page when we get around to
- * establishing additional references. We are the only one
- * holding a reference to the new page at this point.
- */
- if (!trylock_page(newpage))
- BUG();
-
- /* Prepare mapping for the new page.*/
- newpage->index = page->index;
- newpage->mapping = page->mapping;
- if (PageSwapBacked(page))
- SetPageSwapBacked(newpage);
-
- /*
- * Indirectly called below, migrate_page_copy() copies PG_dirty and thus
- * needs newpage's memcg set to transfer memcg dirty page accounting.
- * So perform memcg migration in two steps:
- * 1. set newpage->mem_cgroup (here)
- * 2. clear page->mem_cgroup (below)
- */
- set_page_memcg(newpage, page_memcg(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
mapping = page_mapping(page);
if (!mapping)
@@ -759,23 +763,19 @@ static int move_to_new_page(struct page *newpage, struct page *page,
* space which also has its own migratepage callback. This
* is the most common path for page migration.
*/
- rc = mapping->a_ops->migratepage(mapping,
- newpage, page, mode);
+ rc = mapping->a_ops->migratepage(mapping, newpage, page, mode);
else
rc = fallback_migrate_page(mapping, newpage, page, mode);
- if (rc != MIGRATEPAGE_SUCCESS) {
- set_page_memcg(newpage, NULL);
- newpage->mapping = NULL;
- } else {
+ /*
+ * When successful, old pagecache page->mapping must be cleared before
+ * page is freed; but stats require that PageAnon be left as PageAnon.
+ */
+ if (rc == MIGRATEPAGE_SUCCESS) {
set_page_memcg(page, NULL);
- if (page_was_mapped)
- remove_migration_ptes(page, newpage);
- page->mapping = NULL;
+ if (!PageAnon(page))
+ page->mapping = NULL;
}
-
- unlock_page(newpage);
-
return rc;
}
@@ -824,6 +824,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
goto out_unlock;
wait_on_page_writeback(page);
}
+
/*
* By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
* we cannot notice that anon_vma is freed while we migrates a page.
@@ -831,34 +832,26 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
* of migration. File cache pages are no problem because of page_lock()
* File Caches may use write_page() or lock_page() in migration, then,
* just care Anon page here.
+ *
+ * Only page_get_anon_vma() understands the subtleties of
+ * getting a hold on an anon_vma from outside one of its mms.
+ * But if we cannot get anon_vma, then we won't need it anyway,
+ * because that implies that the anon page is no longer mapped
+ * (and cannot be remapped so long as we hold the page lock).
*/
- if (PageAnon(page) && !PageKsm(page)) {
- /*
- * Only page_lock_anon_vma_read() understands the subtleties of
- * getting a hold on an anon_vma from outside one of its mms.
- */
+ if (PageAnon(page) && !PageKsm(page))
anon_vma = page_get_anon_vma(page);
- if (anon_vma) {
- /*
- * Anon page
- */
- } else if (PageSwapCache(page)) {
- /*
- * We cannot be sure that the anon_vma of an unmapped
- * swapcache page is safe to use because we don't
- * know in advance if the VMA that this page belonged
- * to still exists. If the VMA and others sharing the
- * data have been freed, then the anon_vma could
- * already be invalid.
- *
- * To avoid this possibility, swapcache pages get
- * migrated but are not remapped when migration
- * completes
- */
- } else {
- goto out_unlock;
- }
- }
+
+ /*
+ * Block others from accessing the new page when we get around to
+ * establishing additional references. We are usually the only one
+ * holding a reference to newpage at this point. We used to have a BUG
+ * here if trylock_page(newpage) fails, but would like to allow for
+ * cases where there might be a race with the previous use of newpage.
+ * This is much like races on refcount of oldpage: just don't BUG().
+ */
+ if (unlikely(!trylock_page(newpage)))
+ goto out_unlock;
if (unlikely(isolated_balloon_page(page))) {
/*
@@ -869,7 +862,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
* the page migration right away (proteced by page lock).
*/
rc = balloon_page_migrate(newpage, page, mode);
- goto out_unlock;
+ goto out_unlock_both;
}
/*
@@ -888,30 +881,30 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
VM_BUG_ON_PAGE(PageAnon(page), page);
if (page_has_private(page)) {
try_to_free_buffers(page);
- goto out_unlock;
+ goto out_unlock_both;
}
- goto skip_unmap;
- }
-
- /* Establish migration ptes or remove ptes */
- if (page_mapped(page)) {
+ } else if (page_mapped(page)) {
+ /* Establish migration ptes */
+ VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
+ page);
try_to_unmap(page,
TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
page_was_mapped = 1;
}
-skip_unmap:
if (!page_mapped(page))
- rc = move_to_new_page(newpage, page, page_was_mapped, mode);
+ rc = move_to_new_page(newpage, page, mode);
- if (rc && page_was_mapped)
- remove_migration_ptes(page, page);
+ if (page_was_mapped)
+ remove_migration_ptes(page,
+ rc == MIGRATEPAGE_SUCCESS ? newpage : page);
+out_unlock_both:
+ unlock_page(newpage);
+out_unlock:
/* Drop an anon_vma reference if we took one */
if (anon_vma)
put_anon_vma(anon_vma);
-
-out_unlock:
unlock_page(page);
out:
return rc;
@@ -937,10 +930,11 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
int force, enum migrate_mode mode,
enum migrate_reason reason)
{
- int rc = 0;
+ int rc = MIGRATEPAGE_SUCCESS;
int *result = NULL;
- struct page *newpage = get_new_page(page, private, &result);
+ struct page *newpage;
+ newpage = get_new_page(page, private, &result);
if (!newpage)
return -ENOMEM;
@@ -954,6 +948,8 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
goto out;
rc = __unmap_and_move(page, newpage, force, mode);
+ if (rc == MIGRATEPAGE_SUCCESS)
+ put_new_page = NULL;
out:
if (rc != -EAGAIN) {
@@ -980,10 +976,9 @@ out:
* it. Otherwise, putback_lru_page() will drop the reference grabbed
* during isolation.
*/
- if (rc != MIGRATEPAGE_SUCCESS && put_new_page) {
- ClearPageSwapBacked(newpage);
+ if (put_new_page)
put_new_page(newpage, private);
- } else if (unlikely(__is_movable_balloon_page(newpage))) {
+ else if (unlikely(__is_movable_balloon_page(newpage))) {
/* drop our reference, page already in the balloon */
put_page(newpage);
} else
@@ -1021,7 +1016,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
struct page *hpage, int force,
enum migrate_mode mode)
{
- int rc = 0;
+ int rc = -EAGAIN;
int *result = NULL;
int page_was_mapped = 0;
struct page *new_hpage;
@@ -1043,8 +1038,6 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
if (!new_hpage)
return -ENOMEM;
- rc = -EAGAIN;
-
if (!trylock_page(hpage)) {
if (!force || mode != MIGRATE_SYNC)
goto out;
@@ -1054,6 +1047,9 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
if (PageAnon(hpage))
anon_vma = page_get_anon_vma(hpage);
+ if (unlikely(!trylock_page(new_hpage)))
+ goto put_anon;
+
if (page_mapped(hpage)) {
try_to_unmap(hpage,
TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
@@ -1061,16 +1057,22 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
}
if (!page_mapped(hpage))
- rc = move_to_new_page(new_hpage, hpage, page_was_mapped, mode);
+ rc = move_to_new_page(new_hpage, hpage, mode);
+
+ if (page_was_mapped)
+ remove_migration_ptes(hpage,
+ rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage);
- if (rc != MIGRATEPAGE_SUCCESS && page_was_mapped)
- remove_migration_ptes(hpage, hpage);
+ unlock_page(new_hpage);
+put_anon:
if (anon_vma)
put_anon_vma(anon_vma);
- if (rc == MIGRATEPAGE_SUCCESS)
+ if (rc == MIGRATEPAGE_SUCCESS) {
hugetlb_cgroup_migrate(hpage, new_hpage);
+ put_new_page = NULL;
+ }
unlock_page(hpage);
out:
@@ -1082,7 +1084,7 @@ out:
* it. Otherwise, put_page() will drop the reference grabbed during
* isolation.
*/
- if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+ if (put_new_page)
put_new_page(new_hpage, private);
else
putback_active_hugepage(new_hpage);
@@ -1112,7 +1114,7 @@ out:
*
* The function returns after 10 attempts or if no pages are movable any more
* because the list has become empty or no retryable pages exist any more.
- * The caller should call putback_lru_pages() to return pages to the LRU
+ * The caller should call putback_movable_pages() to return pages to the LRU
* or free list only if ret != 0.
*
* Returns the number of pages that were not migrated, or an error code.
@@ -1169,7 +1171,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
}
}
}
- rc = nr_failed + retry;
+ nr_failed += retry;
+ rc = nr_failed;
out:
if (nr_succeeded)
count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
@@ -1786,7 +1789,6 @@ fail_putback:
SetPageActive(page);
if (TestClearPageUnevictable(new_page))
SetPageUnevictable(page);
- mlock_migrate_page(page, new_page);
unlock_page(new_page);
put_page(new_page); /* Free it */
@@ -1828,8 +1830,9 @@ fail_putback:
goto fail_putback;
}
- mem_cgroup_migrate(page, new_page, false);
-
+ mlock_migrate_page(new_page, page);
+ set_page_memcg(new_page, page_memcg(page));
+ set_page_memcg(page, NULL);
page_remove_rmap(page);
spin_unlock(ptl);
diff --git a/mm/mincore.c b/mm/mincore.c
index be25efde64a4..14bb9fb37f0c 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -234,7 +234,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
/* This also avoids any overflows on PAGE_CACHE_ALIGN */
pages = len >> PAGE_SHIFT;
- pages += (len & ~PAGE_MASK) != 0;
+ pages += (offset_in_page(len)) != 0;
if (!access_ok(VERIFY_WRITE, vec, pages))
return -EFAULT;
diff --git a/mm/mlock.c b/mm/mlock.c
index 25936680064f..339d9e0949b6 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -422,7 +422,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
void munlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
- vma->vm_flags &= ~VM_LOCKED;
+ vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
while (start < end) {
struct page *page = NULL;
@@ -506,7 +506,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
- goto out; /* don't set VM_LOCKED, don't count */
+ /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
+ goto out;
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
@@ -554,13 +555,14 @@ out:
return ret;
}
-static int do_mlock(unsigned long start, size_t len, int on)
+static int apply_vma_lock_flags(unsigned long start, size_t len,
+ vm_flags_t flags)
{
unsigned long nstart, end, tmp;
struct vm_area_struct * vma, * prev;
int error;
- VM_BUG_ON(start & ~PAGE_MASK);
+ VM_BUG_ON(offset_in_page(start));
VM_BUG_ON(len != PAGE_ALIGN(len));
end = start + len;
if (end < start)
@@ -576,14 +578,11 @@ static int do_mlock(unsigned long start, size_t len, int on)
prev = vma;
for (nstart = start ; ; ) {
- vm_flags_t newflags;
+ vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
- /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
-
- newflags = vma->vm_flags & ~VM_LOCKED;
- if (on)
- newflags |= VM_LOCKED;
+ newflags |= flags;
+ /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
tmp = vma->vm_end;
if (tmp > end)
tmp = end;
@@ -605,7 +604,7 @@ static int do_mlock(unsigned long start, size_t len, int on)
return error;
}
-SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
+static int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
{
unsigned long locked;
unsigned long lock_limit;
@@ -616,7 +615,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
lru_add_drain_all(); /* flush pagevec */
- len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
+ len = PAGE_ALIGN(len + (offset_in_page(start)));
start &= PAGE_MASK;
lock_limit = rlimit(RLIMIT_MEMLOCK);
@@ -629,7 +628,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
/* check against resource limits */
if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
- error = do_mlock(start, len, 1);
+ error = apply_vma_lock_flags(start, len, flags);
up_write(&current->mm->mmap_sem);
if (error)
@@ -641,37 +640,75 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
return 0;
}
+SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
+{
+ return do_mlock(start, len, VM_LOCKED);
+}
+
+SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags)
+{
+ vm_flags_t vm_flags = VM_LOCKED;
+
+ if (flags & ~MLOCK_ONFAULT)
+ return -EINVAL;
+
+ if (flags & MLOCK_ONFAULT)
+ vm_flags |= VM_LOCKONFAULT;
+
+ return do_mlock(start, len, vm_flags);
+}
+
SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
{
int ret;
- len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
+ len = PAGE_ALIGN(len + (offset_in_page(start)));
start &= PAGE_MASK;
down_write(&current->mm->mmap_sem);
- ret = do_mlock(start, len, 0);
+ ret = apply_vma_lock_flags(start, len, 0);
up_write(&current->mm->mmap_sem);
return ret;
}
-static int do_mlockall(int flags)
+/*
+ * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall)
+ * and translate into the appropriate modifications to mm->def_flags and/or the
+ * flags for all current VMAs.
+ *
+ * There are a couple of subtleties with this. If mlockall() is called multiple
+ * times with different flags, the values do not necessarily stack. If mlockall
+ * is called once including the MCL_FUTURE flag and then a second time without
+ * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags.
+ */
+static int apply_mlockall_flags(int flags)
{
struct vm_area_struct * vma, * prev = NULL;
+ vm_flags_t to_add = 0;
- if (flags & MCL_FUTURE)
+ current->mm->def_flags &= VM_LOCKED_CLEAR_MASK;
+ if (flags & MCL_FUTURE) {
current->mm->def_flags |= VM_LOCKED;
- else
- current->mm->def_flags &= ~VM_LOCKED;
- if (flags == MCL_FUTURE)
- goto out;
+
+ if (flags & MCL_ONFAULT)
+ current->mm->def_flags |= VM_LOCKONFAULT;
+
+ if (!(flags & MCL_CURRENT))
+ goto out;
+ }
+
+ if (flags & MCL_CURRENT) {
+ to_add |= VM_LOCKED;
+ if (flags & MCL_ONFAULT)
+ to_add |= VM_LOCKONFAULT;
+ }
for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
vm_flags_t newflags;
- newflags = vma->vm_flags & ~VM_LOCKED;
- if (flags & MCL_CURRENT)
- newflags |= VM_LOCKED;
+ newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
+ newflags |= to_add;
/* Ignore errors */
mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
@@ -684,14 +721,13 @@ out:
SYSCALL_DEFINE1(mlockall, int, flags)
{
unsigned long lock_limit;
- int ret = -EINVAL;
+ int ret;
- if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE)))
- goto out;
+ if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)))
+ return -EINVAL;
- ret = -EPERM;
if (!can_do_mlock())
- goto out;
+ return -EPERM;
if (flags & MCL_CURRENT)
lru_add_drain_all(); /* flush pagevec */
@@ -704,11 +740,11 @@ SYSCALL_DEFINE1(mlockall, int, flags)
if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
capable(CAP_IPC_LOCK))
- ret = do_mlockall(flags);
+ ret = apply_mlockall_flags(flags);
up_write(&current->mm->mmap_sem);
if (!ret && (flags & MCL_CURRENT))
mm_populate(0, TASK_SIZE);
-out:
+
return ret;
}
@@ -717,7 +753,7 @@ SYSCALL_DEFINE0(munlockall)
int ret;
down_write(&current->mm->mmap_sem);
- ret = do_mlockall(0);
+ ret = apply_mlockall_flags(0);
up_write(&current->mm->mmap_sem);
return ret;
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 79bcc9f92e48..2ce04a649f6b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1302,7 +1302,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
* that it represents a valid section of the address space.
*/
addr = get_unmapped_area(file, addr, len, pgoff, flags);
- if (addr & ~PAGE_MASK)
+ if (offset_in_page(addr))
return addr;
/* Do simple checking here so the lower-level routines won't have
@@ -1412,13 +1412,13 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
unsigned long, fd, unsigned long, pgoff)
{
struct file *file = NULL;
- unsigned long retval = -EBADF;
+ unsigned long retval;
if (!(flags & MAP_ANONYMOUS)) {
audit_mmap_fd(fd, flags);
file = fget(fd);
if (!file)
- goto out;
+ return -EBADF;
if (is_file_hugepages(file))
len = ALIGN(len, huge_page_size(hstate_file(file)));
retval = -EINVAL;
@@ -1453,7 +1453,6 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
out_fput:
if (file)
fput(file);
-out:
return retval;
}
@@ -1473,7 +1472,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
if (copy_from_user(&a, arg, sizeof(a)))
return -EFAULT;
- if (a.offset & ~PAGE_MASK)
+ if (offset_in_page(a.offset))
return -EINVAL;
return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
@@ -1562,7 +1561,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
}
/* Clear old maps */
- error = -ENOMEM;
while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
&rb_parent)) {
if (do_munmap(mm, addr, len))
@@ -1663,7 +1661,7 @@ out:
vma == get_gate_vma(current->mm)))
mm->locked_vm += (len >> PAGE_SHIFT);
else
- vma->vm_flags &= ~VM_LOCKED;
+ vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
}
if (file)
@@ -1989,7 +1987,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
* can happen with large stack limits and large mmap()
* allocations.
*/
- if (addr & ~PAGE_MASK) {
+ if (offset_in_page(addr)) {
VM_BUG_ON(addr != -ENOMEM);
info.flags = 0;
info.low_limit = TASK_UNMAPPED_BASE;
@@ -2025,7 +2023,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
if (addr > TASK_SIZE - len)
return -ENOMEM;
- if (addr & ~PAGE_MASK)
+ if (offset_in_page(addr))
return -EINVAL;
addr = arch_rebalance_pgtables(addr, len);
@@ -2047,7 +2045,6 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
return vma;
rb_node = mm->mm_rb.rb_node;
- vma = NULL;
while (rb_node) {
struct vm_area_struct *tmp;
@@ -2139,10 +2136,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
if (security_vm_enough_memory_mm(mm, grow))
return -ENOMEM;
- /* Ok, everything looks good - let it rip */
- if (vma->vm_flags & VM_LOCKED)
- mm->locked_vm += grow;
- vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
return 0;
}
@@ -2153,6 +2146,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
*/
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
+ struct mm_struct *mm = vma->vm_mm;
int error;
if (!(vma->vm_flags & VM_GROWSUP))
@@ -2202,15 +2196,19 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
* So, we reuse mm->page_table_lock to guard
* against concurrent vma expansions.
*/
- spin_lock(&vma->vm_mm->page_table_lock);
+ spin_lock(&mm->page_table_lock);
+ if (vma->vm_flags & VM_LOCKED)
+ mm->locked_vm += grow;
+ vm_stat_account(mm, vma->vm_flags,
+ vma->vm_file, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
anon_vma_interval_tree_post_update_vma(vma);
if (vma->vm_next)
vma_gap_update(vma->vm_next);
else
- vma->vm_mm->highest_vm_end = address;
- spin_unlock(&vma->vm_mm->page_table_lock);
+ mm->highest_vm_end = address;
+ spin_unlock(&mm->page_table_lock);
perf_event_mmap(vma);
}
@@ -2218,7 +2216,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
}
vma_unlock_anon_vma(vma);
khugepaged_enter_vma_merge(vma, vma->vm_flags);
- validate_mm(vma->vm_mm);
+ validate_mm(mm);
return error;
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -2229,6 +2227,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
int expand_downwards(struct vm_area_struct *vma,
unsigned long address)
{
+ struct mm_struct *mm = vma->vm_mm;
int error;
/*
@@ -2273,13 +2272,17 @@ int expand_downwards(struct vm_area_struct *vma,
* So, we reuse mm->page_table_lock to guard
* against concurrent vma expansions.
*/
- spin_lock(&vma->vm_mm->page_table_lock);
+ spin_lock(&mm->page_table_lock);
+ if (vma->vm_flags & VM_LOCKED)
+ mm->locked_vm += grow;
+ vm_stat_account(mm, vma->vm_flags,
+ vma->vm_file, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_start = address;
vma->vm_pgoff -= grow;
anon_vma_interval_tree_post_update_vma(vma);
vma_gap_update(vma);
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(&mm->page_table_lock);
perf_event_mmap(vma);
}
@@ -2287,7 +2290,7 @@ int expand_downwards(struct vm_area_struct *vma,
}
vma_unlock_anon_vma(vma);
khugepaged_enter_vma_merge(vma, vma->vm_flags);
- validate_mm(vma->vm_mm);
+ validate_mm(mm);
return error;
}
@@ -2536,7 +2539,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
unsigned long end;
struct vm_area_struct *vma, *prev, *last;
- if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
+ if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
return -EINVAL;
len = PAGE_ALIGN(len);
@@ -2734,7 +2737,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
- if (error & ~PAGE_MASK)
+ if (offset_in_page(error))
return error;
error = mlock_future_check(mm, mm->def_flags, len);
@@ -3049,8 +3052,8 @@ static int special_mapping_fault(struct vm_area_struct *vma,
static struct vm_area_struct *__install_special_mapping(
struct mm_struct *mm,
unsigned long addr, unsigned long len,
- unsigned long vm_flags, const struct vm_operations_struct *ops,
- void *priv)
+ unsigned long vm_flags, void *priv,
+ const struct vm_operations_struct *ops)
{
int ret;
struct vm_area_struct *vma;
@@ -3099,8 +3102,8 @@ struct vm_area_struct *_install_special_mapping(
unsigned long addr, unsigned long len,
unsigned long vm_flags, const struct vm_special_mapping *spec)
{
- return __install_special_mapping(mm, addr, len, vm_flags,
- &special_mapping_vmops, (void *)spec);
+ return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec,
+ &special_mapping_vmops);
}
int install_special_mapping(struct mm_struct *mm,
@@ -3108,8 +3111,8 @@ int install_special_mapping(struct mm_struct *mm,
unsigned long vm_flags, struct page **pages)
{
struct vm_area_struct *vma = __install_special_mapping(
- mm, addr, len, vm_flags, &legacy_special_mapping_vmops,
- (void *)pages);
+ mm, addr, len, vm_flags, (void *)pages,
+ &legacy_special_mapping_vmops);
return PTR_ERR_OR_ZERO(vma);
}
diff --git a/mm/mremap.c b/mm/mremap.c
index 5a71cce8c6ea..c25bc6268e46 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -401,7 +401,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
unsigned long charged = 0;
unsigned long map_flags;
- if (new_addr & ~PAGE_MASK)
+ if (offset_in_page(new_addr))
goto out;
if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
@@ -435,11 +435,11 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
((addr - vma->vm_start) >> PAGE_SHIFT),
map_flags);
- if (ret & ~PAGE_MASK)
+ if (offset_in_page(ret))
goto out1;
ret = move_vma(vma, addr, old_len, new_len, new_addr, locked);
- if (!(ret & ~PAGE_MASK))
+ if (!(offset_in_page(ret)))
goto out;
out1:
vm_unacct_memory(charged);
@@ -484,7 +484,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
return ret;
- if (addr & ~PAGE_MASK)
+ if (offset_in_page(addr))
return ret;
old_len = PAGE_ALIGN(old_len);
@@ -566,7 +566,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
vma->vm_pgoff +
((addr - vma->vm_start) >> PAGE_SHIFT),
map_flags);
- if (new_addr & ~PAGE_MASK) {
+ if (offset_in_page(new_addr)) {
ret = new_addr;
goto out;
}
@@ -574,7 +574,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
}
out:
- if (ret & ~PAGE_MASK) {
+ if (offset_in_page(ret)) {
vm_unacct_memory(charged);
locked = 0;
}
diff --git a/mm/msync.c b/mm/msync.c
index bb04d53ae852..24e612fefa04 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -38,7 +38,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
goto out;
- if (start & ~PAGE_MASK)
+ if (offset_in_page(start))
goto out;
if ((flags & MS_ASYNC) && (flags & MS_SYNC))
goto out;
diff --git a/mm/nommu.c b/mm/nommu.c
index ab14a2014dea..92be862c859b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -578,16 +578,16 @@ static noinline void validate_nommu_regions(void)
return;
last = rb_entry(lastp, struct vm_region, vm_rb);
- BUG_ON(unlikely(last->vm_end <= last->vm_start));
- BUG_ON(unlikely(last->vm_top < last->vm_end));
+ BUG_ON(last->vm_end <= last->vm_start);
+ BUG_ON(last->vm_top < last->vm_end);
while ((p = rb_next(lastp))) {
region = rb_entry(p, struct vm_region, vm_rb);
last = rb_entry(lastp, struct vm_region, vm_rb);
- BUG_ON(unlikely(region->vm_end <= region->vm_start));
- BUG_ON(unlikely(region->vm_top < region->vm_end));
- BUG_ON(unlikely(region->vm_start < last->vm_top));
+ BUG_ON(region->vm_end <= region->vm_start);
+ BUG_ON(region->vm_top < region->vm_end);
+ BUG_ON(region->vm_start < last->vm_top);
lastp = p;
}
@@ -1497,7 +1497,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
if (copy_from_user(&a, arg, sizeof(a)))
return -EFAULT;
- if (a.offset & ~PAGE_MASK)
+ if (offset_in_page(a.offset))
return -EINVAL;
return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
@@ -1653,9 +1653,9 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
goto erase_whole_vma;
if (start < vma->vm_start || end > vma->vm_end)
return -EINVAL;
- if (start & ~PAGE_MASK)
+ if (offset_in_page(start))
return -EINVAL;
- if (end != vma->vm_end && end & ~PAGE_MASK)
+ if (end != vma->vm_end && offset_in_page(end))
return -EINVAL;
if (start != vma->vm_start && end != vma->vm_end) {
ret = split_vma(mm, vma, start, 1);
@@ -1736,7 +1736,7 @@ static unsigned long do_mremap(unsigned long addr,
if (old_len == 0 || new_len == 0)
return (unsigned long) -EINVAL;
- if (addr & ~PAGE_MASK)
+ if (offset_in_page(addr))
return -EINVAL;
if (flags & MREMAP_FIXED && new_addr != addr)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 1ecc0bcaecc5..e4778285d8d1 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -377,13 +377,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
static void dump_header(struct oom_control *oc, struct task_struct *p,
struct mem_cgroup *memcg)
{
- task_lock(current);
pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
"oom_score_adj=%hd\n",
current->comm, oc->gfp_mask, oc->order,
current->signal->oom_score_adj);
- cpuset_print_task_mems_allowed(current);
- task_unlock(current);
+ cpuset_print_current_mems_allowed();
dump_stack();
if (memcg)
mem_cgroup_print_oom_info(memcg, p);
@@ -476,6 +474,24 @@ void oom_killer_enable(void)
oom_killer_disabled = false;
}
+/*
+ * task->mm can be NULL if the task is the exited group leader. So to
+ * determine whether the task is using a particular mm, we examine all the
+ * task's threads: if one of those is using this mm then this task was also
+ * using it.
+ */
+static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
+{
+ struct task_struct *t;
+
+ for_each_thread(p, t) {
+ struct mm_struct *t_mm = READ_ONCE(t->mm);
+ if (t_mm)
+ return t_mm == mm;
+ }
+ return false;
+}
+
#define K(x) ((x) << (PAGE_SHIFT-10))
/*
* Must be called while holding a reference to p, which will be released upon
@@ -509,10 +525,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
if (__ratelimit(&oom_rs))
dump_header(oc, p, memcg);
- task_lock(p);
pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
message, task_pid_nr(p), p->comm, points);
- task_unlock(p);
/*
* If any of p's children has a different mm and is eligible for kill,
@@ -525,7 +539,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
list_for_each_entry(child, &t->children, sibling) {
unsigned int child_points;
- if (child->mm == p->mm)
+ if (process_shares_mm(child, p->mm))
continue;
/*
* oom_badness() returns 0 if the thread is unkillable
@@ -552,8 +566,15 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
victim = p;
}
- /* mm cannot safely be dereferenced after task_unlock(victim) */
+ /* Get a reference to safely compare mm after task_unlock(victim) */
mm = victim->mm;
+ atomic_inc(&mm->mm_count);
+ /*
+ * We should send SIGKILL before setting TIF_MEMDIE in order to prevent
+ * the OOM victim from depleting the memory reserves from the user
+ * space under its control.
+ */
+ do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
mark_oom_victim(victim);
pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
@@ -571,21 +592,21 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
* pending fatal signal.
*/
rcu_read_lock();
- for_each_process(p)
- if (p->mm == mm && !same_thread_group(p, victim) &&
- !(p->flags & PF_KTHREAD)) {
- if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
- continue;
+ for_each_process(p) {
+ if (!process_shares_mm(p, mm))
+ continue;
+ if (same_thread_group(p, victim))
+ continue;
+ if (unlikely(p->flags & PF_KTHREAD))
+ continue;
+ if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+ continue;
- task_lock(p); /* Protect ->comm from prctl() */
- pr_err("Kill process %d (%s) sharing same memory\n",
- task_pid_nr(p), p->comm);
- task_unlock(p);
- do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
- }
+ do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
+ }
rcu_read_unlock();
- do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
+ mmdrop(mm);
put_task_struct(victim);
}
#undef K
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 805bbad2e24e..446bb36ee59d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3428,24 +3428,24 @@ EXPORT_SYMBOL(__free_page_frag);
struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order)
{
struct page *page;
- struct mem_cgroup *memcg = NULL;
- if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
- return NULL;
page = alloc_pages(gfp_mask, order);
- memcg_kmem_commit_charge(page, memcg, order);
+ if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
+ __free_pages(page, order);
+ page = NULL;
+ }
return page;
}
struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
{
struct page *page;
- struct mem_cgroup *memcg = NULL;
- if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
- return NULL;
page = alloc_pages_node(nid, gfp_mask, order);
- memcg_kmem_commit_charge(page, memcg, order);
+ if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
+ __free_pages(page, order);
+ page = NULL;
+ }
return page;
}
@@ -3455,7 +3455,7 @@ struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
*/
void __free_kmem_pages(struct page *page, unsigned int order)
{
- memcg_kmem_uncharge_pages(page, order);
+ memcg_kmem_uncharge(page, order);
__free_pages(page, order);
}
@@ -4900,8 +4900,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
int __meminit init_currently_empty_zone(struct zone *zone,
unsigned long zone_start_pfn,
- unsigned long size,
- enum memmap_context context)
+ unsigned long size)
{
struct pglist_data *pgdat = zone->zone_pgdat;
int ret;
@@ -5413,8 +5412,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
set_pageblock_order();
setup_usemap(pgdat, zone, zone_start_pfn, size);
- ret = init_currently_empty_zone(zone, zone_start_pfn,
- size, MEMMAP_EARLY);
+ ret = init_currently_empty_zone(zone, zone_start_pfn, size);
BUG_ON(ret);
memmap_init(size, nid, j, zone_start_pfn);
zone_start_pfn += size;
@@ -5423,6 +5421,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
{
+ unsigned long __maybe_unused offset = 0;
+
/* Skip empty nodes */
if (!pgdat->node_spanned_pages)
return;
@@ -5439,6 +5439,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
* for the buddy allocator to function correctly.
*/
start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
+ offset = pgdat->node_start_pfn - start;
end = pgdat_end_pfn(pgdat);
end = ALIGN(end, MAX_ORDER_NR_PAGES);
size = (end - start) * sizeof(struct page);
@@ -5446,7 +5447,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
if (!map)
map = memblock_virt_alloc_node_nopanic(size,
pgdat->node_id);
- pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
+ pgdat->node_mem_map = map + offset;
}
#ifndef CONFIG_NEED_MULTIPLE_NODES
/*
@@ -5454,9 +5455,9 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
*/
if (pgdat == NODE_DATA(0)) {
mem_map = NODE_DATA(0)->node_mem_map;
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM)
if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
- mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
+ mem_map -= offset;
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
}
#endif
@@ -5668,13 +5669,17 @@ static void __init find_zone_movable_pfns_for_nodes(void)
*/
required_movablecore =
roundup(required_movablecore, MAX_ORDER_NR_PAGES);
+ required_movablecore = min(totalpages, required_movablecore);
corepages = totalpages - required_movablecore;
required_kernelcore = max(required_kernelcore, corepages);
}
- /* If kernelcore was not specified, there is no ZONE_MOVABLE */
- if (!required_kernelcore)
+ /*
+ * If kernelcore was not specified or kernelcore size is larger
+ * than totalpages, there is no ZONE_MOVABLE.
+ */
+ if (!required_kernelcore || required_kernelcore >= totalpages)
goto out;
/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
diff --git a/mm/page_counter.c b/mm/page_counter.c
index 11b4beda14ba..7c6a63d2c27f 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -56,12 +56,12 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
* @nr_pages: number of pages to charge
* @fail: points first counter to hit its limit, if any
*
- * Returns 0 on success, or -ENOMEM and @fail if the counter or one of
- * its ancestors has hit its configured limit.
+ * Returns %true on success, or %false and @fail if the counter or one
+ * of its ancestors has hit its configured limit.
*/
-int page_counter_try_charge(struct page_counter *counter,
- unsigned long nr_pages,
- struct page_counter **fail)
+bool page_counter_try_charge(struct page_counter *counter,
+ unsigned long nr_pages,
+ struct page_counter **fail)
{
struct page_counter *c;
@@ -99,13 +99,13 @@ int page_counter_try_charge(struct page_counter *counter,
if (new > c->watermark)
c->watermark = new;
}
- return 0;
+ return true;
failed:
for (c = counter; c != *fail; c = c->parent)
page_counter_cancel(c, nr_pages);
- return -ENOMEM;
+ return false;
}
/**
diff --git a/mm/percpu.c b/mm/percpu.c
index a63b4d82a141..8a943b97a053 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1554,12 +1554,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
#ifdef CONFIG_SMP
PCPU_SETUP_BUG_ON(!ai->static_size);
- PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK);
+ PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
#endif
PCPU_SETUP_BUG_ON(!base_addr);
- PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK);
+ PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
- PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
+ PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
@@ -1806,7 +1806,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
alloc_size = roundup(min_unit_size, atom_size);
upa = alloc_size / min_unit_size;
- while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+ while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
upa--;
max_upa = upa;
@@ -1838,7 +1838,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
for (upa = max_upa; upa; upa--) {
int allocs = 0, wasted = 0;
- if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+ if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
continue;
for (group = 0; group < nr_groups; group++) {
diff --git a/mm/readahead.c b/mm/readahead.c
index 24682f6f4cfd..998ad592f408 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -213,7 +213,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
return -EINVAL;
- nr_to_read = max_sane_readahead(nr_to_read);
+ nr_to_read = min(nr_to_read, inode_to_bdi(mapping->host)->ra_pages);
while (nr_to_read) {
int err;
@@ -232,16 +232,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
return 0;
}
-#define MAX_READAHEAD ((512*4096)/PAGE_CACHE_SIZE)
-/*
- * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
- * sensible upper limit.
- */
-unsigned long max_sane_readahead(unsigned long nr)
-{
- return min(nr, MAX_READAHEAD);
-}
-
/*
* Set the initial window size, round to next power of 2 and square
* for small size, x 4 for medium, and x 2 for large
@@ -380,7 +370,7 @@ ondemand_readahead(struct address_space *mapping,
bool hit_readahead_marker, pgoff_t offset,
unsigned long req_size)
{
- unsigned long max = max_sane_readahead(ra->ra_pages);
+ unsigned long max = ra->ra_pages;
pgoff_t prev_offset;
/*
diff --git a/mm/rmap.c b/mm/rmap.c
index f5b5c1f3dcd7..b577fbb98d4b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1304,6 +1304,10 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
int ret = SWAP_AGAIN;
enum ttu_flags flags = (enum ttu_flags)arg;
+ /* munlock has nothing to gain from examining un-locked vmas */
+ if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
+ goto out;
+
pte = page_check_address(page, mm, address, &ptl, 0);
if (!pte)
goto out;
@@ -1314,9 +1318,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* skipped over this mm) then we should reactivate it.
*/
if (!(flags & TTU_IGNORE_MLOCK)) {
- if (vma->vm_flags & VM_LOCKED)
- goto out_mlock;
-
+ if (vma->vm_flags & VM_LOCKED) {
+ /* Holding pte lock, we do *not* need mmap_sem here */
+ mlock_vma_page(page);
+ ret = SWAP_MLOCK;
+ goto out_unmap;
+ }
if (flags & TTU_MUNLOCK)
goto out_unmap;
}
@@ -1352,7 +1359,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
update_hiwater_rss(mm);
if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
- if (!PageHuge(page)) {
+ if (PageHuge(page)) {
+ hugetlb_count_sub(1 << compound_order(page), mm);
+ } else {
if (PageAnon(page))
dec_mm_counter(mm, MM_ANONPAGES);
else
@@ -1370,47 +1379,44 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
dec_mm_counter(mm, MM_ANONPAGES);
else
dec_mm_counter(mm, MM_FILEPAGES);
+ } else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) {
+ swp_entry_t entry;
+ pte_t swp_pte;
+ /*
+ * Store the pfn of the page in a special migration
+ * pte. do_swap_page() will wait until the migration
+ * pte is removed and then restart fault handling.
+ */
+ entry = make_migration_entry(page, pte_write(pteval));
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ set_pte_at(mm, address, pte, swp_pte);
} else if (PageAnon(page)) {
swp_entry_t entry = { .val = page_private(page) };
pte_t swp_pte;
-
- if (PageSwapCache(page)) {
- /*
- * Store the swap location in the pte.
- * See handle_pte_fault() ...
- */
- if (swap_duplicate(entry) < 0) {
- set_pte_at(mm, address, pte, pteval);
- ret = SWAP_FAIL;
- goto out_unmap;
- }
- if (list_empty(&mm->mmlist)) {
- spin_lock(&mmlist_lock);
- if (list_empty(&mm->mmlist))
- list_add(&mm->mmlist, &init_mm.mmlist);
- spin_unlock(&mmlist_lock);
- }
- dec_mm_counter(mm, MM_ANONPAGES);
- inc_mm_counter(mm, MM_SWAPENTS);
- } else if (IS_ENABLED(CONFIG_MIGRATION)) {
- /*
- * Store the pfn of the page in a special migration
- * pte. do_swap_page() will wait until the migration
- * pte is removed and then restart fault handling.
- */
- BUG_ON(!(flags & TTU_MIGRATION));
- entry = make_migration_entry(page, pte_write(pteval));
+ /*
+ * Store the swap location in the pte.
+ * See handle_pte_fault() ...
+ */
+ VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+ if (swap_duplicate(entry) < 0) {
+ set_pte_at(mm, address, pte, pteval);
+ ret = SWAP_FAIL;
+ goto out_unmap;
+ }
+ if (list_empty(&mm->mmlist)) {
+ spin_lock(&mmlist_lock);
+ if (list_empty(&mm->mmlist))
+ list_add(&mm->mmlist, &init_mm.mmlist);
+ spin_unlock(&mmlist_lock);
}
+ dec_mm_counter(mm, MM_ANONPAGES);
+ inc_mm_counter(mm, MM_SWAPENTS);
swp_pte = swp_entry_to_pte(entry);
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
set_pte_at(mm, address, pte, swp_pte);
- } else if (IS_ENABLED(CONFIG_MIGRATION) &&
- (flags & TTU_MIGRATION)) {
- /* Establish migration entry for a file page */
- swp_entry_t entry;
- entry = make_migration_entry(page, pte_write(pteval));
- set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
} else
dec_mm_counter(mm, MM_FILEPAGES);
@@ -1419,31 +1425,10 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
out_unmap:
pte_unmap_unlock(pte, ptl);
- if (ret != SWAP_FAIL && !(flags & TTU_MUNLOCK))
+ if (ret != SWAP_FAIL && ret != SWAP_MLOCK && !(flags & TTU_MUNLOCK))
mmu_notifier_invalidate_page(mm, address);
out:
return ret;
-
-out_mlock:
- pte_unmap_unlock(pte, ptl);
-
-
- /*
- * We need mmap_sem locking, Otherwise VM_LOCKED check makes
- * unstable result and race. Plus, We can't wait here because
- * we now hold anon_vma->rwsem or mapping->i_mmap_rwsem.
- * if trylock failed, the page remain in evictable lru and later
- * vmscan could retry to move the page to unevictable lru if the
- * page is actually mlocked.
- */
- if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
- if (vma->vm_flags & VM_LOCKED) {
- mlock_vma_page(page);
- ret = SWAP_MLOCK;
- }
- up_read(&vma->vm_mm->mmap_sem);
- }
- return ret;
}
bool is_vma_temporary_stack(struct vm_area_struct *vma)
@@ -1607,6 +1592,8 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
+ cond_resched();
+
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
continue;
@@ -1656,6 +1643,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
+ cond_resched();
+
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
continue;
diff --git a/mm/shmem.c b/mm/shmem.c
index 48ce82926d93..3b8b73928398 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -548,12 +548,12 @@ static int shmem_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct inode *inode = dentry->d_inode;
struct shmem_inode_info *info = SHMEM_I(inode);
- spin_lock(&info->lock);
- shmem_recalc_inode(inode);
- spin_unlock(&info->lock);
-
+ if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
+ spin_lock(&info->lock);
+ shmem_recalc_inode(inode);
+ spin_unlock(&info->lock);
+ }
generic_fillattr(inode, stat);
-
return 0;
}
@@ -586,10 +586,16 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
}
if (newsize <= oldsize) {
loff_t holebegin = round_up(newsize, PAGE_SIZE);
- unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
- shmem_truncate_range(inode, newsize, (loff_t)-1);
+ if (oldsize > holebegin)
+ unmap_mapping_range(inode->i_mapping,
+ holebegin, 0, 1);
+ if (info->alloced)
+ shmem_truncate_range(inode,
+ newsize, (loff_t)-1);
/* unmap again to remove racily COWed private pages */
- unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
+ if (oldsize > holebegin)
+ unmap_mapping_range(inode->i_mapping,
+ holebegin, 0, 1);
}
}
@@ -1023,7 +1029,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
*/
oldpage = newpage;
} else {
- mem_cgroup_migrate(oldpage, newpage, true);
+ mem_cgroup_replace_page(oldpage, newpage);
lru_cache_add_anon(newpage);
*pagep = newpage;
}
diff --git a/mm/slab.c b/mm/slab.c
index 4fcc5dd8d5a6..272e809404d5 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -282,6 +282,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
#define CFLGS_OFF_SLAB (0x80000000UL)
#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
+#define OFF_SLAB_MIN_SIZE (max_t(size_t, PAGE_SIZE >> 5, KMALLOC_MIN_SIZE + 1))
#define BATCHREFILL_LIMIT 16
/*
@@ -1592,16 +1593,17 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
flags |= __GFP_RECLAIMABLE;
- if (memcg_charge_slab(cachep, flags, cachep->gfporder))
- return NULL;
-
page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
if (!page) {
- memcg_uncharge_slab(cachep, cachep->gfporder);
slab_out_of_memory(cachep, flags, nodeid);
return NULL;
}
+ if (memcg_charge_slab(page, flags, cachep->gfporder, cachep)) {
+ __free_pages(page, cachep->gfporder);
+ return NULL;
+ }
+
/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
if (page_is_pfmemalloc(page))
pfmemalloc_active = true;
@@ -1653,8 +1655,7 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += nr_freed;
- __free_pages(page, cachep->gfporder);
- memcg_uncharge_slab(cachep, cachep->gfporder);
+ __free_kmem_pages(page, cachep->gfporder);
}
static void kmem_rcu_free(struct rcu_head *head)
@@ -2212,7 +2213,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
* it too early on. Always use on-slab management when
* SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
*/
- if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init &&
+ if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init &&
!(flags & SLAB_NOLEAKTRACE))
/*
* Size is large, assume best to place the slab management obj
@@ -2276,7 +2277,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
/*
* This is a possibility for one of the kmalloc_{dma,}_caches.
* But since we go off slab only for object size greater than
- * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created
+ * OFF_SLAB_MIN_SIZE, and kmalloc_{dma,}_caches get created
* in ascending order,this should not happen at all.
* But leave a BUG_ON for some lucky dude.
*/
diff --git a/mm/slab.h b/mm/slab.h
index a3a967d7d7c2..27492eb678f7 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -181,10 +181,6 @@ bool __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
list_for_each_entry(iter, &(root)->memcg_params.list, \
memcg_params.list)
-#define for_each_memcg_cache_safe(iter, tmp, root) \
- list_for_each_entry_safe(iter, tmp, &(root)->memcg_params.list, \
- memcg_params.list)
-
static inline bool is_root_cache(struct kmem_cache *s)
{
return s->memcg_params.is_root_cache;
@@ -240,23 +236,16 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
return s->memcg_params.root_cache;
}
-static __always_inline int memcg_charge_slab(struct kmem_cache *s,
- gfp_t gfp, int order)
+static __always_inline int memcg_charge_slab(struct page *page,
+ gfp_t gfp, int order,
+ struct kmem_cache *s)
{
if (!memcg_kmem_enabled())
return 0;
if (is_root_cache(s))
return 0;
- return memcg_charge_kmem(s->memcg_params.memcg, gfp, 1 << order);
-}
-
-static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
-{
- if (!memcg_kmem_enabled())
- return;
- if (is_root_cache(s))
- return;
- memcg_uncharge_kmem(s->memcg_params.memcg, 1 << order);
+ return __memcg_kmem_charge_memcg(page, gfp, order,
+ s->memcg_params.memcg);
}
extern void slab_init_memcg_params(struct kmem_cache *);
@@ -265,8 +254,6 @@ extern void slab_init_memcg_params(struct kmem_cache *);
#define for_each_memcg_cache(iter, root) \
for ((void)(iter), (void)(root); 0; )
-#define for_each_memcg_cache_safe(iter, tmp, root) \
- for ((void)(iter), (void)(tmp), (void)(root); 0; )
static inline bool is_root_cache(struct kmem_cache *s)
{
@@ -295,15 +282,12 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
return s;
}
-static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order)
+static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order,
+ struct kmem_cache *s)
{
return 0;
}
-static inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
-{
-}
-
static inline void slab_init_memcg_params(struct kmem_cache *s)
{
}
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 5ce4faeb16fb..d88e97c10a2e 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -316,10 +316,10 @@ unsigned long calculate_alignment(unsigned long flags,
return ALIGN(align, sizeof(void *));
}
-static struct kmem_cache *
-do_kmem_cache_create(const char *name, size_t object_size, size_t size,
- size_t align, unsigned long flags, void (*ctor)(void *),
- struct mem_cgroup *memcg, struct kmem_cache *root_cache)
+static struct kmem_cache *create_cache(const char *name,
+ size_t object_size, size_t size, size_t align,
+ unsigned long flags, void (*ctor)(void *),
+ struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
struct kmem_cache *s;
int err;
@@ -384,7 +384,7 @@ struct kmem_cache *
kmem_cache_create(const char *name, size_t size, size_t align,
unsigned long flags, void (*ctor)(void *))
{
- struct kmem_cache *s;
+ struct kmem_cache *s = NULL;
const char *cache_name;
int err;
@@ -396,7 +396,6 @@ kmem_cache_create(const char *name, size_t size, size_t align,
err = kmem_cache_sanity_check(name, size);
if (err) {
- s = NULL; /* suppress uninit var warning */
goto out_unlock;
}
@@ -418,9 +417,9 @@ kmem_cache_create(const char *name, size_t size, size_t align,
goto out_unlock;
}
- s = do_kmem_cache_create(cache_name, size, size,
- calculate_alignment(flags, align, size),
- flags, ctor, NULL, NULL);
+ s = create_cache(cache_name, size, size,
+ calculate_alignment(flags, align, size),
+ flags, ctor, NULL, NULL);
if (IS_ERR(s)) {
err = PTR_ERR(s);
kfree_const(cache_name);
@@ -448,29 +447,20 @@ out_unlock:
}
EXPORT_SYMBOL(kmem_cache_create);
-static int do_kmem_cache_shutdown(struct kmem_cache *s,
+static int shutdown_cache(struct kmem_cache *s,
struct list_head *release, bool *need_rcu_barrier)
{
- if (__kmem_cache_shutdown(s) != 0) {
- printk(KERN_ERR "kmem_cache_destroy %s: "
- "Slab cache still has objects\n", s->name);
- dump_stack();
+ if (__kmem_cache_shutdown(s) != 0)
return -EBUSY;
- }
if (s->flags & SLAB_DESTROY_BY_RCU)
*need_rcu_barrier = true;
-#ifdef CONFIG_MEMCG_KMEM
- if (!is_root_cache(s))
- list_del(&s->memcg_params.list);
-#endif
list_move(&s->list, release);
return 0;
}
-static void do_kmem_cache_release(struct list_head *release,
- bool need_rcu_barrier)
+static void release_caches(struct list_head *release, bool need_rcu_barrier)
{
struct kmem_cache *s, *s2;
@@ -536,10 +526,10 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
if (!cache_name)
goto out_unlock;
- s = do_kmem_cache_create(cache_name, root_cache->object_size,
- root_cache->size, root_cache->align,
- root_cache->flags, root_cache->ctor,
- memcg, root_cache);
+ s = create_cache(cache_name, root_cache->object_size,
+ root_cache->size, root_cache->align,
+ root_cache->flags, root_cache->ctor,
+ memcg, root_cache);
/*
* If we could not create a memcg cache, do not complain, because
* that's not critical at all as we can always proceed with the root
@@ -598,6 +588,18 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
put_online_cpus();
}
+static int __shutdown_memcg_cache(struct kmem_cache *s,
+ struct list_head *release, bool *need_rcu_barrier)
+{
+ BUG_ON(is_root_cache(s));
+
+ if (shutdown_cache(s, release, need_rcu_barrier))
+ return -EBUSY;
+
+ list_del(&s->memcg_params.list);
+ return 0;
+}
+
void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
{
LIST_HEAD(release);
@@ -615,14 +617,76 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
* The cgroup is about to be freed and therefore has no charges
* left. Hence, all its caches must be empty by now.
*/
- BUG_ON(do_kmem_cache_shutdown(s, &release, &need_rcu_barrier));
+ BUG_ON(__shutdown_memcg_cache(s, &release, &need_rcu_barrier));
}
mutex_unlock(&slab_mutex);
put_online_mems();
put_online_cpus();
- do_kmem_cache_release(&release, need_rcu_barrier);
+ release_caches(&release, need_rcu_barrier);
+}
+
+static int shutdown_memcg_caches(struct kmem_cache *s,
+ struct list_head *release, bool *need_rcu_barrier)
+{
+ struct memcg_cache_array *arr;
+ struct kmem_cache *c, *c2;
+ LIST_HEAD(busy);
+ int i;
+
+ BUG_ON(!is_root_cache(s));
+
+ /*
+ * First, shutdown active caches, i.e. caches that belong to online
+ * memory cgroups.
+ */
+ arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
+ lockdep_is_held(&slab_mutex));
+ for_each_memcg_cache_index(i) {
+ c = arr->entries[i];
+ if (!c)
+ continue;
+ if (__shutdown_memcg_cache(c, release, need_rcu_barrier))
+ /*
+ * The cache still has objects. Move it to a temporary
+ * list so as not to try to destroy it for a second
+ * time while iterating over inactive caches below.
+ */
+ list_move(&c->memcg_params.list, &busy);
+ else
+ /*
+ * The cache is empty and will be destroyed soon. Clear
+ * the pointer to it in the memcg_caches array so that
+ * it will never be accessed even if the root cache
+ * stays alive.
+ */
+ arr->entries[i] = NULL;
+ }
+
+ /*
+ * Second, shutdown all caches left from memory cgroups that are now
+ * offline.
+ */
+ list_for_each_entry_safe(c, c2, &s->memcg_params.list,
+ memcg_params.list)
+ __shutdown_memcg_cache(c, release, need_rcu_barrier);
+
+ list_splice(&busy, &s->memcg_params.list);
+
+ /*
+ * A cache being destroyed must be empty. In particular, this means
+ * that all per memcg caches attached to it must be empty too.
+ */
+ if (!list_empty(&s->memcg_params.list))
+ return -EBUSY;
+ return 0;
+}
+#else
+static inline int shutdown_memcg_caches(struct kmem_cache *s,
+ struct list_head *release, bool *need_rcu_barrier)
+{
+ return 0;
}
#endif /* CONFIG_MEMCG_KMEM */
@@ -635,16 +699,13 @@ void slab_kmem_cache_release(struct kmem_cache *s)
void kmem_cache_destroy(struct kmem_cache *s)
{
- struct kmem_cache *c, *c2;
LIST_HEAD(release);
bool need_rcu_barrier = false;
- bool busy = false;
+ int err;
if (unlikely(!s))
return;
- BUG_ON(!is_root_cache(s));
-
get_online_cpus();
get_online_mems();
@@ -654,21 +715,22 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (s->refcount)
goto out_unlock;
- for_each_memcg_cache_safe(c, c2, s) {
- if (do_kmem_cache_shutdown(c, &release, &need_rcu_barrier))
- busy = true;
- }
-
- if (!busy)
- do_kmem_cache_shutdown(s, &release, &need_rcu_barrier);
+ err = shutdown_memcg_caches(s, &release, &need_rcu_barrier);
+ if (!err)
+ err = shutdown_cache(s, &release, &need_rcu_barrier);
+ if (err) {
+ pr_err("kmem_cache_destroy %s: "
+ "Slab cache still has objects\n", s->name);
+ dump_stack();
+ }
out_unlock:
mutex_unlock(&slab_mutex);
put_online_mems();
put_online_cpus();
- do_kmem_cache_release(&release, need_rcu_barrier);
+ release_caches(&release, need_rcu_barrier);
}
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -692,7 +754,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
}
EXPORT_SYMBOL(kmem_cache_shrink);
-int slab_is_available(void)
+bool slab_is_available(void)
{
return slab_state >= UP;
}
diff --git a/mm/slub.c b/mm/slub.c
index f614b5dc396b..75a5fa92ac2a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -459,8 +459,10 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
/*
* Debug settings:
*/
-#ifdef CONFIG_SLUB_DEBUG_ON
+#if defined(CONFIG_SLUB_DEBUG_ON)
static int slub_debug = DEBUG_DEFAULT_FLAGS;
+#elif defined(CONFIG_KASAN)
+static int slub_debug = SLAB_STORE_USER;
#else
static int slub_debug;
#endif
@@ -1328,16 +1330,15 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
flags |= __GFP_NOTRACK;
- if (memcg_charge_slab(s, flags, order))
- return NULL;
-
if (node == NUMA_NO_NODE)
page = alloc_pages(flags, order);
else
page = __alloc_pages_node(node, flags, order);
- if (!page)
- memcg_uncharge_slab(s, order);
+ if (page && memcg_charge_slab(page, flags, order, s)) {
+ __free_pages(page, order);
+ page = NULL;
+ }
return page;
}
@@ -1476,8 +1477,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
page_mapcount_reset(page);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
- __free_pages(page, order);
- memcg_uncharge_slab(s, order);
+ __free_kmem_pages(page, order);
}
#define need_reserve_slab_rcu \
@@ -2912,20 +2912,15 @@ static inline int slab_order(int size, int min_objects,
if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
return get_order(size * MAX_OBJS_PER_PAGE) - 1;
- for (order = max(min_order,
- fls(min_objects * size - 1) - PAGE_SHIFT);
+ for (order = max(min_order, get_order(min_objects * size + reserved));
order <= max_order; order++) {
unsigned long slab_size = PAGE_SIZE << order;
- if (slab_size < min_objects * size + reserved)
- continue;
-
rem = (slab_size - reserved) % size;
if (rem <= slab_size / fract_leftover)
break;
-
}
return order;
@@ -2943,7 +2938,7 @@ static inline int calculate_order(int size, int reserved)
* works by first attempting to generate a layout with
* the best configuration and backing off gradually.
*
- * First we reduce the acceptable waste in a slab. Then
+ * First we increase the acceptable waste in a slab. Then
* we reduce the minimum objects required in a slab.
*/
min_objects = slub_min_objects;
diff --git a/mm/util.c b/mm/util.c
index 68ff8a5361e7..9af1c12b310c 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -309,7 +309,7 @@ unsigned long vm_mmap(struct file *file, unsigned long addr,
{
if (unlikely(offset + PAGE_ALIGN(len) < offset))
return -EINVAL;
- if (unlikely(offset & ~PAGE_MASK))
+ if (unlikely(offset_in_page(offset)))
return -EINVAL;
return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
diff --git a/mm/vmacache.c b/mm/vmacache.c
index b6e3662fe339..fd09dc9c6812 100644
--- a/mm/vmacache.c
+++ b/mm/vmacache.c
@@ -52,7 +52,7 @@ void vmacache_flush_all(struct mm_struct *mm)
* Also handle the case where a kernel thread has adopted this mm via use_mm().
* That kernel thread's vmacache is not applicable to this mm.
*/
-static bool vmacache_valid_mm(struct mm_struct *mm)
+static inline bool vmacache_valid_mm(struct mm_struct *mm)
{
return current->mm == mm && !(current->flags & PF_KTHREAD);
}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index af3a519e40c2..9db9ef5e8481 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -358,7 +358,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
struct vmap_area *first;
BUG_ON(!size);
- BUG_ON(size & ~PAGE_MASK);
+ BUG_ON(offset_in_page(size));
BUG_ON(!is_power_of_2(align));
va = kmalloc_node(sizeof(struct vmap_area),
@@ -936,7 +936,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
void *vaddr = NULL;
unsigned int order;
- BUG_ON(size & ~PAGE_MASK);
+ BUG_ON(offset_in_page(size));
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
if (WARN_ON(size == 0)) {
/*
@@ -989,7 +989,7 @@ static void vb_free(const void *addr, unsigned long size)
unsigned int order;
struct vmap_block *vb;
- BUG_ON(size & ~PAGE_MASK);
+ BUG_ON(offset_in_page(size));
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);
@@ -1902,7 +1902,7 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
while (count) {
unsigned long offset, length;
- offset = (unsigned long)addr & ~PAGE_MASK;
+ offset = offset_in_page(addr);
length = PAGE_SIZE - offset;
if (length > count)
length = count;
@@ -1941,7 +1941,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count)
while (count) {
unsigned long offset, length;
- offset = (unsigned long)addr & ~PAGE_MASK;
+ offset = offset_in_page(addr);
length = PAGE_SIZE - offset;
if (length > count)
length = count;
@@ -2392,7 +2392,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
bool purged = false;
/* verify parameters and allocate data structures */
- BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
+ BUG_ON(offset_in_page(align) || !is_power_of_2(align));
for (last_area = 0, area = 0; area < nr_vms; area++) {
start = offsets[area];
end = start + sizes[area];
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e7057af54b6e..55721b619aee 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -194,7 +194,7 @@ static bool sane_reclaim(struct scan_control *sc)
static unsigned long zone_reclaimable_pages(struct zone *zone)
{
- int nr;
+ unsigned long nr;
nr = zone_page_state(zone, NR_ACTIVE_FILE) +
zone_page_state(zone, NR_INACTIVE_FILE);
@@ -1859,17 +1859,14 @@ static void shrink_active_list(unsigned long nr_to_scan,
}
#ifdef CONFIG_SWAP
-static int inactive_anon_is_low_global(struct zone *zone)
+static bool inactive_anon_is_low_global(struct zone *zone)
{
unsigned long active, inactive;
active = zone_page_state(zone, NR_ACTIVE_ANON);
inactive = zone_page_state(zone, NR_INACTIVE_ANON);
- if (inactive * zone->inactive_ratio < active)
- return 1;
-
- return 0;
+ return inactive * zone->inactive_ratio < active;
}
/**
@@ -1879,14 +1876,14 @@ static int inactive_anon_is_low_global(struct zone *zone)
* Returns true if the zone does not have enough inactive anon pages,
* meaning some active anon pages need to be deactivated.
*/
-static int inactive_anon_is_low(struct lruvec *lruvec)
+static bool inactive_anon_is_low(struct lruvec *lruvec)
{
/*
* If we don't have swap space, anonymous page deactivation
* is pointless.
*/
if (!total_swap_pages)
- return 0;
+ return false;
if (!mem_cgroup_disabled())
return mem_cgroup_inactive_anon_is_low(lruvec);
@@ -1894,9 +1891,9 @@ static int inactive_anon_is_low(struct lruvec *lruvec)
return inactive_anon_is_low_global(lruvec_zone(lruvec));
}
#else
-static inline int inactive_anon_is_low(struct lruvec *lruvec)
+static inline bool inactive_anon_is_low(struct lruvec *lruvec)
{
- return 0;
+ return false;
}
#endif
@@ -1914,7 +1911,7 @@ static inline int inactive_anon_is_low(struct lruvec *lruvec)
* This uses a different ratio than the anonymous pages, because
* the page cache uses a use-once replacement algorithm.
*/
-static int inactive_file_is_low(struct lruvec *lruvec)
+static bool inactive_file_is_low(struct lruvec *lruvec)
{
unsigned long inactive;
unsigned long active;
@@ -1925,7 +1922,7 @@ static int inactive_file_is_low(struct lruvec *lruvec)
return active > inactive;
}
-static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
+static bool inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
{
if (is_file_lru(lru))
return inactive_file_is_low(lruvec);
@@ -3696,10 +3693,10 @@ static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
}
/* Work out how many page cache pages we can reclaim in this reclaim_mode */
-static long zone_pagecache_reclaimable(struct zone *zone)
+static unsigned long zone_pagecache_reclaimable(struct zone *zone)
{
- long nr_pagecache_reclaimable;
- long delta = 0;
+ unsigned long nr_pagecache_reclaimable;
+ unsigned long delta = 0;
/*
* If RECLAIM_UNMAP is set, then all file pages are considered
diff --git a/mm/vmstat.c b/mm/vmstat.c
index fbf14485a049..ffcb4f58bf3e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -591,6 +591,28 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
else
__inc_zone_state(z, NUMA_OTHER);
}
+
+/*
+ * Determine the per node value of a stat item.
+ */
+unsigned long node_page_state(int node, enum zone_stat_item item)
+{
+ struct zone *zones = NODE_DATA(node)->node_zones;
+
+ return
+#ifdef CONFIG_ZONE_DMA
+ zone_page_state(&zones[ZONE_DMA], item) +
+#endif
+#ifdef CONFIG_ZONE_DMA32
+ zone_page_state(&zones[ZONE_DMA32], item) +
+#endif
+#ifdef CONFIG_HIGHMEM
+ zone_page_state(&zones[ZONE_HIGHMEM], item) +
+#endif
+ zone_page_state(&zones[ZONE_NORMAL], item) +
+ zone_page_state(&zones[ZONE_MOVABLE], item);
+}
+
#endif
#ifdef CONFIG_COMPACTION