summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-02-03 10:10:02 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2016-02-03 10:10:02 -0800
commitb37a05c083c85c2657dca9bbe1f5d79dccf756d5 (patch)
tree0a9bd376a437484e21a6728ca16f2266a0e3e788 /mm
parentd5bfb96bdad3588961f49a6eff89a625fbaa12bf (diff)
parent12c9d70bd5056b3ae84746fca973c286f48384cc (diff)
downloadlwn-b37a05c083c85c2657dca9bbe1f5d79dccf756d5.tar.gz
lwn-b37a05c083c85c2657dca9bbe1f5d79dccf756d5.zip
Merge branch 'akpm' (patches from Andrew)
Merge fixes from Andrew Morton:
 "18 fixes"

[ The 18 fixes turned into 17 commits, because one of the fixes was a
  fix for another patch in the series that I just folded in by editing
  the patch manually - hopefully correctly - Linus ]

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm: fix memory leak in copy_huge_pmd()
  drivers/hwspinlock: fix race between radix tree insertion and lookup
  radix-tree: fix race in gang lookup
  mm/vmpressure.c: fix subtree pressure detection
  mm: polish virtual memory accounting
  mm: warn about VmData over RLIMIT_DATA
  Documentation: cgroup-v2: add memory.stat::sock description
  mm: memcontrol: drop superfluous entry in the per-memcg stats array
  drivers/scsi/sg.c: mark VMA as VM_IO to prevent migration
  proc: revert /proc/<pid>/maps [stack:TID] annotation
  numa: fix /proc/<pid>/numa_maps for hugetlbfs on s390
  MAINTAINERS: update Seth email
  ocfs2/cluster: fix memory leak in o2hb_region_release
  lib/test-string_helpers.c: fix and improve string_get_size() tests
  thp: limit number of object to scan on deferred_split_scan()
  thp: change deferred_split_count() to return number of THP in queue
  thp: make split_queue per-node
Diffstat (limited to 'mm')
-rw-r--r-- mm/huge_memory.c 87
-rw-r--r-- mm/internal.h 31
-rw-r--r-- mm/mmap.c 23
-rw-r--r-- mm/page_alloc.c 5
-rw-r--r-- mm/util.c 27
-rw-r--r-- mm/vmpressure.c 3
6 files changed, 103 insertions, 73 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index fd3a07b3e6f4..36c070167b71 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -138,9 +138,6 @@ static struct khugepaged_scan khugepaged_scan = {
.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};
-static DEFINE_SPINLOCK(split_queue_lock);
-static LIST_HEAD(split_queue);
-static unsigned long split_queue_len;
static struct shrinker deferred_split_shrinker;
static void set_recommended_min_free_kbytes(void)
@@ -861,7 +858,8 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
return false;
entry = mk_pmd(zero_page, vma->vm_page_prot);
entry = pmd_mkhuge(entry);
- pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ if (pgtable)
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
atomic_long_inc(&mm->nr_ptes);
return true;
@@ -1039,13 +1037,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
spinlock_t *dst_ptl, *src_ptl;
struct page *src_page;
pmd_t pmd;
- pgtable_t pgtable;
+ pgtable_t pgtable = NULL;
int ret;
- ret = -ENOMEM;
- pgtable = pte_alloc_one(dst_mm, addr);
- if (unlikely(!pgtable))
- goto out;
+ if (!vma_is_dax(vma)) {
+ ret = -ENOMEM;
+ pgtable = pte_alloc_one(dst_mm, addr);
+ if (unlikely(!pgtable))
+ goto out;
+ }
dst_ptl = pmd_lock(dst_mm, dst_pmd);
src_ptl = pmd_lockptr(src_mm, src_pmd);
@@ -1076,7 +1076,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
goto out_unlock;
}
- if (pmd_trans_huge(pmd)) {
+ if (!vma_is_dax(vma)) {
/* thp accounting separate from pmd_devmap accounting */
src_page = pmd_page(pmd);
VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
@@ -3358,6 +3358,7 @@ int total_mapcount(struct page *page)
int split_huge_page_to_list(struct page *page, struct list_head *list)
{
struct page *head = compound_head(page);
+ struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
struct anon_vma *anon_vma;
int count, mapcount, ret;
bool mlocked;
@@ -3401,19 +3402,19 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
lru_add_drain();
/* Prevent deferred_split_scan() touching ->_count */
- spin_lock_irqsave(&split_queue_lock, flags);
+ spin_lock_irqsave(&pgdata->split_queue_lock, flags);
count = page_count(head);
mapcount = total_mapcount(head);
if (!mapcount && count == 1) {
if (!list_empty(page_deferred_list(head))) {
- split_queue_len--;
+ pgdata->split_queue_len--;
list_del(page_deferred_list(head));
}
- spin_unlock_irqrestore(&split_queue_lock, flags);
+ spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
__split_huge_page(page, list);
ret = 0;
} else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
- spin_unlock_irqrestore(&split_queue_lock, flags);
+ spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
pr_alert("total_mapcount: %u, page_count(): %u\n",
mapcount, count);
if (PageTail(page))
@@ -3421,7 +3422,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
dump_page(page, "total_mapcount(head) > 0");
BUG();
} else {
- spin_unlock_irqrestore(&split_queue_lock, flags);
+ spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
unfreeze_page(anon_vma, head);
ret = -EBUSY;
}
@@ -3436,64 +3437,65 @@ out:
void free_transhuge_page(struct page *page)
{
+ struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
unsigned long flags;
- spin_lock_irqsave(&split_queue_lock, flags);
+ spin_lock_irqsave(&pgdata->split_queue_lock, flags);
if (!list_empty(page_deferred_list(page))) {
- split_queue_len--;
+ pgdata->split_queue_len--;
list_del(page_deferred_list(page));
}
- spin_unlock_irqrestore(&split_queue_lock, flags);
+ spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
free_compound_page(page);
}
void deferred_split_huge_page(struct page *page)
{
+ struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
unsigned long flags;
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- spin_lock_irqsave(&split_queue_lock, flags);
+ spin_lock_irqsave(&pgdata->split_queue_lock, flags);
if (list_empty(page_deferred_list(page))) {
- list_add_tail(page_deferred_list(page), &split_queue);
- split_queue_len++;
+ list_add_tail(page_deferred_list(page), &pgdata->split_queue);
+ pgdata->split_queue_len++;
}
- spin_unlock_irqrestore(&split_queue_lock, flags);
+ spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
}
static unsigned long deferred_split_count(struct shrinker *shrink,
struct shrink_control *sc)
{
- /*
- * Split a page from split_queue will free up at least one page,
- * at most HPAGE_PMD_NR - 1. We don't track exact number.
- * Let's use HPAGE_PMD_NR / 2 as ballpark.
- */
- return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2;
+ struct pglist_data *pgdata = NODE_DATA(sc->nid);
+ return ACCESS_ONCE(pgdata->split_queue_len);
}
static unsigned long deferred_split_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
+ struct pglist_data *pgdata = NODE_DATA(sc->nid);
unsigned long flags;
LIST_HEAD(list), *pos, *next;
struct page *page;
int split = 0;
- spin_lock_irqsave(&split_queue_lock, flags);
- list_splice_init(&split_queue, &list);
-
+ spin_lock_irqsave(&pgdata->split_queue_lock, flags);
/* Take pin on all head pages to avoid freeing them under us */
- list_for_each_safe(pos, next, &list) {
+ list_for_each_safe(pos, next, &pgdata->split_queue) {
page = list_entry((void *)pos, struct page, mapping);
page = compound_head(page);
- /* race with put_compound_page() */
- if (!get_page_unless_zero(page)) {
+ if (get_page_unless_zero(page)) {
+ list_move(page_deferred_list(page), &list);
+ } else {
+ /* We lost race with put_compound_page() */
list_del_init(page_deferred_list(page));
- split_queue_len--;
+ pgdata->split_queue_len--;
}
+ if (!--sc->nr_to_scan)
+ break;
}
- spin_unlock_irqrestore(&split_queue_lock, flags);
+ spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
list_for_each_safe(pos, next, &list) {
page = list_entry((void *)pos, struct page, mapping);
@@ -3505,17 +3507,24 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
put_page(page);
}
- spin_lock_irqsave(&split_queue_lock, flags);
- list_splice_tail(&list, &split_queue);
- spin_unlock_irqrestore(&split_queue_lock, flags);
+ spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+ list_splice_tail(&list, &pgdata->split_queue);
+ spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
- return split * HPAGE_PMD_NR / 2;
+ /*
+ * Stop shrinker if we didn't split any page, but the queue is empty.
+ * This can happen if pages were freed under us.
+ */
+ if (!split && list_empty(&pgdata->split_queue))
+ return SHRINK_STOP;
+ return split;
}
static struct shrinker deferred_split_shrinker = {
.count_objects = deferred_split_count,
.scan_objects = deferred_split_scan,
.seeks = DEFAULT_SEEKS,
+ .flags = SHRINKER_NUMA_AWARE,
};
#ifdef CONFIG_DEBUG_FS
diff --git a/mm/internal.h b/mm/internal.h
index ed8b5ffcf9b1..a38a21ebddb4 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -216,6 +216,37 @@ static inline bool is_cow_mapping(vm_flags_t flags)
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
+/*
+ * These three helpers classify VMAs for virtual memory accounting.
+ */
+
+/*
+ * Executable code area - executable, not writable, not stack
+ */
+static inline bool is_exec_mapping(vm_flags_t flags)
+{
+ return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC;
+}
+
+/*
+ * Stack area - automatically grows in one direction
+ *
+ * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
+ * do_mmap() forbids all other combinations.
+ */
+static inline bool is_stack_mapping(vm_flags_t flags)
+{
+ return (flags & VM_STACK) == VM_STACK;
+}
+
+/*
+ * Data area - private, writable, not stack
+ */
+static inline bool is_data_mapping(vm_flags_t flags)
+{
+ return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE;
+}
+
/* mm/util.c */
void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node *rb_parent);
diff --git a/mm/mmap.c b/mm/mmap.c
index 84b12624ceb0..cfc0cdca421e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -42,6 +42,7 @@
#include <linux/memory.h>
#include <linux/printk.h>
#include <linux/userfaultfd_k.h>
+#include <linux/moduleparam.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -69,6 +70,8 @@ const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
#endif
+static bool ignore_rlimit_data = true;
+core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
@@ -2982,9 +2985,17 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT)
return false;
- if ((flags & (VM_WRITE | VM_SHARED | (VM_STACK_FLAGS &
- (VM_GROWSUP | VM_GROWSDOWN)))) == VM_WRITE)
- return mm->data_vm + npages <= rlimit(RLIMIT_DATA);
+ if (is_data_mapping(flags) &&
+ mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) {
+ if (ignore_rlimit_data)
+ pr_warn_once("%s (%d): VmData %lu exceed data ulimit "
+ "%lu. Will be forbidden soon.\n",
+ current->comm, current->pid,
+ (mm->data_vm + npages) << PAGE_SHIFT,
+ rlimit(RLIMIT_DATA));
+ else
+ return false;
+ }
return true;
}
@@ -2993,11 +3004,11 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
{
mm->total_vm += npages;
- if ((flags & (VM_EXEC | VM_WRITE)) == VM_EXEC)
+ if (is_exec_mapping(flags))
mm->exec_vm += npages;
- else if (flags & (VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN)))
+ else if (is_stack_mapping(flags))
mm->stack_vm += npages;
- else if ((flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
+ else if (is_data_mapping(flags))
mm->data_vm += npages;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 63358d9f9aa9..ea2c4d3e0c03 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5210,6 +5210,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
pgdat->numabalancing_migrate_nr_pages = 0;
pgdat->numabalancing_migrate_next_window = jiffies;
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ spin_lock_init(&pgdat->split_queue_lock);
+ INIT_LIST_HEAD(&pgdat->split_queue);
+ pgdat->split_queue_len = 0;
+#endif
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
pgdat_page_ext_init(pgdat);
diff --git a/mm/util.c b/mm/util.c
index c108a6542d05..4fb14ca5a419 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -230,36 +230,11 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
}
/* Check if the vma is being used as a stack by this task */
-static int vm_is_stack_for_task(struct task_struct *t,
- struct vm_area_struct *vma)
+int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t)
{
return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
}
-/*
- * Check if the vma is being used as a stack.
- * If is_group is non-zero, check in the entire thread group or else
- * just check in the current task. Returns the task_struct of the task
- * that the vma is stack for. Must be called under rcu_read_lock().
- */
-struct task_struct *task_of_stack(struct task_struct *task,
- struct vm_area_struct *vma, bool in_group)
-{
- if (vm_is_stack_for_task(task, vma))
- return task;
-
- if (in_group) {
- struct task_struct *t;
-
- for_each_thread(task, t) {
- if (vm_is_stack_for_task(t, vma))
- return t;
- }
- }
-
- return NULL;
-}
-
#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
void arch_pick_mmap_layout(struct mm_struct *mm)
{
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 9a6c0704211c..149fdf6c5c56 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -248,9 +248,8 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
if (tree) {
spin_lock(&vmpr->sr_lock);
- vmpr->tree_scanned += scanned;
+ scanned = vmpr->tree_scanned += scanned;
vmpr->tree_reclaimed += reclaimed;
- scanned = vmpr->scanned;
spin_unlock(&vmpr->sr_lock);
if (scanned < vmpressure_win)