summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorJames Morris <jmorris@macbook.(none)>2009-12-03 12:03:40 +0530
committerJames Morris <jmorris@macbook.(none)>2009-12-03 12:03:40 +0530
commitc84d6efd363a3948eb32ec40d46bab6338580454 (patch)
tree3ba7ac46e6626fe8ac843834588609eb6ccee5c6 /mm
parent7539cf4b92be4aecc573ea962135f246a7a33401 (diff)
parent22763c5cf3690a681551162c15d34d935308c8d7 (diff)
downloadlwn-c84d6efd363a3948eb32ec40d46bab6338580454.tar.gz
lwn-c84d6efd363a3948eb32ec40d46bab6338580454.zip
Merge branch 'master' into next
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig13
-rw-r--r--mm/backing-dev.c27
-rw-r--r--mm/highmem.c17
-rw-r--r--mm/kmemleak.c5
-rw-r--r--mm/ksm.c11
-rw-r--r--mm/memcontrol.c127
-rw-r--r--mm/memory-failure.c59
-rw-r--r--mm/memory.c14
-rw-r--r--mm/memory_hotplug.c24
-rw-r--r--mm/mempolicy.c13
-rw-r--r--mm/migrate.c2
-rw-r--r--mm/nommu.c6
-rw-r--r--mm/page-writeback.c3
-rw-r--r--mm/page_alloc.c7
-rw-r--r--mm/percpu.c217
-rw-r--r--mm/rmap.c4
-rw-r--r--mm/swapfile.c15
-rw-r--r--mm/vmalloc.c50
-rw-r--r--mm/vmscan.c14
19 files changed, 390 insertions, 238 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index edd300aca173..44cf6f0a3a6d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -67,7 +67,7 @@ config DISCONTIGMEM
config SPARSEMEM
def_bool y
- depends on SPARSEMEM_MANUAL
+ depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL
config FLATMEM
def_bool y
@@ -128,11 +128,8 @@ config SPARSEMEM_VMEMMAP
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
depends on SPARSEMEM || X86_64_ACPI_NUMA
- depends on HOTPLUG && !(HIBERNATION && !S390) && ARCH_ENABLE_MEMORY_HOTPLUG
- depends on (IA64 || X86 || PPC64 || SUPERH || S390)
-
-comment "Memory hotplug is currently incompatible with Software Suspend"
- depends on SPARSEMEM && HOTPLUG && HIBERNATION && !S390
+ depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
+ depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
config MEMORY_HOTPLUG_SPARSE
def_bool y
@@ -224,7 +221,9 @@ config KSM
the many instances by a single resident page with that content, so
saving memory until one or another app needs to modify the content.
Recommended for use with KVM, or with other duplicative applications.
- See Documentation/vm/ksm.txt for more information.
+ See Documentation/vm/ksm.txt for more information: KSM is inactive
+ until a program has madvised that an area is MADV_MERGEABLE, and
+ root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
config DEFAULT_MMAP_MIN_ADDR
int "Low address space to protect from user allocation"
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 3d3accb1f800..67a33a5a1a93 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -92,7 +92,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
"BdiDirtyThresh: %8lu kB\n"
"DirtyThresh: %8lu kB\n"
"BackgroundThresh: %8lu kB\n"
- "WriteBack threads:%8lu\n"
+ "WritebackThreads: %8lu\n"
"b_dirty: %8lu\n"
"b_io: %8lu\n"
"b_more_io: %8lu\n"
@@ -604,15 +604,36 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
/*
* Finally, kill the kernel threads. We don't need to be RCU
- * safe anymore, since the bdi is gone from visibility.
+ * safe anymore, since the bdi is gone from visibility. Force
+ * unfreeze of the thread before calling kthread_stop(), otherwise
+ * it would never exet if it is currently stuck in the refrigerator.
*/
- list_for_each_entry(wb, &bdi->wb_list, list)
+ list_for_each_entry(wb, &bdi->wb_list, list) {
+ wb->task->flags &= ~PF_FROZEN;
kthread_stop(wb->task);
+ }
+}
+
+/*
+ * This bdi is going away now, make sure that no super_blocks point to it
+ */
+static void bdi_prune_sb(struct backing_dev_info *bdi)
+{
+ struct super_block *sb;
+
+ spin_lock(&sb_lock);
+ list_for_each_entry(sb, &super_blocks, s_list) {
+ if (sb->s_bdi == bdi)
+ sb->s_bdi = NULL;
+ }
+ spin_unlock(&sb_lock);
}
void bdi_unregister(struct backing_dev_info *bdi)
{
if (bdi->dev) {
+ bdi_prune_sb(bdi);
+
if (!bdi_cap_flush_forker(bdi))
bdi_wb_shutdown(bdi);
bdi_debug_unregister(bdi);
diff --git a/mm/highmem.c b/mm/highmem.c
index 25878cc49daa..9c1e627f282e 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -426,16 +426,21 @@ void __init page_address_init(void)
void debug_kmap_atomic(enum km_type type)
{
- static unsigned warn_count = 10;
+ static int warn_count = 10;
- if (unlikely(warn_count == 0))
+ if (unlikely(warn_count < 0))
return;
if (unlikely(in_interrupt())) {
- if (in_irq()) {
+ if (in_nmi()) {
+ if (type != KM_NMI && type != KM_NMI_PTE) {
+ WARN_ON(1);
+ warn_count--;
+ }
+ } else if (in_irq()) {
if (type != KM_IRQ0 && type != KM_IRQ1 &&
type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
- type != KM_BOUNCE_READ) {
+ type != KM_BOUNCE_READ && type != KM_IRQ_PTE) {
WARN_ON(1);
warn_count--;
}
@@ -452,7 +457,9 @@ void debug_kmap_atomic(enum km_type type)
}
if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
- type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
+ type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ ||
+ type == KM_IRQ_PTE || type == KM_NMI ||
+ type == KM_NMI_PTE ) {
if (!irqs_disabled()) {
WARN_ON(1);
warn_count--;
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 4ea4510e2996..8bf765c4f58d 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -833,12 +833,15 @@ static void early_alloc(struct early_log *log)
*/
rcu_read_lock();
object = create_object((unsigned long)log->ptr, log->size,
- log->min_count, GFP_KERNEL);
+ log->min_count, GFP_ATOMIC);
+ if (!object)
+ goto out;
spin_lock_irqsave(&object->lock, flags);
for (i = 0; i < log->trace_len; i++)
object->trace[i] = log->trace[i];
object->trace_len = log->trace_len;
spin_unlock_irqrestore(&object->lock, flags);
+out:
rcu_read_unlock();
}
diff --git a/mm/ksm.c b/mm/ksm.c
index f7edac356f46..5575f8628fef 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -184,11 +184,6 @@ static DEFINE_SPINLOCK(ksm_mmlist_lock);
sizeof(struct __struct), __alignof__(struct __struct),\
(__flags), NULL)
-static void __init ksm_init_max_kernel_pages(void)
-{
- ksm_max_kernel_pages = nr_free_buffer_pages() / 4;
-}
-
static int __init ksm_slab_init(void)
{
rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
@@ -1017,6 +1012,7 @@ static struct rmap_item *unstable_tree_search_insert(struct page *page,
struct rmap_item *tree_rmap_item;
int ret;
+ cond_resched();
tree_rmap_item = rb_entry(*new, struct rmap_item, node);
page2[0] = get_mergeable_page(tree_rmap_item);
if (!page2[0])
@@ -1673,7 +1669,7 @@ static int __init ksm_init(void)
struct task_struct *ksm_thread;
int err;
- ksm_init_max_kernel_pages();
+ ksm_max_kernel_pages = totalram_pages / 4;
err = ksm_slab_init();
if (err)
@@ -1697,6 +1693,9 @@ static int __init ksm_init(void)
kthread_stop(ksm_thread);
goto out_free2;
}
+#else
+ ksm_run = KSM_RUN_MERGE; /* no way for user to start it */
+
#endif /* CONFIG_SYSFS */
return 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e2b98a6875c0..f99f5991d6bb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -313,7 +313,8 @@ soft_limit_tree_from_page(struct page *page)
static void
__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
struct mem_cgroup_per_zone *mz,
- struct mem_cgroup_tree_per_zone *mctz)
+ struct mem_cgroup_tree_per_zone *mctz,
+ unsigned long long new_usage_in_excess)
{
struct rb_node **p = &mctz->rb_root.rb_node;
struct rb_node *parent = NULL;
@@ -322,7 +323,9 @@ __mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
if (mz->on_tree)
return;
- mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res);
+ mz->usage_in_excess = new_usage_in_excess;
+ if (!mz->usage_in_excess)
+ return;
while (*p) {
parent = *p;
mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
@@ -353,16 +356,6 @@ __mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
}
static void
-mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
- struct mem_cgroup_per_zone *mz,
- struct mem_cgroup_tree_per_zone *mctz)
-{
- spin_lock(&mctz->lock);
- __mem_cgroup_insert_exceeded(mem, mz, mctz);
- spin_unlock(&mctz->lock);
-}
-
-static void
mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
struct mem_cgroup_per_zone *mz,
struct mem_cgroup_tree_per_zone *mctz)
@@ -392,34 +385,36 @@ static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
{
- unsigned long long prev_usage_in_excess, new_usage_in_excess;
- bool updated_tree = false;
+ unsigned long long excess;
struct mem_cgroup_per_zone *mz;
struct mem_cgroup_tree_per_zone *mctz;
-
- mz = mem_cgroup_zoneinfo(mem, page_to_nid(page), page_zonenum(page));
+ int nid = page_to_nid(page);
+ int zid = page_zonenum(page);
mctz = soft_limit_tree_from_page(page);
/*
- * We do updates in lazy mode, mem's are removed
- * lazily from the per-zone, per-node rb tree
+ * Necessary to update all ancestors when hierarchy is used.
+ * because their event counter is not touched.
*/
- prev_usage_in_excess = mz->usage_in_excess;
-
- new_usage_in_excess = res_counter_soft_limit_excess(&mem->res);
- if (prev_usage_in_excess) {
- mem_cgroup_remove_exceeded(mem, mz, mctz);
- updated_tree = true;
- }
- if (!new_usage_in_excess)
- goto done;
- mem_cgroup_insert_exceeded(mem, mz, mctz);
-
-done:
- if (updated_tree) {
- spin_lock(&mctz->lock);
- mz->usage_in_excess = new_usage_in_excess;
- spin_unlock(&mctz->lock);
+ for (; mem; mem = parent_mem_cgroup(mem)) {
+ mz = mem_cgroup_zoneinfo(mem, nid, zid);
+ excess = res_counter_soft_limit_excess(&mem->res);
+ /*
+ * We have to update the tree if mz is on RB-tree or
+ * mem is over its softlimit.
+ */
+ if (excess || mz->on_tree) {
+ spin_lock(&mctz->lock);
+ /* if on-tree, remove it */
+ if (mz->on_tree)
+ __mem_cgroup_remove_exceeded(mem, mz, mctz);
+ /*
+ * Insert again. mz->usage_in_excess will be updated.
+ * If excess is 0, no tree ops.
+ */
+ __mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
+ spin_unlock(&mctz->lock);
+ }
}
}
@@ -447,9 +442,10 @@ static struct mem_cgroup_per_zone *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
struct rb_node *rightmost = NULL;
- struct mem_cgroup_per_zone *mz = NULL;
+ struct mem_cgroup_per_zone *mz;
retry:
+ mz = NULL;
rightmost = rb_last(&mctz->rb_root);
if (!rightmost)
goto done; /* Nothing to reclaim from */
@@ -1270,9 +1266,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
gfp_t gfp_mask, struct mem_cgroup **memcg,
bool oom, struct page *page)
{
- struct mem_cgroup *mem, *mem_over_limit, *mem_over_soft_limit;
+ struct mem_cgroup *mem, *mem_over_limit;
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
- struct res_counter *fail_res, *soft_fail_res = NULL;
+ struct res_counter *fail_res;
if (unlikely(test_thread_flag(TIF_MEMDIE))) {
/* Don't account this! */
@@ -1304,17 +1300,16 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
if (mem_cgroup_is_root(mem))
goto done;
- ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res,
- &soft_fail_res);
+ ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
if (likely(!ret)) {
if (!do_swap_account)
break;
ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
- &fail_res, NULL);
+ &fail_res);
if (likely(!ret))
break;
/* mem+swap counter fails */
- res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
+ res_counter_uncharge(&mem->res, PAGE_SIZE);
flags |= MEM_CGROUP_RECLAIM_NOSWAP;
mem_over_limit = mem_cgroup_from_res_counter(fail_res,
memsw);
@@ -1353,16 +1348,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
}
}
/*
- * Insert just the ancestor, we should trickle down to the correct
- * cgroup for reclaim, since the other nodes will be below their
- * soft limit
+ * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
+ * if they exceeds softlimit.
*/
- if (soft_fail_res) {
- mem_over_soft_limit =
- mem_cgroup_from_res_counter(soft_fail_res, res);
- if (mem_cgroup_soft_limit_check(mem_over_soft_limit))
- mem_cgroup_update_tree(mem_over_soft_limit, page);
- }
+ if (mem_cgroup_soft_limit_check(mem))
+ mem_cgroup_update_tree(mem, page);
done:
return 0;
nomem:
@@ -1437,10 +1427,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
if (unlikely(PageCgroupUsed(pc))) {
unlock_page_cgroup(pc);
if (!mem_cgroup_is_root(mem)) {
- res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
+ res_counter_uncharge(&mem->res, PAGE_SIZE);
if (do_swap_account)
- res_counter_uncharge(&mem->memsw, PAGE_SIZE,
- NULL);
+ res_counter_uncharge(&mem->memsw, PAGE_SIZE);
}
css_put(&mem->css);
return;
@@ -1519,7 +1508,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
goto out;
if (!mem_cgroup_is_root(from))
- res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
+ res_counter_uncharge(&from->res, PAGE_SIZE);
mem_cgroup_charge_statistics(from, pc, false);
page = pc->page;
@@ -1539,7 +1528,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
}
if (do_swap_account && !mem_cgroup_is_root(from))
- res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL);
+ res_counter_uncharge(&from->memsw, PAGE_SIZE);
css_put(&from->css);
css_get(&to->css);
@@ -1610,9 +1599,9 @@ uncharge:
css_put(&parent->css);
/* uncharge if move fails */
if (!mem_cgroup_is_root(parent)) {
- res_counter_uncharge(&parent->res, PAGE_SIZE, NULL);
+ res_counter_uncharge(&parent->res, PAGE_SIZE);
if (do_swap_account)
- res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL);
+ res_counter_uncharge(&parent->memsw, PAGE_SIZE);
}
return ret;
}
@@ -1803,8 +1792,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
* calling css_tryget
*/
if (!mem_cgroup_is_root(memcg))
- res_counter_uncharge(&memcg->memsw, PAGE_SIZE,
- NULL);
+ res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
mem_cgroup_swap_statistics(memcg, false);
mem_cgroup_put(memcg);
}
@@ -1831,9 +1819,9 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
if (!mem)
return;
if (!mem_cgroup_is_root(mem)) {
- res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
+ res_counter_uncharge(&mem->res, PAGE_SIZE);
if (do_swap_account)
- res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+ res_counter_uncharge(&mem->memsw, PAGE_SIZE);
}
css_put(&mem->css);
}
@@ -1848,7 +1836,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
struct page_cgroup *pc;
struct mem_cgroup *mem = NULL;
struct mem_cgroup_per_zone *mz;
- bool soft_limit_excess = false;
if (mem_cgroup_disabled())
return NULL;
@@ -1888,10 +1875,10 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
}
if (!mem_cgroup_is_root(mem)) {
- res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess);
+ res_counter_uncharge(&mem->res, PAGE_SIZE);
if (do_swap_account &&
(ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
- res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+ res_counter_uncharge(&mem->memsw, PAGE_SIZE);
}
if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
mem_cgroup_swap_statistics(mem, true);
@@ -1908,7 +1895,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
mz = page_cgroup_zoneinfo(pc);
unlock_page_cgroup(pc);
- if (soft_limit_excess && mem_cgroup_soft_limit_check(mem))
+ if (mem_cgroup_soft_limit_check(mem))
mem_cgroup_update_tree(mem, page);
/* at swapout, this memcg will be accessed to record to swap */
if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
@@ -1986,7 +1973,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
* This memcg can be obsolete one. We avoid calling css_tryget
*/
if (!mem_cgroup_is_root(memcg))
- res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
+ res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
mem_cgroup_swap_statistics(memcg, false);
mem_cgroup_put(memcg);
}
@@ -2233,6 +2220,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
unsigned long reclaimed;
int loop = 0;
struct mem_cgroup_tree_per_zone *mctz;
+ unsigned long long excess;
if (order > 0)
return 0;
@@ -2284,9 +2272,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
break;
} while (1);
}
- mz->usage_in_excess =
- res_counter_soft_limit_excess(&mz->mem->res);
__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
+ excess = res_counter_soft_limit_excess(&mz->mem->res);
/*
* One school of thought says that we should not add
* back the node to the tree if reclaim returns 0.
@@ -2295,8 +2282,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
* memory to reclaim from. Consider this as a longer
* term TODO.
*/
- if (mz->usage_in_excess)
- __mem_cgroup_insert_exceeded(mz->mem, mz, mctz);
+ /* If excess == 0, no tree ops */
+ __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
spin_unlock(&mctz->lock);
css_put(&mz->mem->css);
loop++;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 729d4b15b645..dacc64183874 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -35,6 +35,7 @@
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/sched.h>
+#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
@@ -370,9 +371,6 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
int ret = FAILED;
struct address_space *mapping;
- if (!isolate_lru_page(p))
- page_cache_release(p);
-
/*
* For anonymous pages we're done the only reference left
* should be the one m_f() holds.
@@ -498,30 +496,18 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
*/
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
{
- int ret = FAILED;
-
ClearPageDirty(p);
/* Trigger EIO in shmem: */
ClearPageUptodate(p);
- if (!isolate_lru_page(p)) {
- page_cache_release(p);
- ret = DELAYED;
- }
-
- return ret;
+ return DELAYED;
}
static int me_swapcache_clean(struct page *p, unsigned long pfn)
{
- int ret = FAILED;
-
- if (!isolate_lru_page(p)) {
- page_cache_release(p);
- ret = RECOVERED;
- }
delete_from_swap_cache(p);
- return ret;
+
+ return RECOVERED;
}
/*
@@ -611,8 +597,6 @@ static struct page_state {
{ 0, 0, "unknown page state", me_unknown },
};
-#undef lru
-
static void action_result(unsigned long pfn, char *msg, int result)
{
struct page *page = NULL;
@@ -629,13 +613,16 @@ static int page_action(struct page_state *ps, struct page *p,
unsigned long pfn, int ref)
{
int result;
+ int count;
result = ps->action(p, pfn);
action_result(pfn, ps->msg, result);
- if (page_count(p) != 1 + ref)
+
+ count = page_count(p) - 1 - ref;
+ if (count != 0)
printk(KERN_ERR
"MCE %#lx: %s page still referenced by %d users\n",
- pfn, ps->msg, page_count(p) - 1);
+ pfn, ps->msg, count);
/* Could do more checks here if page looks ok */
/*
@@ -661,12 +648,9 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
int i;
int kill = 1;
- if (PageReserved(p) || PageCompound(p) || PageSlab(p))
+ if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p))
return;
- if (!PageLRU(p))
- lru_add_drain_all();
-
/*
* This check implies we don't kill processes if their pages
* are in the swap cache early. Those are always late kills.
@@ -738,6 +722,7 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
int __memory_failure(unsigned long pfn, int trapno, int ref)
{
+ unsigned long lru_flag;
struct page_state *ps;
struct page *p;
int res;
@@ -775,6 +760,24 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
}
/*
+ * We ignore non-LRU pages for good reasons.
+ * - PG_locked is only well defined for LRU pages and a few others
+ * - to avoid races with __set_page_locked()
+ * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
+ * The check (unnecessarily) ignores LRU pages being isolated and
+ * walked by the page reclaim code, however that's not a big loss.
+ */
+ if (!PageLRU(p))
+ lru_add_drain_all();
+ lru_flag = p->flags & lru;
+ if (isolate_lru_page(p)) {
+ action_result(pfn, "non LRU", IGNORED);
+ put_page(p);
+ return -EBUSY;
+ }
+ page_cache_release(p);
+
+ /*
* Lock the page and wait for writeback to finish.
* It's very difficult to mess with pages currently under IO
* and in many cases impossible, so we just avoid it here.
@@ -790,7 +793,7 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
/*
* Torn down by someone else?
*/
- if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
+ if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) {
action_result(pfn, "already truncated LRU", IGNORED);
res = 0;
goto out;
@@ -798,7 +801,7 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
res = -EBUSY;
for (ps = error_states;; ps++) {
- if ((p->flags & ps->mask) == ps->res) {
+ if (((p->flags | lru_flag)& ps->mask) == ps->res) {
res = page_action(ps, p, pfn, ref);
break;
}
diff --git a/mm/memory.c b/mm/memory.c
index 7e91b5f9f690..6ab19dd4a199 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -641,6 +641,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
+ pte_t *orig_src_pte, *orig_dst_pte;
pte_t *src_pte, *dst_pte;
spinlock_t *src_ptl, *dst_ptl;
int progress = 0;
@@ -654,6 +655,8 @@ again:
src_pte = pte_offset_map_nested(src_pmd, addr);
src_ptl = pte_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+ orig_src_pte = src_pte;
+ orig_dst_pte = dst_pte;
arch_enter_lazy_mmu_mode();
do {
@@ -677,9 +680,9 @@ again:
arch_leave_lazy_mmu_mode();
spin_unlock(src_ptl);
- pte_unmap_nested(src_pte - 1);
+ pte_unmap_nested(orig_src_pte);
add_mm_rss(dst_mm, rss[0], rss[1]);
- pte_unmap_unlock(dst_pte - 1, dst_ptl);
+ pte_unmap_unlock(orig_dst_pte, dst_ptl);
cond_resched();
if (addr != end)
goto again;
@@ -1820,10 +1823,10 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
token = pmd_pgtable(*pmd);
do {
- err = fn(pte, token, addr, data);
+ err = fn(pte++, token, addr, data);
if (err)
break;
- } while (pte++, addr += PAGE_SIZE, addr != end);
+ } while (addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
@@ -2539,7 +2542,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
} else if (PageHWPoison(page)) {
ret = VM_FAULT_HWPOISON;
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
- goto out;
+ goto out_release;
}
lock_page(page);
@@ -2611,6 +2614,7 @@ out_nomap:
pte_unmap_unlock(page_table, ptl);
out_page:
unlock_page(page);
+out_release:
page_cache_release(page);
return ret;
}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 821dee596377..2047465cd27c 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -26,6 +26,7 @@
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/pfn.h>
+#include <linux/suspend.h>
#include <asm/tlbflush.h>
@@ -447,7 +448,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
-static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
+/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
+static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
struct pglist_data *pgdat;
unsigned long zones_size[MAX_NR_ZONES] = {0};
@@ -484,14 +486,18 @@ int __ref add_memory(int nid, u64 start, u64 size)
struct resource *res;
int ret;
+ lock_system_sleep();
+
res = register_memory_resource(start, size);
+ ret = -EEXIST;
if (!res)
- return -EEXIST;
+ goto out;
if (!node_online(nid)) {
pgdat = hotadd_new_pgdat(nid, start);
+ ret = -ENOMEM;
if (!pgdat)
- return -ENOMEM;
+ goto out;
new_pgdat = 1;
}
@@ -514,7 +520,8 @@ int __ref add_memory(int nid, u64 start, u64 size)
BUG_ON(ret);
}
- return ret;
+ goto out;
+
error:
/* rollback pgdat allocation and others */
if (new_pgdat)
@@ -522,6 +529,8 @@ error:
if (res)
release_memory_resource(res);
+out:
+ unlock_system_sleep();
return ret;
}
EXPORT_SYMBOL_GPL(add_memory);
@@ -758,6 +767,8 @@ int offline_pages(unsigned long start_pfn,
if (!test_pages_in_a_zone(start_pfn, end_pfn))
return -EINVAL;
+ lock_system_sleep();
+
zone = page_zone(pfn_to_page(start_pfn));
node = zone_to_nid(zone);
nr_pages = end_pfn - start_pfn;
@@ -765,7 +776,7 @@ int offline_pages(unsigned long start_pfn,
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn);
if (ret)
- return ret;
+ goto out;
arg.start_pfn = start_pfn;
arg.nr_pages = nr_pages;
@@ -843,6 +854,7 @@ repeat:
writeback_set_ratelimit();
memory_notify(MEM_OFFLINE, &arg);
+ unlock_system_sleep();
return 0;
failed_removal:
@@ -852,6 +864,8 @@ failed_removal:
/* pushback to free area */
undo_isolate_page_range(start_pfn, end_pfn);
+out:
+ unlock_system_sleep();
return ret;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 7dd9d9f80694..4545d5944243 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1024,7 +1024,7 @@ static long do_mbind(unsigned long start, unsigned long len,
err = migrate_prep();
if (err)
- return err;
+ goto mpol_out;
}
{
NODEMASK_SCRATCH(scratch);
@@ -1039,10 +1039,9 @@ static long do_mbind(unsigned long start, unsigned long len,
err = -ENOMEM;
NODEMASK_SCRATCH_FREE(scratch);
}
- if (err) {
- mpol_put(new);
- return err;
- }
+ if (err)
+ goto mpol_out;
+
vma = check_range(mm, start, end, nmask,
flags | MPOL_MF_INVERT, &pagelist);
@@ -1058,9 +1057,11 @@ static long do_mbind(unsigned long start, unsigned long len,
if (!err && nr_failed && (flags & MPOL_MF_STRICT))
err = -EIO;
- }
+ } else
+ putback_lru_pages(&pagelist);
up_write(&mm->mmap_sem);
+ mpol_out:
mpol_put(new);
return err;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index 1a4bf4813780..7dbcb22316d2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -602,7 +602,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
struct page *newpage = get_new_page(page, private, &result);
int rcu_locked = 0;
int charge = 0;
- struct mem_cgroup *mem;
+ struct mem_cgroup *mem = NULL;
if (!newpage)
return -ENOMEM;
diff --git a/mm/nommu.c b/mm/nommu.c
index 5189b5aed8c0..9876fa0c3ad3 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1362,9 +1362,11 @@ share:
error_just_free:
up_write(&nommu_region_sem);
error:
- fput(region->vm_file);
+ if (region->vm_file)
+ fput(region->vm_file);
kmem_cache_free(vm_region_jar, region);
- fput(vma->vm_file);
+ if (vma->vm_file)
+ fput(vma->vm_file);
if (vma->vm_flags & VM_EXECUTABLE)
removed_exe_file_vma(vma->vm_mm);
kmem_cache_free(vm_area_cachep, vma);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index a3b14090b1fb..2c5d79236ead 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -566,7 +566,8 @@ static void balance_dirty_pages(struct address_space *mapping,
if (pages_written >= write_chunk)
break; /* We've done our duty */
- schedule_timeout_interruptible(pause);
+ __set_current_state(TASK_INTERRUPTIBLE);
+ io_schedule_timeout(pause);
/*
* Increase the delay for each loop, up to our previous
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bf720550b44d..2bc2ac63f41e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1769,7 +1769,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
alloc_flags &= ~ALLOC_CPUSET;
- } else if (unlikely(rt_task(p)))
+ } else if (unlikely(rt_task(p)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
@@ -1817,9 +1817,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
goto nopage;
+restart:
wake_all_kswapd(order, zonelist, high_zoneidx);
-restart:
/*
* OK, we're below the kswapd watermark and have kicked background
* reclaim. Now things get more complex, so set up alloc_flags according
@@ -2183,7 +2183,7 @@ void show_free_areas(void)
printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
" unevictable:%lu"
- " dirty:%lu writeback:%lu unstable:%lu buffer:%lu\n"
+ " dirty:%lu writeback:%lu unstable:%lu\n"
" free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n",
global_page_state(NR_ACTIVE_ANON),
@@ -2196,7 +2196,6 @@ void show_free_areas(void)
global_page_state(NR_FILE_DIRTY),
global_page_state(NR_WRITEBACK),
global_page_state(NR_UNSTABLE_NFS),
- nr_blockdev_pages(),
global_page_state(NR_FREE_PAGES),
global_page_state(NR_SLAB_RECLAIMABLE),
global_page_state(NR_SLAB_UNRECLAIMABLE),
diff --git a/mm/percpu.c b/mm/percpu.c
index 43d8cacfdaa5..5adfc268b408 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -153,7 +153,10 @@ static int pcpu_reserved_chunk_limit;
*
* During allocation, pcpu_alloc_mutex is kept locked all the time and
* pcpu_lock is grabbed and released as necessary. All actual memory
- * allocations are done using GFP_KERNEL with pcpu_lock released.
+ * allocations are done using GFP_KERNEL with pcpu_lock released. In
+ * general, percpu memory can't be allocated with irq off but
+ * irqsave/restore are still used in alloc path so that it can be used
+ * from early init path - sched_init() specifically.
*
* Free path accesses and alters only the index data structures, so it
* can be safely called from atomic context. When memory needs to be
@@ -352,62 +355,86 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
}
/**
- * pcpu_extend_area_map - extend area map for allocation
- * @chunk: target chunk
+ * pcpu_need_to_extend - determine whether chunk area map needs to be extended
+ * @chunk: chunk of interest
*
- * Extend area map of @chunk so that it can accomodate an allocation.
- * A single allocation can split an area into three areas, so this
- * function makes sure that @chunk->map has at least two extra slots.
+ * Determine whether area map of @chunk needs to be extended to
+ * accomodate a new allocation.
*
* CONTEXT:
- * pcpu_alloc_mutex, pcpu_lock. pcpu_lock is released and reacquired
- * if area map is extended.
+ * pcpu_lock.
*
* RETURNS:
- * 0 if noop, 1 if successfully extended, -errno on failure.
+ * New target map allocation length if extension is necessary, 0
+ * otherwise.
*/
-static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
+static int pcpu_need_to_extend(struct pcpu_chunk *chunk)
{
int new_alloc;
- int *new;
- size_t size;
- /* has enough? */
if (chunk->map_alloc >= chunk->map_used + 2)
return 0;
- spin_unlock_irq(&pcpu_lock);
-
new_alloc = PCPU_DFL_MAP_ALLOC;
while (new_alloc < chunk->map_used + 2)
new_alloc *= 2;
- new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
- if (!new) {
- spin_lock_irq(&pcpu_lock);
+ return new_alloc;
+}
+
+/**
+ * pcpu_extend_area_map - extend area map of a chunk
+ * @chunk: chunk of interest
+ * @new_alloc: new target allocation length of the area map
+ *
+ * Extend area map of @chunk to have @new_alloc entries.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation. Grabs and releases pcpu_lock.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
+{
+ int *old = NULL, *new = NULL;
+ size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
+ unsigned long flags;
+
+ new = pcpu_mem_alloc(new_size);
+ if (!new)
return -ENOMEM;
- }
- /*
- * Acquire pcpu_lock and switch to new area map. Only free
- * could have happened inbetween, so map_used couldn't have
- * grown.
- */
- spin_lock_irq(&pcpu_lock);
- BUG_ON(new_alloc < chunk->map_used + 2);
+ /* acquire pcpu_lock and switch to new area map */
+ spin_lock_irqsave(&pcpu_lock, flags);
+
+ if (new_alloc <= chunk->map_alloc)
+ goto out_unlock;
- size = chunk->map_alloc * sizeof(chunk->map[0]);
- memcpy(new, chunk->map, size);
+ old_size = chunk->map_alloc * sizeof(chunk->map[0]);
+ memcpy(new, chunk->map, old_size);
/*
* map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
* one of the first chunks and still using static map.
*/
if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
- pcpu_mem_free(chunk->map, size);
+ old = chunk->map;
chunk->map_alloc = new_alloc;
chunk->map = new;
+ new = NULL;
+
+out_unlock:
+ spin_unlock_irqrestore(&pcpu_lock, flags);
+
+ /*
+ * pcpu_mem_free() might end up calling vfree() which uses
+ * IRQ-unsafe lock and thus can't be called under pcpu_lock.
+ */
+ pcpu_mem_free(old, old_size);
+ pcpu_mem_free(new, new_size);
+
return 0;
}
@@ -1043,8 +1070,11 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
*/
static void *pcpu_alloc(size_t size, size_t align, bool reserved)
{
+ static int warn_limit = 10;
struct pcpu_chunk *chunk;
- int slot, off;
+ const char *err;
+ int slot, off, new_alloc;
+ unsigned long flags;
if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
WARN(true, "illegal size (%zu) or align (%zu) for "
@@ -1053,17 +1083,31 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
}
mutex_lock(&pcpu_alloc_mutex);
- spin_lock_irq(&pcpu_lock);
+ spin_lock_irqsave(&pcpu_lock, flags);
/* serve reserved allocations from the reserved chunk if available */
if (reserved && pcpu_reserved_chunk) {
chunk = pcpu_reserved_chunk;
- if (size > chunk->contig_hint ||
- pcpu_extend_area_map(chunk) < 0)
+
+ if (size > chunk->contig_hint) {
+ err = "alloc from reserved chunk failed";
goto fail_unlock;
+ }
+
+ while ((new_alloc = pcpu_need_to_extend(chunk))) {
+ spin_unlock_irqrestore(&pcpu_lock, flags);
+ if (pcpu_extend_area_map(chunk, new_alloc) < 0) {
+ err = "failed to extend area map of reserved chunk";
+ goto fail_unlock_mutex;
+ }
+ spin_lock_irqsave(&pcpu_lock, flags);
+ }
+
off = pcpu_alloc_area(chunk, size, align);
if (off >= 0)
goto area_found;
+
+ err = "alloc from reserved chunk failed";
goto fail_unlock;
}
@@ -1074,13 +1118,20 @@ restart:
if (size > chunk->contig_hint)
continue;
- switch (pcpu_extend_area_map(chunk)) {
- case 0:
- break;
- case 1:
- goto restart; /* pcpu_lock dropped, restart */
- default:
- goto fail_unlock;
+ new_alloc = pcpu_need_to_extend(chunk);
+ if (new_alloc) {
+ spin_unlock_irqrestore(&pcpu_lock, flags);
+ if (pcpu_extend_area_map(chunk,
+ new_alloc) < 0) {
+ err = "failed to extend area map";
+ goto fail_unlock_mutex;
+ }
+ spin_lock_irqsave(&pcpu_lock, flags);
+ /*
+ * pcpu_lock has been dropped, need to
+ * restart cpu_slot list walking.
+ */
+ goto restart;
}
off = pcpu_alloc_area(chunk, size, align);
@@ -1090,23 +1141,26 @@ restart:
}
/* hmmm... no space left, create a new chunk */
- spin_unlock_irq(&pcpu_lock);
+ spin_unlock_irqrestore(&pcpu_lock, flags);
chunk = alloc_pcpu_chunk();
- if (!chunk)
+ if (!chunk) {
+ err = "failed to allocate new chunk";
goto fail_unlock_mutex;
+ }
- spin_lock_irq(&pcpu_lock);
+ spin_lock_irqsave(&pcpu_lock, flags);
pcpu_chunk_relocate(chunk, -1);
goto restart;
area_found:
- spin_unlock_irq(&pcpu_lock);
+ spin_unlock_irqrestore(&pcpu_lock, flags);
/* populate, map and clear the area */
if (pcpu_populate_chunk(chunk, off, size)) {
- spin_lock_irq(&pcpu_lock);
+ spin_lock_irqsave(&pcpu_lock, flags);
pcpu_free_area(chunk, off);
+ err = "failed to populate";
goto fail_unlock;
}
@@ -1116,9 +1170,16 @@ area_found:
return __addr_to_pcpu_ptr(chunk->base_addr + off);
fail_unlock:
- spin_unlock_irq(&pcpu_lock);
+ spin_unlock_irqrestore(&pcpu_lock, flags);
fail_unlock_mutex:
mutex_unlock(&pcpu_alloc_mutex);
+ if (warn_limit) {
+ pr_warning("PERCPU: allocation failed, size=%zu align=%zu, "
+ "%s\n", size, align, err);
+ dump_stack();
+ if (!--warn_limit)
+ pr_info("PERCPU: limit reached, disable warning\n");
+ }
return NULL;
}
@@ -1347,6 +1408,10 @@ struct pcpu_alloc_info * __init pcpu_build_alloc_info(
struct pcpu_alloc_info *ai;
unsigned int *cpu_map;
+ /* this function may be called multiple times */
+ memset(group_map, 0, sizeof(group_map));
+ memset(group_cnt, 0, sizeof(group_map));
+
/*
* Determine min_unit_size, alloc_size and max_upa such that
* alloc_size is multiple of atom_size and is the smallest
@@ -1574,6 +1639,7 @@ static void pcpu_dump_alloc_info(const char *lvl,
int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
void *base_addr)
{
+ static char cpus_buf[4096] __initdata;
static int smap[2], dmap[2];
size_t dyn_size = ai->dyn_size;
size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
@@ -1585,17 +1651,26 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
int *unit_map;
int group, unit, i;
+ cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask);
+
+#define PCPU_SETUP_BUG_ON(cond) do { \
+ if (unlikely(cond)) { \
+ pr_emerg("PERCPU: failed to initialize, %s", #cond); \
+ pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf); \
+ pcpu_dump_alloc_info(KERN_EMERG, ai); \
+ BUG(); \
+ } \
+} while (0)
+
/* sanity checks */
BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
- BUG_ON(ai->nr_groups <= 0);
- BUG_ON(!ai->static_size);
- BUG_ON(!base_addr);
- BUG_ON(ai->unit_size < size_sum);
- BUG_ON(ai->unit_size & ~PAGE_MASK);
- BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
-
- pcpu_dump_alloc_info(KERN_DEBUG, ai);
+ PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
+ PCPU_SETUP_BUG_ON(!ai->static_size);
+ PCPU_SETUP_BUG_ON(!base_addr);
+ PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
+ PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
+ PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
/* process group information and build config tables accordingly */
group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0]));
@@ -1604,7 +1679,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0]));
for (cpu = 0; cpu < nr_cpu_ids; cpu++)
- unit_map[cpu] = NR_CPUS;
+ unit_map[cpu] = UINT_MAX;
pcpu_first_unit_cpu = NR_CPUS;
for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
@@ -1618,8 +1693,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
if (cpu == NR_CPUS)
continue;
- BUG_ON(cpu > nr_cpu_ids || !cpu_possible(cpu));
- BUG_ON(unit_map[cpu] != NR_CPUS);
+ PCPU_SETUP_BUG_ON(cpu > nr_cpu_ids);
+ PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
+ PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
unit_map[cpu] = unit + i;
unit_off[cpu] = gi->base_offset + i * ai->unit_size;
@@ -1632,7 +1708,11 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
pcpu_nr_units = unit;
for_each_possible_cpu(cpu)
- BUG_ON(unit_map[cpu] == NR_CPUS);
+ PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
+
+ /* we're done parsing the input, undefine BUG macro and dump config */
+#undef PCPU_SETUP_BUG_ON
+ pcpu_dump_alloc_info(KERN_INFO, ai);
pcpu_nr_groups = ai->nr_groups;
pcpu_group_offsets = group_offsets;
@@ -1782,7 +1862,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size,
void *base = (void *)ULONG_MAX;
void **areas = NULL;
struct pcpu_alloc_info *ai;
- size_t size_sum, areas_size;
+ size_t size_sum, areas_size, max_distance;
int group, i, rc;
ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
@@ -1832,8 +1912,25 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size,
}
/* base address is now known, determine group base offsets */
- for (group = 0; group < ai->nr_groups; group++)
+ max_distance = 0;
+ for (group = 0; group < ai->nr_groups; group++) {
ai->groups[group].base_offset = areas[group] - base;
+ max_distance = max_t(size_t, max_distance,
+ ai->groups[group].base_offset);
+ }
+ max_distance += ai->unit_size;
+
+ /* warn if maximum distance is further than 75% of vmalloc space */
+ if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) {
+ pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
+ "space 0x%lx\n",
+ max_distance, VMALLOC_END - VMALLOC_START);
+#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
+ /* and fail if we have fallback */
+ rc = -EINVAL;
+ goto out_free;
+#endif
+ }
pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
diff --git a/mm/rmap.c b/mm/rmap.c
index 28aafe2b5306..dd43373a483f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -242,8 +242,8 @@ vma_address(struct page *page, struct vm_area_struct *vma)
}
/*
- * At what user virtual address is page expected in vma? checking that the
- * page matches the vma: currently only used on anon pages, by unuse_vma;
+ * At what user virtual address is page expected in vma?
+ * checking that the page matches the vma.
*/
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4de7f02f820b..9c590eef7912 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1151,8 +1151,7 @@ static int try_to_unuse(unsigned int type)
} else
retval = unuse_mm(mm, entry, page);
- if (set_start_mm &&
- swap_count(*swap_map) < swcount) {
+ if (set_start_mm && *swap_map < swcount) {
mmput(new_start_mm);
atomic_inc(&mm->mm_users);
new_start_mm = mm;
@@ -1974,12 +1973,14 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
goto bad_swap;
}
- if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
- p->flags |= SWP_SOLIDSTATE;
- p->cluster_next = 1 + (random32() % p->highest_bit);
+ if (p->bdev) {
+ if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
+ p->flags |= SWP_SOLIDSTATE;
+ p->cluster_next = 1 + (random32() % p->highest_bit);
+ }
+ if (discard_swap(p) == 0)
+ p->flags |= SWP_DISCARDABLE;
}
- if (discard_swap(p) == 0)
- p->flags |= SWP_DISCARDABLE;
mutex_lock(&swapon_mutex);
spin_lock(&swap_lock);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 69511e663234..0f551a4a44cd 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -12,6 +12,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/highmem.h>
+#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
@@ -25,10 +26,10 @@
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
-#include <linux/highmem.h>
#include <asm/atomic.h>
#include <asm/uaccess.h>
#include <asm/tlbflush.h>
+#include <asm/shmparam.h>
/*** Page table manipulation functions ***/
@@ -1156,12 +1157,11 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
}
static struct vm_struct *__get_vm_area_node(unsigned long size,
- unsigned long flags, unsigned long start, unsigned long end,
- int node, gfp_t gfp_mask, void *caller)
+ unsigned long align, unsigned long flags, unsigned long start,
+ unsigned long end, int node, gfp_t gfp_mask, void *caller)
{
static struct vmap_area *va;
struct vm_struct *area;
- unsigned long align = 1;
BUG_ON(in_interrupt());
if (flags & VM_IOREMAP) {
@@ -1201,7 +1201,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
unsigned long start, unsigned long end)
{
- return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL,
+ return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
__builtin_return_address(0));
}
EXPORT_SYMBOL_GPL(__get_vm_area);
@@ -1210,7 +1210,7 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
unsigned long start, unsigned long end,
void *caller)
{
- return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL,
+ return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
caller);
}
@@ -1225,22 +1225,22 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
*/
struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
{
- return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END,
+ return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
-1, GFP_KERNEL, __builtin_return_address(0));
}
struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
void *caller)
{
- return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END,
+ return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
-1, GFP_KERNEL, caller);
}
struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
int node, gfp_t gfp_mask)
{
- return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node,
- gfp_mask, __builtin_return_address(0));
+ return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
+ node, gfp_mask, __builtin_return_address(0));
}
static struct vm_struct *find_vm_area(const void *addr)
@@ -1403,7 +1403,8 @@ void *vmap(struct page **pages, unsigned int count,
}
EXPORT_SYMBOL(vmap);
-static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
+static void *__vmalloc_node(unsigned long size, unsigned long align,
+ gfp_t gfp_mask, pgprot_t prot,
int node, void *caller);
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, int node, void *caller)
@@ -1417,7 +1418,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
area->nr_pages = nr_pages;
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
- pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO,
+ pages = __vmalloc_node(array_size, 1, gfp_mask | __GFP_ZERO,
PAGE_KERNEL, node, caller);
area->flags |= VM_VPAGES;
} else {
@@ -1476,6 +1477,7 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
/**
* __vmalloc_node - allocate virtually contiguous memory
* @size: allocation size
+ * @align: desired alignment
* @gfp_mask: flags for the page level allocator
* @prot: protection mask for the allocated pages
* @node: node to use for allocation or -1
@@ -1485,8 +1487,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
* allocator with @gfp_mask flags. Map them into contiguous
* kernel virtual space, using a pagetable protection of @prot.
*/
-static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
- int node, void *caller)
+static void *__vmalloc_node(unsigned long size, unsigned long align,
+ gfp_t gfp_mask, pgprot_t prot,
+ int node, void *caller)
{
struct vm_struct *area;
void *addr;
@@ -1496,8 +1499,8 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
if (!size || (size >> PAGE_SHIFT) > totalram_pages)
return NULL;
- area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END,
- node, gfp_mask, caller);
+ area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START,
+ VMALLOC_END, node, gfp_mask, caller);
if (!area)
return NULL;
@@ -1516,7 +1519,7 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
{
- return __vmalloc_node(size, gfp_mask, prot, -1,
+ return __vmalloc_node(size, 1, gfp_mask, prot, -1,
__builtin_return_address(0));
}
EXPORT_SYMBOL(__vmalloc);
@@ -1532,7 +1535,7 @@ EXPORT_SYMBOL(__vmalloc);
*/
void *vmalloc(unsigned long size)
{
- return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
+ return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
-1, __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc);
@@ -1549,7 +1552,8 @@ void *vmalloc_user(unsigned long size)
struct vm_struct *area;
void *ret;
- ret = __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
+ ret = __vmalloc_node(size, SHMLBA,
+ GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
PAGE_KERNEL, -1, __builtin_return_address(0));
if (ret) {
area = find_vm_area(ret);
@@ -1572,7 +1576,7 @@ EXPORT_SYMBOL(vmalloc_user);
*/
void *vmalloc_node(unsigned long size, int node)
{
- return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
+ return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
node, __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_node);
@@ -1595,7 +1599,7 @@ EXPORT_SYMBOL(vmalloc_node);
void *vmalloc_exec(unsigned long size)
{
- return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
+ return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
-1, __builtin_return_address(0));
}
@@ -1616,7 +1620,7 @@ void *vmalloc_exec(unsigned long size)
*/
void *vmalloc_32(unsigned long size)
{
- return __vmalloc_node(size, GFP_VMALLOC32, PAGE_KERNEL,
+ return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
-1, __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_32);
@@ -1633,7 +1637,7 @@ void *vmalloc_32_user(unsigned long size)
struct vm_struct *area;
void *ret;
- ret = __vmalloc_node(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
+ ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
-1, __builtin_return_address(0));
if (ret) {
area = find_vm_area(ret);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 64e438898832..777af57fd8c8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -544,6 +544,16 @@ redo:
*/
lru = LRU_UNEVICTABLE;
add_page_to_unevictable_list(page);
+ /*
+ * When racing with an mlock clearing (page is
+ * unlocked), make sure that if the other thread does
+ * not observe our setting of PG_lru and fails
+ * isolation, we see PG_mlocked cleared below and move
+ * the page back to the evictable list.
+ *
+ * The other side is TestClearPageMlocked().
+ */
+ smp_mb();
}
/*
@@ -1088,7 +1098,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
int lumpy_reclaim = 0;
while (unlikely(too_many_isolated(zone, file, sc))) {
- congestion_wait(WRITE, HZ/10);
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
/* We are about to die and free our memory. Return now. */
if (fatal_signal_pending(current))
@@ -1356,7 +1366,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
* IO, plus JVM can create lots of anon VM_EXEC pages,
* so we ignore them here.
*/
- if ((vm_flags & VM_EXEC) && !PageAnon(page)) {
+ if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
list_add(&page->lru, &l_active);
continue;
}