diff options
author | Balbir Singh <balbir@linux.vnet.ibm.com> | 2008-02-07 00:13:56 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2008-02-07 08:42:18 -0800 |
commit | 66e1707bc34609f626e2e7b4fe7e454c9748bad5 (patch) | |
tree | d850a729887485874c976ba64eb85e3406e488a1 /mm/memcontrol.c | |
parent | 67e465a77ba658635309ee00b367bec6555ea544 (diff) | |
download | lwn-66e1707bc34609f626e2e7b4fe7e454c9748bad5.tar.gz lwn-66e1707bc34609f626e2e7b4fe7e454c9748bad5.zip |
Memory controller: add per cgroup LRU and reclaim
Add the page_cgroup to the per cgroup LRU. The reclaim algorithm has
been modified to make the isolate_lru_pages() as a pluggable component. The
scan_control data structure now accepts the cgroup on behalf of which
reclaims are carried out. try_to_free_pages() has been extended to become
cgroup aware.
[akpm@linux-foundation.org: fix warning]
[Lee.Schermerhorn@hp.com: initialize all scan_control's isolate_pages member]
[bunk@kernel.org: make do_try_to_free_pages() static]
[hugh@veritas.com: memcgroup: fix try_to_free order]
[kamezawa.hiroyu@jp.fujitsu.com: this unlock_page_cgroup() is unnecessary]
Signed-off-by: Pavel Emelianov <xemul@openvz.org>
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Paul Menage <menage@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Kirill Korotaev <dev@sw.ru>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: David Rientjes <rientjes@google.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 148 |
1 files changed, 143 insertions, 5 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b25df2a9d024..9e9ff914c0f1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -22,10 +22,15 @@ #include <linux/cgroup.h> #include <linux/mm.h> #include <linux/page-flags.h> +#include <linux/backing-dev.h> #include <linux/bit_spinlock.h> #include <linux/rcupdate.h> +#include <linux/swap.h> +#include <linux/spinlock.h> +#include <linux/fs.h> struct cgroup_subsys mem_cgroup_subsys; +static const int MEM_CGROUP_RECLAIM_RETRIES = 5; /* * The memory controller data structure. The memory controller controls both @@ -51,6 +56,10 @@ struct mem_cgroup { */ struct list_head active_list; struct list_head inactive_list; + /* + * spin_lock to protect the per cgroup LRU + */ + spinlock_t lru_lock; }; /* @@ -141,6 +150,94 @@ void __always_inline unlock_page_cgroup(struct page *page) bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); } +void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active) +{ + if (active) + list_move(&pc->lru, &pc->mem_cgroup->active_list); + else + list_move(&pc->lru, &pc->mem_cgroup->inactive_list); +} + +/* + * This routine assumes that the appropriate zone's lru lock is already held + */ +void mem_cgroup_move_lists(struct page_cgroup *pc, bool active) +{ + struct mem_cgroup *mem; + if (!pc) + return; + + mem = pc->mem_cgroup; + + spin_lock(&mem->lru_lock); + __mem_cgroup_move_lists(pc, active); + spin_unlock(&mem->lru_lock); +} + +unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, + struct list_head *dst, + unsigned long *scanned, int order, + int mode, struct zone *z, + struct mem_cgroup *mem_cont, + int active) +{ + unsigned long nr_taken = 0; + struct page *page; + unsigned long scan; + LIST_HEAD(pc_list); + struct list_head *src; + struct page_cgroup *pc; + + if (active) + src = &mem_cont->active_list; + else + src = &mem_cont->inactive_list; + + spin_lock(&mem_cont->lru_lock); + for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { + pc = list_entry(src->prev, struct page_cgroup, lru); + page = pc->page; + VM_BUG_ON(!pc); + + if (PageActive(page) && !active) { + __mem_cgroup_move_lists(pc, true); + scan--; + continue; + } + if (!PageActive(page) && active) { + __mem_cgroup_move_lists(pc, false); + scan--; + continue; + } + + /* + * Reclaim, per zone + * TODO: make the active/inactive lists per zone + */ + if (page_zone(page) != z) + continue; + + /* + * Check if the meta page went away from under us + */ + if (!list_empty(&pc->lru)) + list_move(&pc->lru, &pc_list); + else + continue; + + if (__isolate_lru_page(page, mode) == 0) { + list_move(&page->lru, dst); + nr_taken++; + } + } + + list_splice(&pc_list, src); + spin_unlock(&mem_cont->lru_lock); + + *scanned = scan; + return nr_taken; +} + /* * Charge the memory controller for page usage. * Return @@ -151,6 +248,8 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm) { struct mem_cgroup *mem; struct page_cgroup *pc, *race_pc; + unsigned long flags; + unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; /* * Should page_cgroup's go to their own slab? @@ -159,14 +258,20 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm) * to see if the cgroup page already has a page_cgroup associated * with it */ +retry: lock_page_cgroup(page); pc = page_get_page_cgroup(page); /* * The page_cgroup exists and the page has already been accounted */ if (pc) { - atomic_inc(&pc->ref_cnt); - goto done; + if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) { + /* this page is under being uncharged ? */ + unlock_page_cgroup(page); + cpu_relax(); + goto retry; + } else + goto done; } unlock_page_cgroup(page); @@ -197,7 +302,32 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm) * If we created the page_cgroup, we should free it on exceeding * the cgroup limit. */ - if (res_counter_charge(&mem->res, 1)) { + while (res_counter_charge(&mem->res, 1)) { + if (try_to_free_mem_cgroup_pages(mem)) + continue; + + /* + * try_to_free_mem_cgroup_pages() might not give us a full + * picture of reclaim. Some pages are reclaimed and might be + * moved to swap cache or just unmapped from the cgroup. + * Check the limit again to see if the reclaim reduced the + * current usage of the cgroup before giving up + */ + if (res_counter_check_under_limit(&mem->res)) + continue; + /* + * Since we control both RSS and cache, we end up with a + * very interesting scenario where we end up reclaiming + * memory (essentially RSS), since the memory is pushed + * to swap cache, we eventually end up adding those + * pages back to our list. Hence we give ourselves a + * few chances before we fail + */ + else if (nr_retries--) { + congestion_wait(WRITE, HZ/10); + continue; + } + css_put(&mem->css); goto free_pc; } @@ -221,14 +351,16 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm) pc->page = page; page_assign_page_cgroup(page, pc); + spin_lock_irqsave(&mem->lru_lock, flags); + list_add(&pc->lru, &mem->active_list); + spin_unlock_irqrestore(&mem->lru_lock, flags); + done: unlock_page_cgroup(page); return 0; free_pc: kfree(pc); - return -ENOMEM; err: - unlock_page_cgroup(page); return -ENOMEM; } @@ -240,6 +372,7 @@ void mem_cgroup_uncharge(struct page_cgroup *pc) { struct mem_cgroup *mem; struct page *page; + unsigned long flags; if (!pc) return; @@ -252,6 +385,10 @@ void mem_cgroup_uncharge(struct page_cgroup *pc) page_assign_page_cgroup(page, NULL); unlock_page_cgroup(page); res_counter_uncharge(&mem->res, 1); + + spin_lock_irqsave(&mem->lru_lock, flags); + list_del_init(&pc->lru); + spin_unlock_irqrestore(&mem->lru_lock, flags); kfree(pc); } } @@ -310,6 +447,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) res_counter_init(&mem->res); INIT_LIST_HEAD(&mem->active_list); INIT_LIST_HEAD(&mem->inactive_list); + spin_lock_init(&mem->lru_lock); return &mem->css; } |