Diffstat (limited to 'mm')
-rw-r--r-- | mm/backing-dev.c | 157
-rw-r--r-- | mm/cma.c | 4
-rw-r--r-- | mm/compaction.c | 17
-rw-r--r-- | mm/debug_vm_pgtable.c | 4
-rw-r--r-- | mm/filemap.c | 149
-rw-r--r-- | mm/hugetlb.c | 17
-rw-r--r-- | mm/khugepaged.c | 3
-rw-r--r-- | mm/memcontrol.c | 31
-rw-r--r-- | mm/memory.c | 35
-rw-r--r-- | mm/memory_hotplug.c | 13
-rw-r--r-- | mm/migrate.c | 13
-rw-r--r-- | mm/mmap.c | 16
-rw-r--r-- | mm/mremap.c | 23
-rw-r--r-- | mm/nommu.c | 17
-rw-r--r-- | mm/page_alloc.c | 2
-rw-r--r-- | mm/page_io.c | 17
-rw-r--r-- | mm/shmem.c | 2
-rw-r--r-- | mm/slab.h | 4
-rw-r--r-- | mm/slab_common.c | 37
-rw-r--r-- | mm/slub.c | 19
-rw-r--r-- | mm/swap.c | 3
-rw-r--r-- | mm/swap_state.c | 4
-rw-r--r-- | mm/swapfile.c | 2
-rw-r--r-- | mm/vmalloc.c | 21
-rw-r--r-- | mm/vmscan.c | 3
-rw-r--r-- | mm/workingset.c | 46
26 files changed, 330 insertions, 329 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index d382272bcc31..8e8b00627bb2 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -281,7 +281,7 @@ void wb_wakeup_delayed(struct bdi_writeback *wb) #define INIT_BW (100 << (20 - PAGE_SHIFT)) static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, - int blkcg_id, gfp_t gfp) + gfp_t gfp) { int i, err; @@ -308,15 +308,9 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, INIT_DELAYED_WORK(&wb->dwork, wb_workfn); wb->dirty_sleep = jiffies; - wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp); - if (!wb->congested) { - err = -ENOMEM; - goto out_put_bdi; - } - err = fprop_local_init_percpu(&wb->completions, gfp); if (err) - goto out_put_cong; + goto out_put_bdi; for (i = 0; i < NR_WB_STAT_ITEMS; i++) { err = percpu_counter_init(&wb->stat[i], 0, gfp); @@ -330,8 +324,6 @@ out_destroy_stat: while (i--) percpu_counter_destroy(&wb->stat[i]); fprop_local_destroy_percpu(&wb->completions); -out_put_cong: - wb_congested_put(wb->congested); out_put_bdi: if (wb != &bdi->wb) bdi_put(bdi); @@ -374,7 +366,6 @@ static void wb_exit(struct bdi_writeback *wb) percpu_counter_destroy(&wb->stat[i]); fprop_local_destroy_percpu(&wb->completions); - wb_congested_put(wb->congested); if (wb != &wb->bdi->wb) bdi_put(wb->bdi); } @@ -384,99 +375,12 @@ static void wb_exit(struct bdi_writeback *wb) #include <linux/memcontrol.h> /* - * cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree, - * blkcg->cgwb_list, and memcg->cgwb_list. bdi->cgwb_tree is also RCU - * protected. + * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, and memcg->cgwb_list. + * bdi->cgwb_tree is also RCU protected. */ static DEFINE_SPINLOCK(cgwb_lock); static struct workqueue_struct *cgwb_release_wq; -/** - * wb_congested_get_create - get or create a wb_congested - * @bdi: associated bdi - * @blkcg_id: ID of the associated blkcg - * @gfp: allocation mask - * - * Look up the wb_congested for @blkcg_id on @bdi. If missing, create one. - * The returned wb_congested has its reference count incremented. Returns - * NULL on failure. 
- */ -struct bdi_writeback_congested * -wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp) -{ - struct bdi_writeback_congested *new_congested = NULL, *congested; - struct rb_node **node, *parent; - unsigned long flags; -retry: - spin_lock_irqsave(&cgwb_lock, flags); - - node = &bdi->cgwb_congested_tree.rb_node; - parent = NULL; - - while (*node != NULL) { - parent = *node; - congested = rb_entry(parent, struct bdi_writeback_congested, - rb_node); - if (congested->blkcg_id < blkcg_id) - node = &parent->rb_left; - else if (congested->blkcg_id > blkcg_id) - node = &parent->rb_right; - else - goto found; - } - - if (new_congested) { - /* !found and storage for new one already allocated, insert */ - congested = new_congested; - rb_link_node(&congested->rb_node, parent, node); - rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree); - spin_unlock_irqrestore(&cgwb_lock, flags); - return congested; - } - - spin_unlock_irqrestore(&cgwb_lock, flags); - - /* allocate storage for new one and retry */ - new_congested = kzalloc(sizeof(*new_congested), gfp); - if (!new_congested) - return NULL; - - refcount_set(&new_congested->refcnt, 1); - new_congested->__bdi = bdi; - new_congested->blkcg_id = blkcg_id; - goto retry; - -found: - refcount_inc(&congested->refcnt); - spin_unlock_irqrestore(&cgwb_lock, flags); - kfree(new_congested); - return congested; -} - -/** - * wb_congested_put - put a wb_congested - * @congested: wb_congested to put - * - * Put @congested and destroy it if the refcnt reaches zero. - */ -void wb_congested_put(struct bdi_writeback_congested *congested) -{ - unsigned long flags; - - if (!refcount_dec_and_lock_irqsave(&congested->refcnt, &cgwb_lock, &flags)) - return; - - /* bdi might already have been destroyed leaving @congested unlinked */ - if (congested->__bdi) { - rb_erase(&congested->rb_node, - &congested->__bdi->cgwb_congested_tree); - congested->__bdi = NULL; - } - - spin_unlock_irqrestore(&cgwb_lock, flags); - kfree(congested); -} - static void cgwb_release_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(work, struct bdi_writeback, @@ -558,7 +462,7 @@ static int cgwb_create(struct backing_dev_info *bdi, goto out_put; } - ret = wb_init(wb, bdi, blkcg_css->id, gfp); + ret = wb_init(wb, bdi, gfp); if (ret) goto err_free; @@ -696,11 +600,10 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) int ret; INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); - bdi->cgwb_congested_tree = RB_ROOT; mutex_init(&bdi->cgwb_release_mutex); init_rwsem(&bdi->wb_switch_rwsem); - ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); + ret = wb_init(&bdi->wb, bdi, GFP_KERNEL); if (!ret) { bdi->wb.memcg_css = &root_mem_cgroup->css; bdi->wb.blkcg_css = blkcg_root_css; @@ -769,21 +672,6 @@ void wb_blkcg_offline(struct blkcg *blkcg) spin_unlock_irq(&cgwb_lock); } -static void cgwb_bdi_exit(struct backing_dev_info *bdi) -{ - struct rb_node *rbn; - - spin_lock_irq(&cgwb_lock); - while ((rbn = rb_first(&bdi->cgwb_congested_tree))) { - struct bdi_writeback_congested *congested = - rb_entry(rbn, struct bdi_writeback_congested, rb_node); - - rb_erase(rbn, &bdi->cgwb_congested_tree); - congested->__bdi = NULL; /* mark @congested unlinked */ - } - spin_unlock_irq(&cgwb_lock); -} - static void cgwb_bdi_register(struct backing_dev_info *bdi) { spin_lock_irq(&cgwb_lock); @@ -810,29 +698,11 @@ subsys_initcall(cgwb_init); static int cgwb_bdi_init(struct backing_dev_info *bdi) { - int err; - - bdi->wb_congested = kzalloc(sizeof(*bdi->wb_congested), GFP_KERNEL); 
- if (!bdi->wb_congested) - return -ENOMEM; - - refcount_set(&bdi->wb_congested->refcnt, 1); - - err = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); - if (err) { - wb_congested_put(bdi->wb_congested); - return err; - } - return 0; + return wb_init(&bdi->wb, bdi, GFP_KERNEL); } static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { } -static void cgwb_bdi_exit(struct backing_dev_info *bdi) -{ - wb_congested_put(bdi->wb_congested); -} - static void cgwb_bdi_register(struct backing_dev_info *bdi) { list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); @@ -1023,7 +893,6 @@ static void release_bdi(struct kref *ref) bdi_unregister(bdi); WARN_ON_ONCE(bdi->dev); wb_exit(&bdi->wb); - cgwb_bdi_exit(bdi); kfree(bdi); } @@ -1047,29 +916,29 @@ static wait_queue_head_t congestion_wqh[2] = { }; static atomic_t nr_wb_congested[2]; -void clear_wb_congested(struct bdi_writeback_congested *congested, int sync) +void clear_bdi_congested(struct backing_dev_info *bdi, int sync) { wait_queue_head_t *wqh = &congestion_wqh[sync]; enum wb_congested_state bit; bit = sync ? WB_sync_congested : WB_async_congested; - if (test_and_clear_bit(bit, &congested->state)) + if (test_and_clear_bit(bit, &bdi->wb.congested)) atomic_dec(&nr_wb_congested[sync]); smp_mb__after_atomic(); if (waitqueue_active(wqh)) wake_up(wqh); } -EXPORT_SYMBOL(clear_wb_congested); +EXPORT_SYMBOL(clear_bdi_congested); -void set_wb_congested(struct bdi_writeback_congested *congested, int sync) +void set_bdi_congested(struct backing_dev_info *bdi, int sync) { enum wb_congested_state bit; bit = sync ? WB_sync_congested : WB_async_congested; - if (!test_and_set_bit(bit, &congested->state)) + if (!test_and_set_bit(bit, &bdi->wb.congested)) atomic_inc(&nr_wb_congested[sync]); } -EXPORT_SYMBOL(set_wb_congested); +EXPORT_SYMBOL(set_bdi_congested); /** * congestion_wait - wait for a backing_dev to become uncongested @@ -339,13 +339,13 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, */ if (base < highmem_start && limit > highmem_start) { addr = memblock_alloc_range_nid(size, alignment, - highmem_start, limit, nid, false); + highmem_start, limit, nid, true); limit = highmem_start; } if (!addr) { addr = memblock_alloc_range_nid(size, alignment, base, - limit, nid, false); + limit, nid, true); if (!addr) { ret = -ENOMEM; goto err; diff --git a/mm/compaction.c b/mm/compaction.c index fd988b7e5f2b..86375605faa9 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2316,15 +2316,26 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, .page = NULL, }; - current->capture_control = &capc; + /* + * Make sure the structs are really initialized before we expose the + * capture control, in case we are interrupted and the interrupt handler + * frees a page. + */ + barrier(); + WRITE_ONCE(current->capture_control, &capc); ret = compact_zone(&cc, &capc); VM_BUG_ON(!list_empty(&cc.freepages)); VM_BUG_ON(!list_empty(&cc.migratepages)); - *capture = capc.page; - current->capture_control = NULL; + /* + * Make sure we hide capture control first before we read the captured + * page pointer, otherwise an interrupt could free and capture a page + * and we would leak it. 
+ */ + WRITE_ONCE(current->capture_control, NULL); + *capture = READ_ONCE(capc.page); return ret; } diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index e45623016aea..61ab16fb2e36 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -246,13 +246,13 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp, static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep, unsigned long vaddr) { - pte_t pte = READ_ONCE(*ptep); + pte_t pte = ptep_get(ptep); pte = __pte(pte_val(pte) | RANDOM_ORVALUE); set_pte_at(mm, vaddr, ptep, pte); barrier(); pte_clear(mm, vaddr, ptep); - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); WARN_ON(!pte_none(pte)); } diff --git a/mm/filemap.c b/mm/filemap.c index a5b1fa8f7ce4..9f131f1cfde3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -989,15 +989,44 @@ void __init pagecache_init(void) static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { + int ret; struct wait_page_key *key = arg; struct wait_page_queue *wait_page = container_of(wait, struct wait_page_queue, wait); - int ret; - ret = wake_page_match(wait_page, key); - if (ret != 1) - return ret; - return autoremove_wake_function(wait, mode, sync, key); + if (!wake_page_match(wait_page, key)) + return 0; + + /* + * If it's an exclusive wait, we get the bit for it, and + * stop walking if we can't. + * + * If it's a non-exclusive wait, then the fact that this + * wake function was called means that the bit already + * was cleared, and we don't care if somebody then + * re-took it. + */ + ret = 0; + if (wait->flags & WQ_FLAG_EXCLUSIVE) { + if (test_and_set_bit(key->bit_nr, &key->page->flags)) + return -1; + ret = 1; + } + wait->flags |= WQ_FLAG_WOKEN; + + wake_up_state(wait->private, mode); + + /* + * Ok, we have successfully done what we're waiting for, + * and we can unconditionally remove the wait entry. + * + * Note that this has to be the absolute last thing we do, + * since after list_del_init(&wait->entry) the wait entry + * might be de-allocated and the process might even have + * exited. + */ + list_del_init_careful(&wait->entry); + return ret; } static void wake_up_page_bit(struct page *page, int bit_nr) @@ -1076,16 +1105,31 @@ enum behavior { */ }; +/* + * Attempt to check (or get) the page bit, and mark the + * waiter woken if successful. + */ +static inline bool trylock_page_bit_common(struct page *page, int bit_nr, + struct wait_queue_entry *wait) +{ + if (wait->flags & WQ_FLAG_EXCLUSIVE) { + if (test_and_set_bit(bit_nr, &page->flags)) + return false; + } else if (test_bit(bit_nr, &page->flags)) + return false; + + wait->flags |= WQ_FLAG_WOKEN; + return true; +} + static inline int wait_on_page_bit_common(wait_queue_head_t *q, struct page *page, int bit_nr, int state, enum behavior behavior) { struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; - bool bit_is_set; bool thrashing = false; bool delayacct = false; unsigned long pflags; - int ret = 0; if (bit_nr == PG_locked && !PageUptodate(page) && PageWorkingset(page)) { @@ -1103,48 +1147,47 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, wait_page.page = page; wait_page.bit_nr = bit_nr; - for (;;) { - spin_lock_irq(&q->lock); + /* + * Do one last check whether we can get the + * page bit synchronously. 
+ * + * Do the SetPageWaiters() marking before that + * to let any waker we _just_ missed know they + * need to wake us up (otherwise they'll never + * even go to the slow case that looks at the + * page queue), and add ourselves to the wait + * queue if we need to sleep. + * + * This part needs to be done under the queue + * lock to avoid races. + */ + spin_lock_irq(&q->lock); + SetPageWaiters(page); + if (!trylock_page_bit_common(page, bit_nr, wait)) + __add_wait_queue_entry_tail(q, wait); + spin_unlock_irq(&q->lock); - if (likely(list_empty(&wait->entry))) { - __add_wait_queue_entry_tail(q, wait); - SetPageWaiters(page); - } + /* + * From now on, all the logic will be based on + * the WQ_FLAG_WOKEN flag, and the and the page + * bit testing (and setting) will be - or has + * already been - done by the wake function. + * + * We can drop our reference to the page. + */ + if (behavior == DROP) + put_page(page); + for (;;) { set_current_state(state); - spin_unlock_irq(&q->lock); - - bit_is_set = test_bit(bit_nr, &page->flags); - if (behavior == DROP) - put_page(page); - - if (likely(bit_is_set)) - io_schedule(); - - if (behavior == EXCLUSIVE) { - if (!test_and_set_bit_lock(bit_nr, &page->flags)) - break; - } else if (behavior == SHARED) { - if (!test_bit(bit_nr, &page->flags)) - break; - } - - if (signal_pending_state(state, current)) { - ret = -EINTR; + if (signal_pending_state(state, current)) break; - } - if (behavior == DROP) { - /* - * We can no longer safely access page->flags: - * even if CONFIG_MEMORY_HOTREMOVE is not enabled, - * there is a risk of waiting forever on a page reused - * for something that keeps it locked indefinitely. - * But best check for -EINTR above before breaking. - */ + if (wait->flags & WQ_FLAG_WOKEN) break; - } + + io_schedule(); } finish_wait(q, wait); @@ -1163,7 +1206,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, * bother with signals either. */ - return ret; + return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR; } void wait_on_page_bit(struct page *page, int bit_nr) @@ -2044,6 +2087,8 @@ find_page: page = find_get_page(mapping, index); if (!page) { + if (iocb->ki_flags & IOCB_NOIO) + goto would_block; page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); @@ -2052,6 +2097,10 @@ find_page: goto no_cached_page; } if (PageReadahead(page)) { + if (iocb->ki_flags & IOCB_NOIO) { + put_page(page); + goto out; + } page_cache_async_readahead(mapping, ra, filp, page, index, last_index - index); @@ -2185,7 +2234,7 @@ page_not_up_to_date_locked: } readpage: - if (iocb->ki_flags & IOCB_NOWAIT) { + if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) { unlock_page(page); put_page(page); goto would_block; @@ -2279,9 +2328,19 @@ EXPORT_SYMBOL_GPL(generic_file_buffered_read); * * This is the "read_iter()" routine for all filesystems * that can use the page cache directly. + * + * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall + * be returned when no data can be read without waiting for I/O requests + * to complete; it doesn't prevent readahead. + * + * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O + * requests shall be made for the read or for readahead. When no data + * can be read, -EAGAIN shall be returned. When readahead would be + * triggered, a partial, possibly empty read shall be returned. 
+ * * Return: * * number of bytes copied, even for partial reads - * * negative error code if nothing was read + * * negative error code (or 0 if IOCB_NOIO) if nothing was read */ ssize_t generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 57ece74e3aae..590111ea6975 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -45,7 +45,10 @@ int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; +#ifdef CONFIG_CMA static struct cma *hugetlb_cma[MAX_NUMNODES]; +#endif +static unsigned long hugetlb_cma_size __initdata; /* * Minimum page order among possible hugepage sizes, set to a proper value @@ -1235,9 +1238,10 @@ static void free_gigantic_page(struct page *page, unsigned int order) * If the page isn't allocated using the cma allocator, * cma_release() returns false. */ - if (IS_ENABLED(CONFIG_CMA) && - cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order)) +#ifdef CONFIG_CMA + if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order)) return; +#endif free_contig_range(page_to_pfn(page), 1 << order); } @@ -1248,7 +1252,8 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, { unsigned long nr_pages = 1UL << huge_page_order(h); - if (IS_ENABLED(CONFIG_CMA)) { +#ifdef CONFIG_CMA + { struct page *page; int node; @@ -1262,6 +1267,7 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, return page; } } +#endif return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); } @@ -1593,7 +1599,7 @@ static struct address_space *_get_hugetlb_page_mapping(struct page *hpage) /* Use first found vma */ pgoff_start = page_to_pgoff(hpage); - pgoff_end = pgoff_start + hpage_nr_pages(hpage) - 1; + pgoff_end = pgoff_start + pages_per_huge_page(page_hstate(hpage)) - 1; anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff_start, pgoff_end) { struct vm_area_struct *vma = avc->vma; @@ -2571,7 +2577,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) for (i = 0; i < h->max_huge_pages; ++i) { if (hstate_is_gigantic(h)) { - if (IS_ENABLED(CONFIG_CMA) && hugetlb_cma[0]) { + if (hugetlb_cma_size) { pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n"); break; } @@ -5654,7 +5660,6 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) } #ifdef CONFIG_CMA -static unsigned long hugetlb_cma_size __initdata; static bool cma_reserve_called __initdata; static int __init cmdline_parse_hugetlb_cma(char *p) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b043c40a21d4..700f5160f3e4 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -958,6 +958,9 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, return SCAN_ADDRESS_RANGE; if (!hugepage_vma_check(vma, vma->vm_flags)) return SCAN_VMA_CHECK; + /* Anon VMA expected */ + if (!vma->anon_vma || vma->vm_ops) + return SCAN_VMA_CHECK; return 0; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0b38b6ad547d..13f559af1ab6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2772,8 +2772,10 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, return; cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN); - if (!cw) + if (!cw) { + css_put(&memcg->css); return; + } cw->memcg = memcg; cw->cachep = cachep; @@ -5667,7 +5669,6 @@ static void __mem_cgroup_clear_mc(void) if (!mem_cgroup_is_root(mc.to)) page_counter_uncharge(&mc.to->memory, mc.moved_swap); - mem_cgroup_id_get_many(mc.to, 
mc.moved_swap); css_put_many(&mc.to->css, mc.moved_swap); mc.moved_swap = 0; @@ -5858,7 +5859,8 @@ put: /* get_mctgt_type() gets the page */ ent = target.ent; if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { mc.precharge--; - /* we fixup refcnts and charges later. */ + mem_cgroup_id_get_many(mc.to, 1); + /* we fixup other refcnts and charges later. */ mc.moved_swap++; } break; @@ -6360,11 +6362,16 @@ static unsigned long effective_protection(unsigned long usage, * We're using unprotected memory for the weight so that if * some cgroups DO claim explicit protection, we don't protect * the same bytes twice. + * + * Check both usage and parent_usage against the respective + * protected values. One should imply the other, but they + * aren't read atomically - make sure the division is sane. */ if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)) return ep; - - if (parent_effective > siblings_protected && usage > protected) { + if (parent_effective > siblings_protected && + parent_usage > siblings_protected && + usage > protected) { unsigned long unclaimed; unclaimed = parent_effective - siblings_protected; @@ -6416,7 +6423,7 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, if (parent == root) { memcg->memory.emin = READ_ONCE(memcg->memory.min); - memcg->memory.elow = memcg->memory.low; + memcg->memory.elow = READ_ONCE(memcg->memory.low); goto out; } @@ -6428,7 +6435,8 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, atomic_long_read(&parent->memory.children_min_usage))); WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage, - memcg->memory.low, READ_ONCE(parent->memory.elow), + READ_ONCE(memcg->memory.low), + READ_ONCE(parent->memory.elow), atomic_long_read(&parent->memory.children_low_usage))); out: @@ -7178,6 +7186,13 @@ static struct cftype memsw_files[] = { { }, /* terminate */ }; +/* + * If mem_cgroup_swap_init() is implemented as a subsys_initcall() + * instead of a core_initcall(), this could mean cgroup_memory_noswap still + * remains set to false even when memcg is disabled via "cgroup_disable=memory" + * boot parameter. This may result in premature OOPS inside + * mem_cgroup_get_nr_swap_pages() function in corner cases. + */ static int __init mem_cgroup_swap_init(void) { /* No memory control -> no swap control */ @@ -7192,6 +7207,6 @@ static int __init mem_cgroup_swap_init(void) return 0; } -subsys_initcall(mem_cgroup_swap_init); +core_initcall(mem_cgroup_swap_init); #endif /* CONFIG_MEMCG_SWAP */ diff --git a/mm/memory.c b/mm/memory.c index dc7f3543b1fd..3ecad55103ad 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1498,7 +1498,7 @@ out: } #ifdef pte_index -static int insert_page_in_batch_locked(struct mm_struct *mm, pmd_t *pmd, +static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { int err; @@ -1506,8 +1506,9 @@ static int insert_page_in_batch_locked(struct mm_struct *mm, pmd_t *pmd, if (!page_count(page)) return -EINVAL; err = validate_page_before_insert(page); - return err ? 
err : insert_page_into_pte_locked( - mm, pte_offset_map(pmd, addr), addr, page, prot); + if (err) + return err; + return insert_page_into_pte_locked(mm, pte, addr, page, prot); } /* insert_pages() amortizes the cost of spinlock operations @@ -1517,7 +1518,8 @@ static int insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num, pgprot_t prot) { pmd_t *pmd = NULL; - spinlock_t *pte_lock = NULL; + pte_t *start_pte, *pte; + spinlock_t *pte_lock; struct mm_struct *const mm = vma->vm_mm; unsigned long curr_page_idx = 0; unsigned long remaining_pages_total = *num; @@ -1536,18 +1538,17 @@ more: ret = -ENOMEM; if (pte_alloc(mm, pmd)) goto out; - pte_lock = pte_lockptr(mm, pmd); while (pages_to_write_in_pmd) { int pte_idx = 0; const int batch_size = min_t(int, pages_to_write_in_pmd, 8); - spin_lock(pte_lock); - for (; pte_idx < batch_size; ++pte_idx) { - int err = insert_page_in_batch_locked(mm, pmd, + start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock); + for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) { + int err = insert_page_in_batch_locked(mm, pte, addr, pages[curr_page_idx], prot); if (unlikely(err)) { - spin_unlock(pte_lock); + pte_unmap_unlock(start_pte, pte_lock); ret = err; remaining_pages_total -= pte_idx; goto out; @@ -1555,7 +1556,7 @@ more: addr += PAGE_SIZE; ++curr_page_idx; } - spin_unlock(pte_lock); + pte_unmap_unlock(start_pte, pte_lock); pages_to_write_in_pmd -= batch_size; remaining_pages_total -= batch_size; } @@ -1600,7 +1601,7 @@ int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, return insert_pages(vma, addr, pages, num, vma->vm_page_prot); #else unsigned long idx = 0, pgcount = *num; - int err; + int err = -EINVAL; for (; idx < pgcount; ++idx) { err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]); @@ -3140,8 +3141,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) err = mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL); ClearPageSwapCache(page); - if (err) + if (err) { + ret = VM_FAULT_OOM; goto out_page; + } + + /* + * XXX: Move to lru_cache_add() when it + * supports new vs putback + */ + spin_lock_irq(&page_pgdat(page)->lru_lock); + lru_note_cost_page(page); + spin_unlock_irq(&page_pgdat(page)->lru_lock); lru_cache_add(page); swap_readpage(page, true); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9b34e03e730a..da374cd3d45b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -471,11 +471,20 @@ void __ref remove_pfn_range_from_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { + const unsigned long end_pfn = start_pfn + nr_pages; struct pglist_data *pgdat = zone->zone_pgdat; - unsigned long flags; + unsigned long pfn, cur_nr_pages, flags; /* Poison struct pages because they are now uninitialized again. */ - page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages); + for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) { + cond_resched(); + + /* Select all remaining pages up to the next section boundary */ + cur_nr_pages = + min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn); + page_init_poison(pfn_to_page(pfn), + sizeof(struct page) * cur_nr_pages); + } #ifdef CONFIG_ZONE_DEVICE /* diff --git a/mm/migrate.c b/mm/migrate.c index f37729673558..40cd7016ae6f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1161,21 +1161,10 @@ out: } /* - * gcc 4.7 and 4.8 on arm get an ICEs when inlining unmap_and_move(). Work - * around it. 
- */ -#if defined(CONFIG_ARM) && \ - defined(GCC_VERSION) && GCC_VERSION < 40900 && GCC_VERSION >= 40700 -#define ICE_noinline noinline -#else -#define ICE_noinline -#endif - -/* * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. */ -static ICE_noinline int unmap_and_move(new_page_t get_new_page, +static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, unsigned long private, struct page *page, int force, enum migrate_mode mode, diff --git a/mm/mmap.c b/mm/mmap.c index 59a4682ebf3f..8c7ca737a19b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2620,7 +2620,7 @@ static void unmap_region(struct mm_struct *mm, * Create a list of vma's touched by the unmap, removing them from the mm's * vma list as we go.. */ -static void +static bool detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long end) { @@ -2645,6 +2645,17 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, /* Kill the cache */ vmacache_invalidate(mm); + + /* + * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or + * VM_GROWSUP VMA. Such VMAs can change their size under + * down_read(mmap_lock) and collide with the VMA we are about to unmap. + */ + if (vma && (vma->vm_flags & VM_GROWSDOWN)) + return false; + if (prev && (prev->vm_flags & VM_GROWSUP)) + return false; + return true; } /* @@ -2825,7 +2836,8 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, } /* Detach vmas from rbtree */ - detach_vmas_to_be_unmapped(mm, vma, prev, end); + if (!detach_vmas_to_be_unmapped(mm, vma, prev, end)) + downgrade = false; if (downgrade) mmap_write_downgrade(mm); diff --git a/mm/mremap.c b/mm/mremap.c index 5dd572d57ca9..6b153dc05fe4 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -206,9 +206,28 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, /* * The destination pmd shouldn't be established, free_pgtables() - * should have release it. + * should have released it. + * + * However, there's a case during execve() where we use mremap + * to move the initial stack, and in that case the target area + * may overlap the source area (always moving down). + * + * If everything is PMD-aligned, that works fine, as moving + * each pmd down will clear the source pmd. But if we first + * have a few 4kB-only pages that get moved down, and then + * hit the "now the rest is PMD-aligned, let's do everything + * one pmd at a time", we will still have the old (now empty + * of any 4kB pages, but still there) PMD in the page table + * tree. + * + * Warn on it once - because we really should try to figure + * out how to do this better - but then say "I won't move + * this pmd". + * + * One alternative might be to just unmap the target pmd at + * this point, and verify that it really is empty. We'll see. */ - if (WARN_ON(!pmd_none(*new_pmd))) + if (WARN_ON_ONCE(!pmd_none(*new_pmd))) return false; /* diff --git a/mm/nommu.c b/mm/nommu.c index cdcad5d61dd1..f32a69095d50 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -291,23 +291,6 @@ void *vzalloc_node(unsigned long size, int node) EXPORT_SYMBOL(vzalloc_node); /** - * vmalloc_exec - allocate virtually contiguous, executable memory - * @size: allocation size - * - * Kernel-internal function to allocate enough pages to cover @size - * the page level allocator and map them into contiguous and - * executable kernel virtual space. 
- * - * For tight control over page level allocator and protection flags - * use __vmalloc() instead. - */ - -void *vmalloc_exec(unsigned long size) -{ - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM); -} - -/** * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) * @size: allocation size * diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 48eb0f1410d4..e028b87ce294 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7832,7 +7832,7 @@ void setup_per_zone_wmarks(void) * Initialise min_free_kbytes. * * For small machines we want it small (128k min). For large machines - * we want it large (64MB max). But it is not linear, because network + * we want it large (256MB max). But it is not linear, because network * bandwidth does not increase linearly with machine size. We use * * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: diff --git a/mm/page_io.c b/mm/page_io.c index e8726f3e3820..ccda76790088 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -277,6 +277,23 @@ static inline void count_swpout_vm_event(struct page *page) count_vm_events(PSWPOUT, hpage_nr_pages(page)); } +#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) +static void bio_associate_blkg_from_page(struct bio *bio, struct page *page) +{ + struct cgroup_subsys_state *css; + + if (!page->mem_cgroup) + return; + + rcu_read_lock(); + css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys); + bio_associate_blkg_from_css(bio, css); + rcu_read_unlock(); +} +#else +#define bio_associate_blkg_from_page(bio, page) do { } while (0) +#endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */ + int __swap_writepage(struct page *page, struct writeback_control *wbc, bio_end_io_t end_write_func) { diff --git a/mm/shmem.c b/mm/shmem.c index a0dbe62f8042..b2abca3f7f33 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3178,7 +3178,7 @@ static int shmem_initxattrs(struct inode *inode, new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, GFP_KERNEL); if (!new_xattr->name) { - kfree(new_xattr); + kvfree(new_xattr); return -ENOMEM; } diff --git a/mm/slab.h b/mm/slab.h index 207c83ef6e06..74f7e09a7cfd 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -348,7 +348,7 @@ static __always_inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, struct kmem_cache *s) { - unsigned int nr_pages = 1 << order; + int nr_pages = 1 << order; struct mem_cgroup *memcg; struct lruvec *lruvec; int ret; @@ -388,7 +388,7 @@ out: static __always_inline void memcg_uncharge_slab(struct page *page, int order, struct kmem_cache *s) { - unsigned int nr_pages = 1 << order; + int nr_pages = 1 << order; struct mem_cgroup *memcg; struct lruvec *lruvec; diff --git a/mm/slab_common.c b/mm/slab_common.c index 9e72ba224175..fe8b68482670 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -326,6 +326,14 @@ int slab_unmergeable(struct kmem_cache *s) if (s->refcount < 0) return 1; +#ifdef CONFIG_MEMCG_KMEM + /* + * Skip the dying kmem_cache. + */ + if (s->memcg_params.dying) + return 1; +#endif + return 0; } @@ -886,12 +894,15 @@ static int shutdown_memcg_caches(struct kmem_cache *s) return 0; } -static void flush_memcg_workqueue(struct kmem_cache *s) +static void memcg_set_kmem_cache_dying(struct kmem_cache *s) { spin_lock_irq(&memcg_kmem_wq_lock); s->memcg_params.dying = true; spin_unlock_irq(&memcg_kmem_wq_lock); +} +static void flush_memcg_workqueue(struct kmem_cache *s) +{ /* * SLAB and SLUB deactivate the kmem_caches through call_rcu. Make * sure all registered rcu callbacks have been invoked. 
@@ -923,10 +934,6 @@ static inline int shutdown_memcg_caches(struct kmem_cache *s) { return 0; } - -static inline void flush_memcg_workqueue(struct kmem_cache *s) -{ -} #endif /* CONFIG_MEMCG_KMEM */ void slab_kmem_cache_release(struct kmem_cache *s) @@ -944,8 +951,6 @@ void kmem_cache_destroy(struct kmem_cache *s) if (unlikely(!s)) return; - flush_memcg_workqueue(s); - get_online_cpus(); get_online_mems(); @@ -955,6 +960,22 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->refcount) goto out_unlock; +#ifdef CONFIG_MEMCG_KMEM + memcg_set_kmem_cache_dying(s); + + mutex_unlock(&slab_mutex); + + put_online_mems(); + put_online_cpus(); + + flush_memcg_workqueue(s); + + get_online_cpus(); + get_online_mems(); + + mutex_lock(&slab_mutex); +#endif + err = shutdown_memcg_caches(s); if (!err) err = shutdown_cache(s); @@ -1726,7 +1747,7 @@ void kzfree(const void *p) if (unlikely(ZERO_OR_NULL_PTR(mem))) return; ks = ksize(mem); - memset(mem, 0, ks); + memzero_explicit(mem, ks); kfree(mem); } EXPORT_SYMBOL(kzfree); diff --git a/mm/slub.c b/mm/slub.c index fe81773fd97e..ef303070d175 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3766,15 +3766,13 @@ error: } static void list_slab_objects(struct kmem_cache *s, struct page *page, - const char *text, unsigned long *map) + const char *text) { #ifdef CONFIG_SLUB_DEBUG void *addr = page_address(page); + unsigned long *map; void *p; - if (!map) - return; - slab_err(s, page, text, s->name); slab_lock(page); @@ -3786,6 +3784,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, print_tracking(s, p); } } + put_map(map); slab_unlock(page); #endif } @@ -3799,11 +3798,6 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { LIST_HEAD(discard); struct page *page, *h; - unsigned long *map = NULL; - -#ifdef CONFIG_SLUB_DEBUG - map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL); -#endif BUG_ON(irqs_disabled()); spin_lock_irq(&n->list_lock); @@ -3813,16 +3807,11 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) list_add(&page->slab_list, &discard); } else { list_slab_objects(s, page, - "Objects remaining in %s on __kmem_cache_shutdown()", - map); + "Objects remaining in %s on __kmem_cache_shutdown()"); } } spin_unlock_irq(&n->list_lock); -#ifdef CONFIG_SLUB_DEBUG - bitmap_free(map); -#endif - list_for_each_entry_safe(page, h, &discard, slab_list) discard_slab(s, page); } diff --git a/mm/swap.c b/mm/swap.c index dbcab84c6fce..a82efc33411f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -443,8 +443,7 @@ void mark_page_accessed(struct page *page) else __lru_cache_activate_page(page); ClearPageReferenced(page); - if (page_is_file_lru(page)) - workingset_activation(page); + workingset_activation(page); } if (page_is_idle(page)) clear_page_idle(page); diff --git a/mm/swap_state.c b/mm/swap_state.c index e98ff460e9e9..05889e8e3c97 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -21,7 +21,7 @@ #include <linux/vmalloc.h> #include <linux/swap_slots.h> #include <linux/huge_mm.h> - +#include "internal.h" /* * swapper_space is a fiction, retained to simplify the path through @@ -429,7 +429,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, __SetPageSwapBacked(page); /* May fail (-ENOMEM) if XArray node allocation failed. 
*/ - if (add_to_swap_cache(page, entry, gfp_mask & GFP_KERNEL)) { + if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK)) { put_swap_page(page, entry); goto fail_unlock; } diff --git a/mm/swapfile.c b/mm/swapfile.c index 987276c557d1..6c26916e95fd 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2929,7 +2929,7 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) * write only restriction. Hence zoned block devices are not * suitable for swapping. Disallow them here. */ - if (blk_queue_is_zoned(p->bdev->bd_queue)) + if (blk_queue_is_zoned(p->bdev->bd_disk->queue)) return -EINVAL; p->flags |= SWP_BLKDEV; } else if (S_ISREG(inode->i_mode)) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 3091c2ca60df..5a2b55c8dd9a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1862,7 +1862,6 @@ EXPORT_SYMBOL(vm_unmap_ram); * @pages: an array of pointers to the pages to be mapped * @count: number of pages * @node: prefer to allocate data structures on this node - * @prot: memory protection to use. PAGE_KERNEL for regular RAM * * If you use this function for less than VMAP_MAX_ALLOC pages, it could be * faster than vmap so it's good. But if you mix long-life and short-life @@ -2696,26 +2695,6 @@ void *vzalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vzalloc_node); -/** - * vmalloc_exec - allocate virtually contiguous, executable memory - * @size: allocation size - * - * Kernel-internal function to allocate enough pages to cover @size - * the page level allocator and map them into contiguous and - * executable kernel virtual space. - * - * For tight control over page level allocator and protection flags - * use __vmalloc() instead. - * - * Return: pointer to the allocated memory or %NULL on error - */ -void *vmalloc_exec(unsigned long size) -{ - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, - NUMA_NO_NODE, __builtin_return_address(0)); -} - #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) diff --git a/mm/vmscan.c b/mm/vmscan.c index b6d84326bdf2..749d239c62b2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -904,6 +904,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, __delete_from_swap_cache(page, swap); xa_unlock_irqrestore(&mapping->i_pages, flags); put_swap_page(page, swap); + workingset_eviction(page, target_memcg); } else { void (*freepage)(struct page *); void *shadow = NULL; @@ -1884,6 +1885,8 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, list_add(&page->lru, &pages_to_free); } else { nr_moved += nr_pages; + if (PageActive(page)) + workingset_age_nonresident(lruvec, nr_pages); } } diff --git a/mm/workingset.c b/mm/workingset.c index d481ea452eeb..50b7937bab32 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -156,8 +156,8 @@ * * Implementation * - * For each node's file LRU lists, a counter for inactive evictions - * and activations is maintained (node->inactive_age). + * For each node's LRU lists, a counter for inactive evictions and + * activations is maintained (node->nonresident_age). 
* * On eviction, a snapshot of this counter (along with some bits to * identify the node) is stored in the now empty page cache @@ -213,7 +213,17 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, *workingsetp = workingset; } -static void advance_inactive_age(struct mem_cgroup *memcg, pg_data_t *pgdat) +/** + * workingset_age_nonresident - age non-resident entries as LRU ages + * @memcg: the lruvec that was aged + * @nr_pages: the number of pages to count + * + * As in-memory pages are aged, non-resident pages need to be aged as + * well, in order for the refault distances later on to be comparable + * to the in-memory dimensions. This function allows reclaim and LRU + * operations to drive the non-resident aging along in parallel. + */ +void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages) { /* * Reclaiming a cgroup means reclaiming all its children in a @@ -227,11 +237,8 @@ static void advance_inactive_age(struct mem_cgroup *memcg, pg_data_t *pgdat) * the root cgroup's, age as well. */ do { - struct lruvec *lruvec; - - lruvec = mem_cgroup_lruvec(memcg, pgdat); - atomic_long_inc(&lruvec->inactive_age); - } while (memcg && (memcg = parent_mem_cgroup(memcg))); + atomic_long_add(nr_pages, &lruvec->nonresident_age); + } while ((lruvec = parent_lruvec(lruvec))); } /** @@ -254,12 +261,11 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); - advance_inactive_age(page_memcg(page), pgdat); - lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + workingset_age_nonresident(lruvec, hpage_nr_pages(page)); /* XXX: target_memcg can be NULL, go through lruvec */ memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); - eviction = atomic_long_read(&lruvec->inactive_age); + eviction = atomic_long_read(&lruvec->nonresident_age); return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); } @@ -309,20 +315,20 @@ void workingset_refault(struct page *page, void *shadow) if (!mem_cgroup_disabled() && !eviction_memcg) goto out; eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); - refault = atomic_long_read(&eviction_lruvec->inactive_age); + refault = atomic_long_read(&eviction_lruvec->nonresident_age); /* * Calculate the refault distance * * The unsigned subtraction here gives an accurate distance - * across inactive_age overflows in most cases. There is a + * across nonresident_age overflows in most cases. There is a * special case: usually, shadow entries have a short lifetime * and are either refaulted or reclaimed along with the inode * before they get too old. But it is not impossible for the - * inactive_age to lap a shadow entry in the field, which can - * then result in a false small refault distance, leading to a - * false activation should this old entry actually refault - * again. However, earlier kernels used to deactivate + * nonresident_age to lap a shadow entry in the field, which + * can then result in a false small refault distance, leading + * to a false activation should this old entry actually + * refault again. However, earlier kernels used to deactivate * unconditionally with *every* reclaim invocation for the * longest time, so the occasional inappropriate activation * leading to pressure on the active list is not a problem. 
@@ -359,7 +365,7 @@ void workingset_refault(struct page *page, void *shadow) goto out; SetPageActive(page); - advance_inactive_age(memcg, pgdat); + workingset_age_nonresident(lruvec, hpage_nr_pages(page)); inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); /* Page was active prior to eviction */ @@ -382,6 +388,7 @@ out: void workingset_activation(struct page *page) { struct mem_cgroup *memcg; + struct lruvec *lruvec; rcu_read_lock(); /* @@ -394,7 +401,8 @@ void workingset_activation(struct page *page) memcg = page_memcg_rcu(page); if (!mem_cgroup_disabled() && !memcg) goto out; - advance_inactive_age(memcg, page_pgdat(page)); + lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); + workingset_age_nonresident(lruvec, hpage_nr_pages(page)); out: rcu_read_unlock(); } |
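
As an aside on the refault-distance comment in the mm/workingset.c hunk above: the accuracy across counter overflows comes from plain unsigned wraparound arithmetic. A minimal standalone C sketch of that property (userspace code with illustrative values, not part of the patch):

#include <limits.h>
#include <stdio.h>

int main(void)
{
	/* Snapshot of the aging counter taken at eviction, close to wrapping. */
	unsigned long eviction = ULONG_MAX - 5;
	/* Counter value observed at refault time, after it has wrapped. */
	unsigned long refault = 10;

	/*
	 * Unsigned subtraction is defined modulo ULONG_MAX + 1, so the
	 * distance comes out as 16 even though the counter overflowed in
	 * between, which is the property workingset_refault() relies on.
	 */
	unsigned long distance = refault - eviction;

	printf("refault distance: %lu\n", distance);
	return 0;
}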