Diffstat (limited to 'mm')
-rw-r--r-- | mm/backing-dev.c | 157
-rw-r--r-- | mm/cma.c | 4
-rw-r--r-- | mm/compaction.c | 17
-rw-r--r-- | mm/debug.c | 8
-rw-r--r-- | mm/debug_vm_pgtable.c | 6
-rw-r--r-- | mm/filemap.c | 237
-rw-r--r-- | mm/frontswap.c | 2
-rw-r--r-- | mm/gup.c | 4
-rw-r--r-- | mm/hugetlb.c | 17
-rw-r--r-- | mm/khugepaged.c | 3
-rw-r--r-- | mm/ksm.c | 2
-rw-r--r-- | mm/list_lru.c | 6
-rw-r--r-- | mm/maccess.c | 61
-rw-r--r-- | mm/memblock.c | 57
-rw-r--r-- | mm/memcontrol.c | 33
-rw-r--r-- | mm/memory.c | 39
-rw-r--r-- | mm/memory_hotplug.c | 13
-rw-r--r-- | mm/mempolicy.c | 4
-rw-r--r-- | mm/migrate.c | 13
-rw-r--r-- | mm/mmap.c | 17
-rw-r--r-- | mm/mremap.c | 23
-rw-r--r-- | mm/nommu.c | 17
-rw-r--r-- | mm/page_alloc.c | 4
-rw-r--r-- | mm/page_io.c | 17
-rw-r--r-- | mm/percpu.c | 2
-rw-r--r-- | mm/rodata_test.c | 2
-rw-r--r-- | mm/shmem.c | 2
-rw-r--r-- | mm/slab.h | 4
-rw-r--r-- | mm/slab_common.c | 37
-rw-r--r-- | mm/slub.c | 25
-rw-r--r-- | mm/swap.c | 7
-rw-r--r-- | mm/swap_state.c | 4
-rw-r--r-- | mm/swapfile.c | 2
-rw-r--r-- | mm/vmalloc.c | 21
-rw-r--r-- | mm/vmscan.c | 3
-rw-r--r-- | mm/workingset.c | 46
36 files changed, 475 insertions, 441 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index d382272bcc31..8e8b00627bb2 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -281,7 +281,7 @@ void wb_wakeup_delayed(struct bdi_writeback *wb) #define INIT_BW (100 << (20 - PAGE_SHIFT)) static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, - int blkcg_id, gfp_t gfp) + gfp_t gfp) { int i, err; @@ -308,15 +308,9 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, INIT_DELAYED_WORK(&wb->dwork, wb_workfn); wb->dirty_sleep = jiffies; - wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp); - if (!wb->congested) { - err = -ENOMEM; - goto out_put_bdi; - } - err = fprop_local_init_percpu(&wb->completions, gfp); if (err) - goto out_put_cong; + goto out_put_bdi; for (i = 0; i < NR_WB_STAT_ITEMS; i++) { err = percpu_counter_init(&wb->stat[i], 0, gfp); @@ -330,8 +324,6 @@ out_destroy_stat: while (i--) percpu_counter_destroy(&wb->stat[i]); fprop_local_destroy_percpu(&wb->completions); -out_put_cong: - wb_congested_put(wb->congested); out_put_bdi: if (wb != &bdi->wb) bdi_put(bdi); @@ -374,7 +366,6 @@ static void wb_exit(struct bdi_writeback *wb) percpu_counter_destroy(&wb->stat[i]); fprop_local_destroy_percpu(&wb->completions); - wb_congested_put(wb->congested); if (wb != &wb->bdi->wb) bdi_put(wb->bdi); } @@ -384,99 +375,12 @@ static void wb_exit(struct bdi_writeback *wb) #include <linux/memcontrol.h> /* - * cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree, - * blkcg->cgwb_list, and memcg->cgwb_list. bdi->cgwb_tree is also RCU - * protected. + * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, and memcg->cgwb_list. + * bdi->cgwb_tree is also RCU protected. */ static DEFINE_SPINLOCK(cgwb_lock); static struct workqueue_struct *cgwb_release_wq; -/** - * wb_congested_get_create - get or create a wb_congested - * @bdi: associated bdi - * @blkcg_id: ID of the associated blkcg - * @gfp: allocation mask - * - * Look up the wb_congested for @blkcg_id on @bdi. If missing, create one. - * The returned wb_congested has its reference count incremented. Returns - * NULL on failure. 
- */ -struct bdi_writeback_congested * -wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp) -{ - struct bdi_writeback_congested *new_congested = NULL, *congested; - struct rb_node **node, *parent; - unsigned long flags; -retry: - spin_lock_irqsave(&cgwb_lock, flags); - - node = &bdi->cgwb_congested_tree.rb_node; - parent = NULL; - - while (*node != NULL) { - parent = *node; - congested = rb_entry(parent, struct bdi_writeback_congested, - rb_node); - if (congested->blkcg_id < blkcg_id) - node = &parent->rb_left; - else if (congested->blkcg_id > blkcg_id) - node = &parent->rb_right; - else - goto found; - } - - if (new_congested) { - /* !found and storage for new one already allocated, insert */ - congested = new_congested; - rb_link_node(&congested->rb_node, parent, node); - rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree); - spin_unlock_irqrestore(&cgwb_lock, flags); - return congested; - } - - spin_unlock_irqrestore(&cgwb_lock, flags); - - /* allocate storage for new one and retry */ - new_congested = kzalloc(sizeof(*new_congested), gfp); - if (!new_congested) - return NULL; - - refcount_set(&new_congested->refcnt, 1); - new_congested->__bdi = bdi; - new_congested->blkcg_id = blkcg_id; - goto retry; - -found: - refcount_inc(&congested->refcnt); - spin_unlock_irqrestore(&cgwb_lock, flags); - kfree(new_congested); - return congested; -} - -/** - * wb_congested_put - put a wb_congested - * @congested: wb_congested to put - * - * Put @congested and destroy it if the refcnt reaches zero. - */ -void wb_congested_put(struct bdi_writeback_congested *congested) -{ - unsigned long flags; - - if (!refcount_dec_and_lock_irqsave(&congested->refcnt, &cgwb_lock, &flags)) - return; - - /* bdi might already have been destroyed leaving @congested unlinked */ - if (congested->__bdi) { - rb_erase(&congested->rb_node, - &congested->__bdi->cgwb_congested_tree); - congested->__bdi = NULL; - } - - spin_unlock_irqrestore(&cgwb_lock, flags); - kfree(congested); -} - static void cgwb_release_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(work, struct bdi_writeback, @@ -558,7 +462,7 @@ static int cgwb_create(struct backing_dev_info *bdi, goto out_put; } - ret = wb_init(wb, bdi, blkcg_css->id, gfp); + ret = wb_init(wb, bdi, gfp); if (ret) goto err_free; @@ -696,11 +600,10 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) int ret; INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); - bdi->cgwb_congested_tree = RB_ROOT; mutex_init(&bdi->cgwb_release_mutex); init_rwsem(&bdi->wb_switch_rwsem); - ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); + ret = wb_init(&bdi->wb, bdi, GFP_KERNEL); if (!ret) { bdi->wb.memcg_css = &root_mem_cgroup->css; bdi->wb.blkcg_css = blkcg_root_css; @@ -769,21 +672,6 @@ void wb_blkcg_offline(struct blkcg *blkcg) spin_unlock_irq(&cgwb_lock); } -static void cgwb_bdi_exit(struct backing_dev_info *bdi) -{ - struct rb_node *rbn; - - spin_lock_irq(&cgwb_lock); - while ((rbn = rb_first(&bdi->cgwb_congested_tree))) { - struct bdi_writeback_congested *congested = - rb_entry(rbn, struct bdi_writeback_congested, rb_node); - - rb_erase(rbn, &bdi->cgwb_congested_tree); - congested->__bdi = NULL; /* mark @congested unlinked */ - } - spin_unlock_irq(&cgwb_lock); -} - static void cgwb_bdi_register(struct backing_dev_info *bdi) { spin_lock_irq(&cgwb_lock); @@ -810,29 +698,11 @@ subsys_initcall(cgwb_init); static int cgwb_bdi_init(struct backing_dev_info *bdi) { - int err; - - bdi->wb_congested = kzalloc(sizeof(*bdi->wb_congested), GFP_KERNEL); 
- if (!bdi->wb_congested) - return -ENOMEM; - - refcount_set(&bdi->wb_congested->refcnt, 1); - - err = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); - if (err) { - wb_congested_put(bdi->wb_congested); - return err; - } - return 0; + return wb_init(&bdi->wb, bdi, GFP_KERNEL); } static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { } -static void cgwb_bdi_exit(struct backing_dev_info *bdi) -{ - wb_congested_put(bdi->wb_congested); -} - static void cgwb_bdi_register(struct backing_dev_info *bdi) { list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); @@ -1023,7 +893,6 @@ static void release_bdi(struct kref *ref) bdi_unregister(bdi); WARN_ON_ONCE(bdi->dev); wb_exit(&bdi->wb); - cgwb_bdi_exit(bdi); kfree(bdi); } @@ -1047,29 +916,29 @@ static wait_queue_head_t congestion_wqh[2] = { }; static atomic_t nr_wb_congested[2]; -void clear_wb_congested(struct bdi_writeback_congested *congested, int sync) +void clear_bdi_congested(struct backing_dev_info *bdi, int sync) { wait_queue_head_t *wqh = &congestion_wqh[sync]; enum wb_congested_state bit; bit = sync ? WB_sync_congested : WB_async_congested; - if (test_and_clear_bit(bit, &congested->state)) + if (test_and_clear_bit(bit, &bdi->wb.congested)) atomic_dec(&nr_wb_congested[sync]); smp_mb__after_atomic(); if (waitqueue_active(wqh)) wake_up(wqh); } -EXPORT_SYMBOL(clear_wb_congested); +EXPORT_SYMBOL(clear_bdi_congested); -void set_wb_congested(struct bdi_writeback_congested *congested, int sync) +void set_bdi_congested(struct backing_dev_info *bdi, int sync) { enum wb_congested_state bit; bit = sync ? WB_sync_congested : WB_async_congested; - if (!test_and_set_bit(bit, &congested->state)) + if (!test_and_set_bit(bit, &bdi->wb.congested)) atomic_inc(&nr_wb_congested[sync]); } -EXPORT_SYMBOL(set_wb_congested); +EXPORT_SYMBOL(set_bdi_congested); /** * congestion_wait - wait for a backing_dev to become uncongested @@ -339,13 +339,13 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, */ if (base < highmem_start && limit > highmem_start) { addr = memblock_alloc_range_nid(size, alignment, - highmem_start, limit, nid, false); + highmem_start, limit, nid, true); limit = highmem_start; } if (!addr) { addr = memblock_alloc_range_nid(size, alignment, base, - limit, nid, false); + limit, nid, true); if (!addr) { ret = -ENOMEM; goto err; diff --git a/mm/compaction.c b/mm/compaction.c index fd988b7e5f2b..86375605faa9 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2316,15 +2316,26 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, .page = NULL, }; - current->capture_control = &capc; + /* + * Make sure the structs are really initialized before we expose the + * capture control, in case we are interrupted and the interrupt handler + * frees a page. + */ + barrier(); + WRITE_ONCE(current->capture_control, &capc); ret = compact_zone(&cc, &capc); VM_BUG_ON(!list_empty(&cc.freepages)); VM_BUG_ON(!list_empty(&cc.migratepages)); - *capture = capc.page; - current->capture_control = NULL; + /* + * Make sure we hide capture control first before we read the captured + * page pointer, otherwise an interrupt could free and capture a page + * and we would leak it. 
+ */ + WRITE_ONCE(current->capture_control, NULL); + *capture = READ_ONCE(capc.page); return ret; } diff --git a/mm/debug.c b/mm/debug.c index b5b1de8c71ac..4f376514744d 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -120,9 +120,9 @@ void __dump_page(struct page *page, const char *reason) * mapping can be invalid pointer and we don't want to crash * accessing it, so probe everything depending on it carefully */ - if (probe_kernel_read(&host, &mapping->host, + if (copy_from_kernel_nofault(&host, &mapping->host, sizeof(struct inode *)) || - probe_kernel_read(&a_ops, &mapping->a_ops, + copy_from_kernel_nofault(&a_ops, &mapping->a_ops, sizeof(struct address_space_operations *))) { pr_warn("failed to read mapping->host or a_ops, mapping not a valid kernel address?\n"); goto out_mapping; @@ -133,7 +133,7 @@ void __dump_page(struct page *page, const char *reason) goto out_mapping; } - if (probe_kernel_read(&dentry_first, + if (copy_from_kernel_nofault(&dentry_first, &host->i_dentry.first, sizeof(struct hlist_node *))) { pr_warn("mapping->a_ops:%ps with invalid mapping->host inode address %px\n", a_ops, host); @@ -146,7 +146,7 @@ void __dump_page(struct page *page, const char *reason) } dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); - if (probe_kernel_read(&dentry, dentry_ptr, + if (copy_from_kernel_nofault(&dentry, dentry_ptr, sizeof(struct dentry))) { pr_warn("mapping->aops:%ps with invalid mapping->host->i_dentry.first %px\n", a_ops, dentry_ptr); diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index e45623016aea..d315ff544f05 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -246,13 +246,13 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp, static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep, unsigned long vaddr) { - pte_t pte = READ_ONCE(*ptep); + pte_t pte = ptep_get(ptep); pte = __pte(pte_val(pte) | RANDOM_ORVALUE); set_pte_at(mm, vaddr, ptep, pte); barrier(); pte_clear(mm, vaddr, ptep); - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); WARN_ON(!pte_none(pte)); } @@ -307,7 +307,7 @@ static int __init debug_vm_pgtable(void) phys_addr_t paddr; unsigned long vaddr, pte_aligned, pmd_aligned; unsigned long pud_aligned, p4d_aligned, pgd_aligned; - spinlock_t *uninitialized_var(ptl); + spinlock_t *ptl = NULL; pr_info("Validating architecture page table helpers\n"); prot = vm_get_page_prot(VMFLAGS); diff --git a/mm/filemap.c b/mm/filemap.c index f0ae9a6308cb..9f131f1cfde3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -987,44 +987,46 @@ void __init pagecache_init(void) page_writeback_init(); } -/* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */ -struct wait_page_key { - struct page *page; - int bit_nr; - int page_match; -}; - -struct wait_page_queue { - struct page *page; - int bit_nr; - wait_queue_entry_t wait; -}; - static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { + int ret; struct wait_page_key *key = arg; struct wait_page_queue *wait_page = container_of(wait, struct wait_page_queue, wait); - if (wait_page->page != key->page) - return 0; - key->page_match = 1; - - if (wait_page->bit_nr != key->bit_nr) + if (!wake_page_match(wait_page, key)) return 0; /* - * Stop walking if it's locked. - * Is this safe if put_and_wait_on_page_locked() is in use? 
- * Yes: the waker must hold a reference to this page, and if PG_locked - * has now already been set by another task, that task must also hold - * a reference to the *same usage* of this page; so there is no need - * to walk on to wake even the put_and_wait_on_page_locked() callers. + * If it's an exclusive wait, we get the bit for it, and + * stop walking if we can't. + * + * If it's a non-exclusive wait, then the fact that this + * wake function was called means that the bit already + * was cleared, and we don't care if somebody then + * re-took it. */ - if (test_bit(key->bit_nr, &key->page->flags)) - return -1; + ret = 0; + if (wait->flags & WQ_FLAG_EXCLUSIVE) { + if (test_and_set_bit(key->bit_nr, &key->page->flags)) + return -1; + ret = 1; + } + wait->flags |= WQ_FLAG_WOKEN; + + wake_up_state(wait->private, mode); - return autoremove_wake_function(wait, mode, sync, key); + /* + * Ok, we have successfully done what we're waiting for, + * and we can unconditionally remove the wait entry. + * + * Note that this has to be the absolute last thing we do, + * since after list_del_init(&wait->entry) the wait entry + * might be de-allocated and the process might even have + * exited. + */ + list_del_init_careful(&wait->entry); + return ret; } static void wake_up_page_bit(struct page *page, int bit_nr) @@ -1103,16 +1105,31 @@ enum behavior { */ }; +/* + * Attempt to check (or get) the page bit, and mark the + * waiter woken if successful. + */ +static inline bool trylock_page_bit_common(struct page *page, int bit_nr, + struct wait_queue_entry *wait) +{ + if (wait->flags & WQ_FLAG_EXCLUSIVE) { + if (test_and_set_bit(bit_nr, &page->flags)) + return false; + } else if (test_bit(bit_nr, &page->flags)) + return false; + + wait->flags |= WQ_FLAG_WOKEN; + return true; +} + static inline int wait_on_page_bit_common(wait_queue_head_t *q, struct page *page, int bit_nr, int state, enum behavior behavior) { struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; - bool bit_is_set; bool thrashing = false; bool delayacct = false; unsigned long pflags; - int ret = 0; if (bit_nr == PG_locked && !PageUptodate(page) && PageWorkingset(page)) { @@ -1130,48 +1147,47 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, wait_page.page = page; wait_page.bit_nr = bit_nr; - for (;;) { - spin_lock_irq(&q->lock); + /* + * Do one last check whether we can get the + * page bit synchronously. + * + * Do the SetPageWaiters() marking before that + * to let any waker we _just_ missed know they + * need to wake us up (otherwise they'll never + * even go to the slow case that looks at the + * page queue), and add ourselves to the wait + * queue if we need to sleep. + * + * This part needs to be done under the queue + * lock to avoid races. + */ + spin_lock_irq(&q->lock); + SetPageWaiters(page); + if (!trylock_page_bit_common(page, bit_nr, wait)) + __add_wait_queue_entry_tail(q, wait); + spin_unlock_irq(&q->lock); - if (likely(list_empty(&wait->entry))) { - __add_wait_queue_entry_tail(q, wait); - SetPageWaiters(page); - } + /* + * From now on, all the logic will be based on + * the WQ_FLAG_WOKEN flag, and the and the page + * bit testing (and setting) will be - or has + * already been - done by the wake function. + * + * We can drop our reference to the page. 
+ */ + if (behavior == DROP) + put_page(page); + for (;;) { set_current_state(state); - spin_unlock_irq(&q->lock); - - bit_is_set = test_bit(bit_nr, &page->flags); - if (behavior == DROP) - put_page(page); - - if (likely(bit_is_set)) - io_schedule(); - - if (behavior == EXCLUSIVE) { - if (!test_and_set_bit_lock(bit_nr, &page->flags)) - break; - } else if (behavior == SHARED) { - if (!test_bit(bit_nr, &page->flags)) - break; - } - - if (signal_pending_state(state, current)) { - ret = -EINTR; + if (signal_pending_state(state, current)) break; - } - if (behavior == DROP) { - /* - * We can no longer safely access page->flags: - * even if CONFIG_MEMORY_HOTREMOVE is not enabled, - * there is a risk of waiting forever on a page reused - * for something that keeps it locked indefinitely. - * But best check for -EINTR above before breaking. - */ + if (wait->flags & WQ_FLAG_WOKEN) break; - } + + io_schedule(); } finish_wait(q, wait); @@ -1190,7 +1206,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, * bother with signals either. */ - return ret; + return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR; } void wait_on_page_bit(struct page *page, int bit_nr) @@ -1207,6 +1223,44 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr) } EXPORT_SYMBOL(wait_on_page_bit_killable); +static int __wait_on_page_locked_async(struct page *page, + struct wait_page_queue *wait, bool set) +{ + struct wait_queue_head *q = page_waitqueue(page); + int ret = 0; + + wait->page = page; + wait->bit_nr = PG_locked; + + spin_lock_irq(&q->lock); + __add_wait_queue_entry_tail(q, &wait->wait); + SetPageWaiters(page); + if (set) + ret = !trylock_page(page); + else + ret = PageLocked(page); + /* + * If we were succesful now, we know we're still on the + * waitqueue as we're still under the lock. This means it's + * safe to remove and return success, we know the callback + * isn't going to trigger. + */ + if (!ret) + __remove_wait_queue(q, &wait->wait); + else + ret = -EIOCBQUEUED; + spin_unlock_irq(&q->lock); + return ret; +} + +static int wait_on_page_locked_async(struct page *page, + struct wait_page_queue *wait) +{ + if (!PageLocked(page)) + return 0; + return __wait_on_page_locked_async(compound_head(page), wait, false); +} + /** * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked * @page: The page to wait for. @@ -1369,6 +1423,11 @@ int __lock_page_killable(struct page *__page) } EXPORT_SYMBOL_GPL(__lock_page_killable); +int __lock_page_async(struct page *page, struct wait_page_queue *wait) +{ + return __wait_on_page_locked_async(page, wait, true); +} + /* * Return values: * 1 - page is locked; mmap_lock is still held. @@ -2028,7 +2087,7 @@ find_page: page = find_get_page(mapping, index); if (!page) { - if (iocb->ki_flags & IOCB_NOWAIT) + if (iocb->ki_flags & IOCB_NOIO) goto would_block; page_cache_sync_readahead(mapping, ra, filp, @@ -2038,22 +2097,34 @@ find_page: goto no_cached_page; } if (PageReadahead(page)) { + if (iocb->ki_flags & IOCB_NOIO) { + put_page(page); + goto out; + } page_cache_async_readahead(mapping, ra, filp, page, index, last_index - index); } if (!PageUptodate(page)) { - if (iocb->ki_flags & IOCB_NOWAIT) { - put_page(page); - goto would_block; - } - /* * See comment in do_read_cache_page on why * wait_on_page_locked is used to avoid unnecessarily * serialisations and why it's safe. 
*/ - error = wait_on_page_locked_killable(page); + if (iocb->ki_flags & IOCB_WAITQ) { + if (written) { + put_page(page); + goto out; + } + error = wait_on_page_locked_async(page, + iocb->ki_waitq); + } else { + if (iocb->ki_flags & IOCB_NOWAIT) { + put_page(page); + goto would_block; + } + error = wait_on_page_locked_killable(page); + } if (unlikely(error)) goto readpage_error; if (PageUptodate(page)) @@ -2141,7 +2212,10 @@ page_ok: page_not_up_to_date: /* Get exclusive access to the page ... */ - error = lock_page_killable(page); + if (iocb->ki_flags & IOCB_WAITQ) + error = lock_page_async(page, iocb->ki_waitq); + else + error = lock_page_killable(page); if (unlikely(error)) goto readpage_error; @@ -2160,6 +2234,11 @@ page_not_up_to_date_locked: } readpage: + if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) { + unlock_page(page); + put_page(page); + goto would_block; + } /* * A previous I/O error may have been due to temporary * failures, eg. multipath errors. @@ -2249,9 +2328,19 @@ EXPORT_SYMBOL_GPL(generic_file_buffered_read); * * This is the "read_iter()" routine for all filesystems * that can use the page cache directly. + * + * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall + * be returned when no data can be read without waiting for I/O requests + * to complete; it doesn't prevent readahead. + * + * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O + * requests shall be made for the read or for readahead. When no data + * can be read, -EAGAIN shall be returned. When readahead would be + * triggered, a partial, possibly empty read shall be returned. + * * Return: * * number of bytes copied, even for partial reads - * * negative error code if nothing was read + * * negative error code (or 0 if IOCB_NOIO) if nothing was read */ ssize_t generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) diff --git a/mm/frontswap.c b/mm/frontswap.c index bfa3a339253e..9d977b1fc016 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -446,7 +446,7 @@ static int __frontswap_shrink(unsigned long target_pages, void frontswap_shrink(unsigned long target_pages) { unsigned long pages_to_unuse = 0; - int uninitialized_var(type), ret; + int type, ret; /* * we don't want to hold swap_lock while doing a very @@ -2196,7 +2196,7 @@ static inline pte_t gup_get_pte(pte_t *ptep) */ static inline pte_t gup_get_pte(pte_t *ptep) { - return READ_ONCE(*ptep); + return ptep_get(ptep); } #endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */ @@ -2425,7 +2425,7 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, if (pte_end < end) end = pte_end; - pte = READ_ONCE(*ptep); + pte = huge_ptep_get(ptep); if (!pte_access_permitted(pte, flags & FOLL_WRITE)) return 0; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 57ece74e3aae..590111ea6975 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -45,7 +45,10 @@ int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; +#ifdef CONFIG_CMA static struct cma *hugetlb_cma[MAX_NUMNODES]; +#endif +static unsigned long hugetlb_cma_size __initdata; /* * Minimum page order among possible hugepage sizes, set to a proper value @@ -1235,9 +1238,10 @@ static void free_gigantic_page(struct page *page, unsigned int order) * If the page isn't allocated using the cma allocator, * cma_release() returns false. 
*/ - if (IS_ENABLED(CONFIG_CMA) && - cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order)) +#ifdef CONFIG_CMA + if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order)) return; +#endif free_contig_range(page_to_pfn(page), 1 << order); } @@ -1248,7 +1252,8 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, { unsigned long nr_pages = 1UL << huge_page_order(h); - if (IS_ENABLED(CONFIG_CMA)) { +#ifdef CONFIG_CMA + { struct page *page; int node; @@ -1262,6 +1267,7 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, return page; } } +#endif return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); } @@ -1593,7 +1599,7 @@ static struct address_space *_get_hugetlb_page_mapping(struct page *hpage) /* Use first found vma */ pgoff_start = page_to_pgoff(hpage); - pgoff_end = pgoff_start + hpage_nr_pages(hpage) - 1; + pgoff_end = pgoff_start + pages_per_huge_page(page_hstate(hpage)) - 1; anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff_start, pgoff_end) { struct vm_area_struct *vma = avc->vma; @@ -2571,7 +2577,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) for (i = 0; i < h->max_huge_pages; ++i) { if (hstate_is_gigantic(h)) { - if (IS_ENABLED(CONFIG_CMA) && hugetlb_cma[0]) { + if (hugetlb_cma_size) { pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n"); break; } @@ -5654,7 +5660,6 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) } #ifdef CONFIG_CMA -static unsigned long hugetlb_cma_size __initdata; static bool cma_reserve_called __initdata; static int __init cmdline_parse_hugetlb_cma(char *p) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b043c40a21d4..700f5160f3e4 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -958,6 +958,9 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, return SCAN_ADDRESS_RANGE; if (!hugepage_vma_check(vma, vma->vm_flags)) return SCAN_VMA_CHECK; + /* Anon VMA expected */ + if (!vma->anon_vma || vma->vm_ops) + return SCAN_VMA_CHECK; return 0; } @@ -2387,7 +2387,7 @@ next_mm: static void ksm_do_scan(unsigned int scan_npages) { struct rmap_item *rmap_item; - struct page *uninitialized_var(page); + struct page *page; while (scan_npages-- && likely(!freezing(current))) { cond_resched(); diff --git a/mm/list_lru.c b/mm/list_lru.c index 9222910ab1cb..e825804b3928 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -373,14 +373,14 @@ static void memcg_destroy_list_lru_node(struct list_lru_node *nlru) struct list_lru_memcg *memcg_lrus; /* * This is called when shrinker has already been unregistered, - * and nobody can use it. So, there is no need to use kvfree_rcu(). + * and nobody can use it. So, there is no need to use kvfree_rcu_local(). 
*/ memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, true); __memcg_destroy_list_lru_node(memcg_lrus, 0, memcg_nr_cache_ids); kvfree(memcg_lrus); } -static void kvfree_rcu(struct rcu_head *head) +static void kvfree_rcu_local(struct rcu_head *head) { struct list_lru_memcg *mlru; @@ -419,7 +419,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru, rcu_assign_pointer(nlru->memcg_lrus, new); spin_unlock_irq(&nlru->lock); - call_rcu(&old->rcu, kvfree_rcu); + call_rcu(&old->rcu, kvfree_rcu_local); return 0; } diff --git a/mm/maccess.c b/mm/maccess.c index 88845eda5047..f98ff91e32c6 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -6,14 +6,15 @@ #include <linux/mm.h> #include <linux/uaccess.h> -bool __weak probe_kernel_read_allowed(const void *unsafe_src, size_t size) +bool __weak copy_from_kernel_nofault_allowed(const void *unsafe_src, + size_t size) { return true; } #ifdef HAVE_GET_KERNEL_NOFAULT -#define probe_kernel_read_loop(dst, src, len, type, err_label) \ +#define copy_from_kernel_nofault_loop(dst, src, len, type, err_label) \ while (len >= sizeof(type)) { \ __get_kernel_nofault(dst, src, type, err_label); \ dst += sizeof(type); \ @@ -21,25 +22,25 @@ bool __weak probe_kernel_read_allowed(const void *unsafe_src, size_t size) len -= sizeof(type); \ } -long probe_kernel_read(void *dst, const void *src, size_t size) +long copy_from_kernel_nofault(void *dst, const void *src, size_t size) { - if (!probe_kernel_read_allowed(src, size)) + if (!copy_from_kernel_nofault_allowed(src, size)) return -ERANGE; pagefault_disable(); - probe_kernel_read_loop(dst, src, size, u64, Efault); - probe_kernel_read_loop(dst, src, size, u32, Efault); - probe_kernel_read_loop(dst, src, size, u16, Efault); - probe_kernel_read_loop(dst, src, size, u8, Efault); + copy_from_kernel_nofault_loop(dst, src, size, u64, Efault); + copy_from_kernel_nofault_loop(dst, src, size, u32, Efault); + copy_from_kernel_nofault_loop(dst, src, size, u16, Efault); + copy_from_kernel_nofault_loop(dst, src, size, u8, Efault); pagefault_enable(); return 0; Efault: pagefault_enable(); return -EFAULT; } -EXPORT_SYMBOL_GPL(probe_kernel_read); +EXPORT_SYMBOL_GPL(copy_from_kernel_nofault); -#define probe_kernel_write_loop(dst, src, len, type, err_label) \ +#define copy_to_kernel_nofault_loop(dst, src, len, type, err_label) \ while (len >= sizeof(type)) { \ __put_kernel_nofault(dst, src, type, err_label); \ dst += sizeof(type); \ @@ -47,13 +48,13 @@ EXPORT_SYMBOL_GPL(probe_kernel_read); len -= sizeof(type); \ } -long probe_kernel_write(void *dst, const void *src, size_t size) +long copy_to_kernel_nofault(void *dst, const void *src, size_t size) { pagefault_disable(); - probe_kernel_write_loop(dst, src, size, u64, Efault); - probe_kernel_write_loop(dst, src, size, u32, Efault); - probe_kernel_write_loop(dst, src, size, u16, Efault); - probe_kernel_write_loop(dst, src, size, u8, Efault); + copy_to_kernel_nofault_loop(dst, src, size, u64, Efault); + copy_to_kernel_nofault_loop(dst, src, size, u32, Efault); + copy_to_kernel_nofault_loop(dst, src, size, u16, Efault); + copy_to_kernel_nofault_loop(dst, src, size, u8, Efault); pagefault_enable(); return 0; Efault: @@ -67,7 +68,7 @@ long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count) if (unlikely(count <= 0)) return 0; - if (!probe_kernel_read_allowed(unsafe_addr, count)) + if (!copy_from_kernel_nofault_allowed(unsafe_addr, count)) return -ERANGE; pagefault_disable(); @@ -87,7 +88,7 @@ Efault: } #else /* HAVE_GET_KERNEL_NOFAULT */ /** - * 
probe_kernel_read(): safely attempt to read from kernel-space + * copy_from_kernel_nofault(): safely attempt to read from kernel-space * @dst: pointer to the buffer that shall take the data * @src: address to read from * @size: size of the data chunk @@ -98,15 +99,15 @@ Efault: * * We ensure that the copy_from_user is executed in atomic context so that * do_page_fault() doesn't attempt to take mmap_lock. This makes - * probe_kernel_read() suitable for use within regions where the caller + * copy_from_kernel_nofault() suitable for use within regions where the caller * already holds mmap_lock, or other locks which nest inside mmap_lock. */ -long probe_kernel_read(void *dst, const void *src, size_t size) +long copy_from_kernel_nofault(void *dst, const void *src, size_t size) { long ret; mm_segment_t old_fs = get_fs(); - if (!probe_kernel_read_allowed(src, size)) + if (!copy_from_kernel_nofault_allowed(src, size)) return -ERANGE; set_fs(KERNEL_DS); @@ -120,10 +121,10 @@ long probe_kernel_read(void *dst, const void *src, size_t size) return -EFAULT; return 0; } -EXPORT_SYMBOL_GPL(probe_kernel_read); +EXPORT_SYMBOL_GPL(copy_from_kernel_nofault); /** - * probe_kernel_write(): safely attempt to write to a location + * copy_to_kernel_nofault(): safely attempt to write to a location * @dst: address to write to * @src: pointer to the data that shall be written * @size: size of the data chunk @@ -131,7 +132,7 @@ EXPORT_SYMBOL_GPL(probe_kernel_read); * Safely write to address @dst from the buffer at @src. If a kernel fault * happens, handle that and return -EFAULT. */ -long probe_kernel_write(void *dst, const void *src, size_t size) +long copy_to_kernel_nofault(void *dst, const void *src, size_t size) { long ret; mm_segment_t old_fs = get_fs(); @@ -174,7 +175,7 @@ long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count) if (unlikely(count <= 0)) return 0; - if (!probe_kernel_read_allowed(unsafe_addr, count)) + if (!copy_from_kernel_nofault_allowed(unsafe_addr, count)) return -ERANGE; set_fs(KERNEL_DS); @@ -193,7 +194,7 @@ long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count) #endif /* HAVE_GET_KERNEL_NOFAULT */ /** - * probe_user_read(): safely attempt to read from a user-space location + * copy_from_user_nofault(): safely attempt to read from a user-space location * @dst: pointer to the buffer that shall take the data * @src: address to read from. This must be a user address. * @size: size of the data chunk @@ -201,7 +202,7 @@ long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count) * Safely read from user address @src to the buffer at @dst. If a kernel fault * happens, handle that and return -EFAULT. */ -long probe_user_read(void *dst, const void __user *src, size_t size) +long copy_from_user_nofault(void *dst, const void __user *src, size_t size) { long ret = -EFAULT; mm_segment_t old_fs = get_fs(); @@ -218,10 +219,10 @@ long probe_user_read(void *dst, const void __user *src, size_t size) return -EFAULT; return 0; } -EXPORT_SYMBOL_GPL(probe_user_read); +EXPORT_SYMBOL_GPL(copy_from_user_nofault); /** - * probe_user_write(): safely attempt to write to a user-space location + * copy_to_user_nofault(): safely attempt to write to a user-space location * @dst: address to write to * @src: pointer to the data that shall be written * @size: size of the data chunk @@ -229,7 +230,7 @@ EXPORT_SYMBOL_GPL(probe_user_read); * Safely write to address @dst from the buffer at @src. If a kernel fault * happens, handle that and return -EFAULT. 
*/ -long probe_user_write(void __user *dst, const void *src, size_t size) +long copy_to_user_nofault(void __user *dst, const void *src, size_t size) { long ret = -EFAULT; mm_segment_t old_fs = get_fs(); @@ -246,7 +247,7 @@ long probe_user_write(void __user *dst, const void *src, size_t size) return -EFAULT; return 0; } -EXPORT_SYMBOL_GPL(probe_user_write); +EXPORT_SYMBOL_GPL(copy_to_user_nofault); /** * strncpy_from_user_nofault: - Copy a NUL terminated string from unsafe user diff --git a/mm/memblock.c b/mm/memblock.c index 39aceafc57f6..45f198750be9 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -44,19 +44,20 @@ * in the system, for instance when the memory is restricted with * ``mem=`` command line parameter * * ``reserved`` - describes the regions that were allocated - * * ``physmap`` - describes the actual physical memory regardless of - * the possible restrictions; the ``physmap`` type is only available - * on some architectures. + * * ``physmem`` - describes the actual physical memory available during + * boot regardless of the possible restrictions and memory hot(un)plug; + * the ``physmem`` type is only available on some architectures. * * Each region is represented by :c:type:`struct memblock_region` that * defines the region extents, its attributes and NUMA node id on NUMA * systems. Every memory type is described by the :c:type:`struct * memblock_type` which contains an array of memory regions along with - * the allocator metadata. The memory types are nicely wrapped with - * :c:type:`struct memblock`. This structure is statically initialzed - * at build time. The region arrays for the "memory" and "reserved" - * types are initially sized to %INIT_MEMBLOCK_REGIONS and for the - * "physmap" type to %INIT_PHYSMEM_REGIONS. + * the allocator metadata. The "memory" and "reserved" types are nicely + * wrapped with :c:type:`struct memblock`. This structure is statically + * initialized at build time. The region arrays are initially sized to + * %INIT_MEMBLOCK_REGIONS for "memory" and %INIT_MEMBLOCK_RESERVED_REGIONS + * for "reserved". The region array for "physmem" is initially sized to + * %INIT_PHYSMEM_REGIONS. * The memblock_allow_resize() enables automatic resizing of the region * arrays during addition of new regions. This feature should be used * with care so that memory allocated for the region array will not @@ -87,8 +88,8 @@ * function frees all the memory to the buddy page allocator. * * Unless an architecture enables %CONFIG_ARCH_KEEP_MEMBLOCK, the - * memblock data structures will be discarded after the system - * initialization completes. + * memblock data structures (except "physmem") will be discarded after the + * system initialization completes. 
*/ #ifndef CONFIG_NEED_MULTIPLE_NODES @@ -104,7 +105,7 @@ unsigned long long max_possible_pfn; static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock; #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP -static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock; +static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS]; #endif struct memblock memblock __initdata_memblock = { @@ -118,17 +119,19 @@ struct memblock memblock __initdata_memblock = { .reserved.max = INIT_MEMBLOCK_RESERVED_REGIONS, .reserved.name = "reserved", -#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP - .physmem.regions = memblock_physmem_init_regions, - .physmem.cnt = 1, /* empty dummy entry */ - .physmem.max = INIT_PHYSMEM_REGIONS, - .physmem.name = "physmem", -#endif - .bottom_up = false, .current_limit = MEMBLOCK_ALLOC_ANYWHERE, }; +#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP +struct memblock_type physmem = { + .regions = memblock_physmem_init_regions, + .cnt = 1, /* empty dummy entry */ + .max = INIT_PHYSMEM_REGIONS, + .name = "physmem", +}; +#endif + int memblock_debug __initdata_memblock; static bool system_has_some_mirror __initdata_memblock = false; static int memblock_can_resize __initdata_memblock; @@ -838,7 +841,7 @@ int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size) memblock_dbg("%s: [%pa-%pa] %pS\n", __func__, &base, &end, (void *)_RET_IP_); - return memblock_add_range(&memblock.physmem, base, size, MAX_NUMNODES, 0); + return memblock_add_range(&physmem, base, size, MAX_NUMNODES, 0); } #endif @@ -1019,12 +1022,10 @@ static bool should_skip_region(struct memblock_region *m, int nid, int flags) * As both region arrays are sorted, the function advances the two indices * in lockstep and returns each intersection. 
*/ -void __init_memblock __next_mem_range(u64 *idx, int nid, - enum memblock_flags flags, - struct memblock_type *type_a, - struct memblock_type *type_b, - phys_addr_t *out_start, - phys_addr_t *out_end, int *out_nid) +void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags, + struct memblock_type *type_a, + struct memblock_type *type_b, phys_addr_t *out_start, + phys_addr_t *out_end, int *out_nid) { int idx_a = *idx & 0xffffffff; int idx_b = *idx >> 32; @@ -1924,7 +1925,7 @@ void __init_memblock __memblock_dump_all(void) memblock_dump(&memblock.memory); memblock_dump(&memblock.reserved); #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP - memblock_dump(&memblock.physmem); + memblock_dump(&physmem); #endif } @@ -2064,8 +2065,8 @@ static int __init memblock_init_debugfs(void) debugfs_create_file("reserved", 0444, root, &memblock.reserved, &memblock_debug_fops); #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP - debugfs_create_file("physmem", 0444, root, - &memblock.physmem, &memblock_debug_fops); + debugfs_create_file("physmem", 0444, root, &physmem, + &memblock_debug_fops); #endif return 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0b38b6ad547d..8cc617ede7e2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1004,7 +1004,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, struct mem_cgroup_reclaim_cookie *reclaim) { - struct mem_cgroup_reclaim_iter *uninitialized_var(iter); + struct mem_cgroup_reclaim_iter *iter; struct cgroup_subsys_state *css = NULL; struct mem_cgroup *memcg = NULL; struct mem_cgroup *pos = NULL; @@ -2772,8 +2772,10 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, return; cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN); - if (!cw) + if (!cw) { + css_put(&memcg->css); return; + } cw->memcg = memcg; cw->cachep = cachep; @@ -5667,7 +5669,6 @@ static void __mem_cgroup_clear_mc(void) if (!mem_cgroup_is_root(mc.to)) page_counter_uncharge(&mc.to->memory, mc.moved_swap); - mem_cgroup_id_get_many(mc.to, mc.moved_swap); css_put_many(&mc.to->css, mc.moved_swap); mc.moved_swap = 0; @@ -5858,7 +5859,8 @@ put: /* get_mctgt_type() gets the page */ ent = target.ent; if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { mc.precharge--; - /* we fixup refcnts and charges later. */ + mem_cgroup_id_get_many(mc.to, 1); + /* we fixup other refcnts and charges later. */ mc.moved_swap++; } break; @@ -6360,11 +6362,16 @@ static unsigned long effective_protection(unsigned long usage, * We're using unprotected memory for the weight so that if * some cgroups DO claim explicit protection, we don't protect * the same bytes twice. + * + * Check both usage and parent_usage against the respective + * protected values. One should imply the other, but they + * aren't read atomically - make sure the division is sane. 
*/ if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)) return ep; - - if (parent_effective > siblings_protected && usage > protected) { + if (parent_effective > siblings_protected && + parent_usage > siblings_protected && + usage > protected) { unsigned long unclaimed; unclaimed = parent_effective - siblings_protected; @@ -6416,7 +6423,7 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, if (parent == root) { memcg->memory.emin = READ_ONCE(memcg->memory.min); - memcg->memory.elow = memcg->memory.low; + memcg->memory.elow = READ_ONCE(memcg->memory.low); goto out; } @@ -6428,7 +6435,8 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, atomic_long_read(&parent->memory.children_min_usage))); WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage, - memcg->memory.low, READ_ONCE(parent->memory.elow), + READ_ONCE(memcg->memory.low), + READ_ONCE(parent->memory.elow), atomic_long_read(&parent->memory.children_low_usage))); out: @@ -7178,6 +7186,13 @@ static struct cftype memsw_files[] = { { }, /* terminate */ }; +/* + * If mem_cgroup_swap_init() is implemented as a subsys_initcall() + * instead of a core_initcall(), this could mean cgroup_memory_noswap still + * remains set to false even when memcg is disabled via "cgroup_disable=memory" + * boot parameter. This may result in premature OOPS inside + * mem_cgroup_get_nr_swap_pages() function in corner cases. + */ static int __init mem_cgroup_swap_init(void) { /* No memory control -> no swap control */ @@ -7192,6 +7207,6 @@ static int __init mem_cgroup_swap_init(void) return 0; } -subsys_initcall(mem_cgroup_swap_init); +core_initcall(mem_cgroup_swap_init); #endif /* CONFIG_MEMCG_SWAP */ diff --git a/mm/memory.c b/mm/memory.c index dc7f3543b1fd..0da48f6586f8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -437,7 +437,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd) * of a chain of data-dependent loads, meaning most CPUs (alpha * being the notable exception) will already guarantee loads are * seen in-order. See the alpha page table accessors for the - * smp_read_barrier_depends() barriers in page table walking code. + * smp_rmb() barriers in page table walking code. */ smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ @@ -1498,7 +1498,7 @@ out: } #ifdef pte_index -static int insert_page_in_batch_locked(struct mm_struct *mm, pmd_t *pmd, +static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { int err; @@ -1506,8 +1506,9 @@ static int insert_page_in_batch_locked(struct mm_struct *mm, pmd_t *pmd, if (!page_count(page)) return -EINVAL; err = validate_page_before_insert(page); - return err ? 
err : insert_page_into_pte_locked( - mm, pte_offset_map(pmd, addr), addr, page, prot); + if (err) + return err; + return insert_page_into_pte_locked(mm, pte, addr, page, prot); } /* insert_pages() amortizes the cost of spinlock operations @@ -1517,7 +1518,8 @@ static int insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num, pgprot_t prot) { pmd_t *pmd = NULL; - spinlock_t *pte_lock = NULL; + pte_t *start_pte, *pte; + spinlock_t *pte_lock; struct mm_struct *const mm = vma->vm_mm; unsigned long curr_page_idx = 0; unsigned long remaining_pages_total = *num; @@ -1536,18 +1538,17 @@ more: ret = -ENOMEM; if (pte_alloc(mm, pmd)) goto out; - pte_lock = pte_lockptr(mm, pmd); while (pages_to_write_in_pmd) { int pte_idx = 0; const int batch_size = min_t(int, pages_to_write_in_pmd, 8); - spin_lock(pte_lock); - for (; pte_idx < batch_size; ++pte_idx) { - int err = insert_page_in_batch_locked(mm, pmd, + start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock); + for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) { + int err = insert_page_in_batch_locked(mm, pte, addr, pages[curr_page_idx], prot); if (unlikely(err)) { - spin_unlock(pte_lock); + pte_unmap_unlock(start_pte, pte_lock); ret = err; remaining_pages_total -= pte_idx; goto out; @@ -1555,7 +1556,7 @@ more: addr += PAGE_SIZE; ++curr_page_idx; } - spin_unlock(pte_lock); + pte_unmap_unlock(start_pte, pte_lock); pages_to_write_in_pmd -= batch_size; remaining_pages_total -= batch_size; } @@ -1600,7 +1601,7 @@ int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, return insert_pages(vma, addr, pages, num, vma->vm_page_prot); #else unsigned long idx = 0, pgcount = *num; - int err; + int err = -EINVAL; for (; idx < pgcount; ++idx) { err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]); @@ -2204,7 +2205,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, { pte_t *pte; int err = 0; - spinlock_t *uninitialized_var(ptl); + spinlock_t *ptl; if (create) { pte = (mm == &init_mm) ? @@ -3140,8 +3141,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) err = mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL); ClearPageSwapCache(page); - if (err) + if (err) { + ret = VM_FAULT_OOM; goto out_page; + } + + /* + * XXX: Move to lru_cache_add() when it + * supports new vs putback + */ + spin_lock_irq(&page_pgdat(page)->lru_lock); + lru_note_cost_page(page); + spin_unlock_irq(&page_pgdat(page)->lru_lock); lru_cache_add(page); swap_readpage(page, true); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9b34e03e730a..da374cd3d45b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -471,11 +471,20 @@ void __ref remove_pfn_range_from_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { + const unsigned long end_pfn = start_pfn + nr_pages; struct pglist_data *pgdat = zone->zone_pgdat; - unsigned long flags; + unsigned long pfn, cur_nr_pages, flags; /* Poison struct pages because they are now uninitialized again. 
*/ - page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages); + for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) { + cond_resched(); + + /* Select all remaining pages up to the next section boundary */ + cur_nr_pages = + min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn); + page_init_poison(pfn_to_page(pfn), + sizeof(struct page) * cur_nr_pages); + } #ifdef CONFIG_ZONE_DEVICE /* diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 381320671677..b9e85d467352 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1234,7 +1234,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, static struct page *new_page(struct page *page, unsigned long start) { struct vm_area_struct *vma; - unsigned long uninitialized_var(address); + unsigned long address; vma = find_vma(current->mm, start); while (vma) { @@ -1629,7 +1629,7 @@ static int kernel_get_mempolicy(int __user *policy, unsigned long flags) { int err; - int uninitialized_var(pval); + int pval; nodemask_t nodes; addr = untagged_addr(addr); diff --git a/mm/migrate.c b/mm/migrate.c index f37729673558..40cd7016ae6f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1161,21 +1161,10 @@ out: } /* - * gcc 4.7 and 4.8 on arm get an ICEs when inlining unmap_and_move(). Work - * around it. - */ -#if defined(CONFIG_ARM) && \ - defined(GCC_VERSION) && GCC_VERSION < 40900 && GCC_VERSION >= 40700 -#define ICE_noinline noinline -#else -#define ICE_noinline -#endif - -/* * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. */ -static ICE_noinline int unmap_and_move(new_page_t get_new_page, +static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, unsigned long private, struct page *page, int force, enum migrate_mode mode, diff --git a/mm/mmap.c b/mm/mmap.c index 59a4682ebf3f..dcdab2675a21 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2620,7 +2620,7 @@ static void unmap_region(struct mm_struct *mm, * Create a list of vma's touched by the unmap, removing them from the mm's * vma list as we go.. */ -static void +static bool detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long end) { @@ -2645,6 +2645,17 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, /* Kill the cache */ vmacache_invalidate(mm); + + /* + * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or + * VM_GROWSUP VMA. Such VMAs can change their size under + * down_read(mmap_lock) and collide with the VMA we are about to unmap. + */ + if (vma && (vma->vm_flags & VM_GROWSDOWN)) + return false; + if (prev && (prev->vm_flags & VM_GROWSUP)) + return false; + return true; } /* @@ -2825,7 +2836,8 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, } /* Detach vmas from rbtree */ - detach_vmas_to_be_unmapped(mm, vma, prev, end); + if (!detach_vmas_to_be_unmapped(mm, vma, prev, end)) + downgrade = false; if (downgrade) mmap_write_downgrade(mm); @@ -3159,6 +3171,7 @@ void exit_mmap(struct mm_struct *mm) if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); vma = remove_vma(vma); + cond_resched(); } vm_unacct_memory(nr_accounted); } diff --git a/mm/mremap.c b/mm/mremap.c index 5dd572d57ca9..6b153dc05fe4 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -206,9 +206,28 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, /* * The destination pmd shouldn't be established, free_pgtables() - * should have release it. + * should have released it. 
+ * + * However, there's a case during execve() where we use mremap + * to move the initial stack, and in that case the target area + * may overlap the source area (always moving down). + * + * If everything is PMD-aligned, that works fine, as moving + * each pmd down will clear the source pmd. But if we first + * have a few 4kB-only pages that get moved down, and then + * hit the "now the rest is PMD-aligned, let's do everything + * one pmd at a time", we will still have the old (now empty + * of any 4kB pages, but still there) PMD in the page table + * tree. + * + * Warn on it once - because we really should try to figure + * out how to do this better - but then say "I won't move + * this pmd". + * + * One alternative might be to just unmap the target pmd at + * this point, and verify that it really is empty. We'll see. */ - if (WARN_ON(!pmd_none(*new_pmd))) + if (WARN_ON_ONCE(!pmd_none(*new_pmd))) return false; /* diff --git a/mm/nommu.c b/mm/nommu.c index 64539971188b..314174817b04 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -291,23 +291,6 @@ void *vzalloc_node(unsigned long size, int node) EXPORT_SYMBOL(vzalloc_node); /** - * vmalloc_exec - allocate virtually contiguous, executable memory - * @size: allocation size - * - * Kernel-internal function to allocate enough pages to cover @size - * the page level allocator and map them into contiguous and - * executable kernel virtual space. - * - * For tight control over page level allocator and protection flags - * use __vmalloc() instead. - */ - -void *vmalloc_exec(unsigned long size) -{ - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM); -} - -/** * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) * @size: allocation size * diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 48eb0f1410d4..901a21f61d68 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -961,7 +961,7 @@ static inline void __free_one_page(struct page *page, int migratetype, bool report) { struct capture_control *capc = task_capc(zone); - unsigned long uninitialized_var(buddy_pfn); + unsigned long buddy_pfn; unsigned long combined_pfn; unsigned int max_order; struct page *buddy; @@ -7832,7 +7832,7 @@ void setup_per_zone_wmarks(void) * Initialise min_free_kbytes. * * For small machines we want it small (128k min). For large machines - * we want it large (64MB max). But it is not linear, because network + * we want it large (256MB max). But it is not linear, because network * bandwidth does not increase linearly with machine size. 
We use * * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: diff --git a/mm/page_io.c b/mm/page_io.c index e8726f3e3820..ccda76790088 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -277,6 +277,23 @@ static inline void count_swpout_vm_event(struct page *page) count_vm_events(PSWPOUT, hpage_nr_pages(page)); } +#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) +static void bio_associate_blkg_from_page(struct bio *bio, struct page *page) +{ + struct cgroup_subsys_state *css; + + if (!page->mem_cgroup) + return; + + rcu_read_lock(); + css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys); + bio_associate_blkg_from_css(bio, css); + rcu_read_unlock(); +} +#else +#define bio_associate_blkg_from_page(bio, page) do { } while (0) +#endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */ + int __swap_writepage(struct page *page, struct writeback_control *wbc, bio_end_io_t end_write_func) { diff --git a/mm/percpu.c b/mm/percpu.c index 696367b18222..b626766160ce 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2513,7 +2513,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( const size_t static_size = __per_cpu_end - __per_cpu_start; int nr_groups = 1, nr_units = 0; size_t size_sum, min_unit_size, alloc_size; - int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ + int upa, max_upa, best_upa; /* units_per_alloc */ int last_allocs, group, unit; unsigned int cpu, tcpu; struct pcpu_alloc_info *ai; diff --git a/mm/rodata_test.c b/mm/rodata_test.c index 5e313fa93276..2a99df7beeb3 100644 --- a/mm/rodata_test.c +++ b/mm/rodata_test.c @@ -25,7 +25,7 @@ void rodata_test(void) } /* test 2: write to the variable; this should fault */ - if (!probe_kernel_write((void *)&rodata_test_data, + if (!copy_to_kernel_nofault((void *)&rodata_test_data, (void *)&zero, sizeof(zero))) { pr_err("test data was not read only\n"); return; diff --git a/mm/shmem.c b/mm/shmem.c index a0dbe62f8042..b2abca3f7f33 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3178,7 +3178,7 @@ static int shmem_initxattrs(struct inode *inode, new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, GFP_KERNEL); if (!new_xattr->name) { - kfree(new_xattr); + kvfree(new_xattr); return -ENOMEM; } diff --git a/mm/slab.h b/mm/slab.h index 207c83ef6e06..74f7e09a7cfd 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -348,7 +348,7 @@ static __always_inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, struct kmem_cache *s) { - unsigned int nr_pages = 1 << order; + int nr_pages = 1 << order; struct mem_cgroup *memcg; struct lruvec *lruvec; int ret; @@ -388,7 +388,7 @@ out: static __always_inline void memcg_uncharge_slab(struct page *page, int order, struct kmem_cache *s) { - unsigned int nr_pages = 1 << order; + int nr_pages = 1 << order; struct mem_cgroup *memcg; struct lruvec *lruvec; diff --git a/mm/slab_common.c b/mm/slab_common.c index 9e72ba224175..fe8b68482670 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -326,6 +326,14 @@ int slab_unmergeable(struct kmem_cache *s) if (s->refcount < 0) return 1; +#ifdef CONFIG_MEMCG_KMEM + /* + * Skip the dying kmem_cache. 
+ */ + if (s->memcg_params.dying) + return 1; +#endif + return 0; } @@ -886,12 +894,15 @@ static int shutdown_memcg_caches(struct kmem_cache *s) return 0; } -static void flush_memcg_workqueue(struct kmem_cache *s) +static void memcg_set_kmem_cache_dying(struct kmem_cache *s) { spin_lock_irq(&memcg_kmem_wq_lock); s->memcg_params.dying = true; spin_unlock_irq(&memcg_kmem_wq_lock); +} +static void flush_memcg_workqueue(struct kmem_cache *s) +{ /* * SLAB and SLUB deactivate the kmem_caches through call_rcu. Make * sure all registered rcu callbacks have been invoked. @@ -923,10 +934,6 @@ static inline int shutdown_memcg_caches(struct kmem_cache *s) { return 0; } - -static inline void flush_memcg_workqueue(struct kmem_cache *s) -{ -} #endif /* CONFIG_MEMCG_KMEM */ void slab_kmem_cache_release(struct kmem_cache *s) @@ -944,8 +951,6 @@ void kmem_cache_destroy(struct kmem_cache *s) if (unlikely(!s)) return; - flush_memcg_workqueue(s); - get_online_cpus(); get_online_mems(); @@ -955,6 +960,22 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->refcount) goto out_unlock; +#ifdef CONFIG_MEMCG_KMEM + memcg_set_kmem_cache_dying(s); + + mutex_unlock(&slab_mutex); + + put_online_mems(); + put_online_cpus(); + + flush_memcg_workqueue(s); + + get_online_cpus(); + get_online_mems(); + + mutex_lock(&slab_mutex); +#endif + err = shutdown_memcg_caches(s); if (!err) err = shutdown_cache(s); @@ -1726,7 +1747,7 @@ void kzfree(const void *p) if (unlikely(ZERO_OR_NULL_PTR(mem))) return; ks = ksize(mem); - memset(mem, 0, ks); + memzero_explicit(mem, ks); kfree(mem); } EXPORT_SYMBOL(kzfree); diff --git a/mm/slub.c b/mm/slub.c index b8f798b50d44..f226d66408ee 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -292,7 +292,7 @@ static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) return get_freepointer(s, object); freepointer_addr = (unsigned long)object + s->offset; - probe_kernel_read(&p, (void **)freepointer_addr, sizeof(p)); + copy_from_kernel_nofault(&p, (void **)freepointer_addr, sizeof(p)); return freelist_ptr(s, p, freepointer_addr); } @@ -1218,7 +1218,7 @@ static noinline int free_debug_processing( struct kmem_cache_node *n = get_node(s, page_to_nid(page)); void *object = head; int cnt = 0; - unsigned long uninitialized_var(flags); + unsigned long flags; int ret = 0; spin_lock_irqsave(&n->list_lock, flags); @@ -2901,7 +2901,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, struct page new; unsigned long counters; struct kmem_cache_node *n = NULL; - unsigned long uninitialized_var(flags); + unsigned long flags; stat(s, FREE_SLOWPATH); @@ -3766,15 +3766,13 @@ error: } static void list_slab_objects(struct kmem_cache *s, struct page *page, - const char *text, unsigned long *map) + const char *text) { #ifdef CONFIG_SLUB_DEBUG void *addr = page_address(page); + unsigned long *map; void *p; - if (!map) - return; - slab_err(s, page, text, s->name); slab_lock(page); @@ -3786,6 +3784,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, print_tracking(s, p); } } + put_map(map); slab_unlock(page); #endif } @@ -3799,11 +3798,6 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { LIST_HEAD(discard); struct page *page, *h; - unsigned long *map = NULL; - -#ifdef CONFIG_SLUB_DEBUG - map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL); -#endif BUG_ON(irqs_disabled()); spin_lock_irq(&n->list_lock); @@ -3813,16 +3807,11 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) list_add(&page->slab_list, &discard); } else { 
list_slab_objects(s, page, - "Objects remaining in %s on __kmem_cache_shutdown()", - map); + "Objects remaining in %s on __kmem_cache_shutdown()"); } } spin_unlock_irq(&n->list_lock); -#ifdef CONFIG_SLUB_DEBUG - bitmap_free(map); -#endif - list_for_each_entry_safe(page, h, &discard, slab_list) discard_slab(s, page); } diff --git a/mm/swap.c b/mm/swap.c index dbcab84c6fce..de257c0a89b1 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -443,8 +443,7 @@ void mark_page_accessed(struct page *page) else __lru_cache_activate_page(page); ClearPageReferenced(page); - if (page_is_file_lru(page)) - workingset_activation(page); + workingset_activation(page); } if (page_is_idle(page)) clear_page_idle(page); @@ -831,8 +830,8 @@ void release_pages(struct page **pages, int nr) LIST_HEAD(pages_to_free); struct pglist_data *locked_pgdat = NULL; struct lruvec *lruvec; - unsigned long uninitialized_var(flags); - unsigned int uninitialized_var(lock_batch); + unsigned long flags; + unsigned int lock_batch; for (i = 0; i < nr; i++) { struct page *page = pages[i]; diff --git a/mm/swap_state.c b/mm/swap_state.c index e98ff460e9e9..05889e8e3c97 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -21,7 +21,7 @@ #include <linux/vmalloc.h> #include <linux/swap_slots.h> #include <linux/huge_mm.h> - +#include "internal.h" /* * swapper_space is a fiction, retained to simplify the path through @@ -429,7 +429,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, __SetPageSwapBacked(page); /* May fail (-ENOMEM) if XArray node allocation failed. */ - if (add_to_swap_cache(page, entry, gfp_mask & GFP_KERNEL)) { + if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK)) { put_swap_page(page, entry); goto fail_unlock; } diff --git a/mm/swapfile.c b/mm/swapfile.c index 987276c557d1..6c26916e95fd 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2929,7 +2929,7 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) * write only restriction. Hence zoned block devices are not * suitable for swapping. Disallow them here. */ - if (blk_queue_is_zoned(p->bdev->bd_queue)) + if (blk_queue_is_zoned(p->bdev->bd_disk->queue)) return -EINVAL; p->flags |= SWP_BLKDEV; } else if (S_ISREG(inode->i_mode)) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 3091c2ca60df..5a2b55c8dd9a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1862,7 +1862,6 @@ EXPORT_SYMBOL(vm_unmap_ram); * @pages: an array of pointers to the pages to be mapped * @count: number of pages * @node: prefer to allocate data structures on this node - * @prot: memory protection to use. PAGE_KERNEL for regular RAM * * If you use this function for less than VMAP_MAX_ALLOC pages, it could be * faster than vmap so it's good. But if you mix long-life and short-life @@ -2696,26 +2695,6 @@ void *vzalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vzalloc_node); -/** - * vmalloc_exec - allocate virtually contiguous, executable memory - * @size: allocation size - * - * Kernel-internal function to allocate enough pages to cover @size - * the page level allocator and map them into contiguous and - * executable kernel virtual space. - * - * For tight control over page level allocator and protection flags - * use __vmalloc() instead. 
- * - * Return: pointer to the allocated memory or %NULL on error - */ -void *vmalloc_exec(unsigned long size) -{ - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, - NUMA_NO_NODE, __builtin_return_address(0)); -} - #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) diff --git a/mm/vmscan.c b/mm/vmscan.c index b6d84326bdf2..749d239c62b2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -904,6 +904,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, __delete_from_swap_cache(page, swap); xa_unlock_irqrestore(&mapping->i_pages, flags); put_swap_page(page, swap); + workingset_eviction(page, target_memcg); } else { void (*freepage)(struct page *); void *shadow = NULL; @@ -1884,6 +1885,8 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, list_add(&page->lru, &pages_to_free); } else { nr_moved += nr_pages; + if (PageActive(page)) + workingset_age_nonresident(lruvec, nr_pages); } } diff --git a/mm/workingset.c b/mm/workingset.c index d481ea452eeb..50b7937bab32 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -156,8 +156,8 @@ * * Implementation * - * For each node's file LRU lists, a counter for inactive evictions - * and activations is maintained (node->inactive_age). + * For each node's LRU lists, a counter for inactive evictions and + * activations is maintained (node->nonresident_age). * * On eviction, a snapshot of this counter (along with some bits to * identify the node) is stored in the now empty page cache @@ -213,7 +213,17 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, *workingsetp = workingset; } -static void advance_inactive_age(struct mem_cgroup *memcg, pg_data_t *pgdat) +/** + * workingset_age_nonresident - age non-resident entries as LRU ages + * @memcg: the lruvec that was aged + * @nr_pages: the number of pages to count + * + * As in-memory pages are aged, non-resident pages need to be aged as + * well, in order for the refault distances later on to be comparable + * to the in-memory dimensions. This function allows reclaim and LRU + * operations to drive the non-resident aging along in parallel. + */ +void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages) { /* * Reclaiming a cgroup means reclaiming all its children in a @@ -227,11 +237,8 @@ static void advance_inactive_age(struct mem_cgroup *memcg, pg_data_t *pgdat) * the root cgroup's, age as well. 
*/ do { - struct lruvec *lruvec; - - lruvec = mem_cgroup_lruvec(memcg, pgdat); - atomic_long_inc(&lruvec->inactive_age); - } while (memcg && (memcg = parent_mem_cgroup(memcg))); + atomic_long_add(nr_pages, &lruvec->nonresident_age); + } while ((lruvec = parent_lruvec(lruvec))); } /** @@ -254,12 +261,11 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); - advance_inactive_age(page_memcg(page), pgdat); - lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + workingset_age_nonresident(lruvec, hpage_nr_pages(page)); /* XXX: target_memcg can be NULL, go through lruvec */ memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); - eviction = atomic_long_read(&lruvec->inactive_age); + eviction = atomic_long_read(&lruvec->nonresident_age); return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); } @@ -309,20 +315,20 @@ void workingset_refault(struct page *page, void *shadow) if (!mem_cgroup_disabled() && !eviction_memcg) goto out; eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); - refault = atomic_long_read(&eviction_lruvec->inactive_age); + refault = atomic_long_read(&eviction_lruvec->nonresident_age); /* * Calculate the refault distance * * The unsigned subtraction here gives an accurate distance - * across inactive_age overflows in most cases. There is a + * across nonresident_age overflows in most cases. There is a * special case: usually, shadow entries have a short lifetime * and are either refaulted or reclaimed along with the inode * before they get too old. But it is not impossible for the - * inactive_age to lap a shadow entry in the field, which can - * then result in a false small refault distance, leading to a - * false activation should this old entry actually refault - * again. However, earlier kernels used to deactivate + * nonresident_age to lap a shadow entry in the field, which + * can then result in a false small refault distance, leading + * to a false activation should this old entry actually + * refault again. However, earlier kernels used to deactivate * unconditionally with *every* reclaim invocation for the * longest time, so the occasional inappropriate activation * leading to pressure on the active list is not a problem. @@ -359,7 +365,7 @@ void workingset_refault(struct page *page, void *shadow) goto out; SetPageActive(page); - advance_inactive_age(memcg, pgdat); + workingset_age_nonresident(lruvec, hpage_nr_pages(page)); inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); /* Page was active prior to eviction */ @@ -382,6 +388,7 @@ out: void workingset_activation(struct page *page) { struct mem_cgroup *memcg; + struct lruvec *lruvec; rcu_read_lock(); /* @@ -394,7 +401,8 @@ void workingset_activation(struct page *page) memcg = page_memcg_rcu(page); if (!mem_cgroup_disabled() && !memcg) goto out; - advance_inactive_age(memcg, page_pgdat(page)); + lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); + workingset_age_nonresident(lruvec, hpage_nr_pages(page)); out: rcu_read_unlock(); } |
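The kzfree() hunk above swaps memset() for memzero_explicit(): a plain memset() on a buffer that is about to be freed is a dead store the compiler may legally drop, leaving the sensitive data in memory. A minimal userspace sketch of the same idea follows; zero_explicit() and its asm barrier are illustrative stand-ins, not the kernel's memzero_explicit() implementation.

#include <stddef.h>
#include <stdlib.h>
#include <string.h>

/* Force the zeroing to survive dead-store elimination: the empty asm
 * with a "memory" clobber tells the compiler the buffer may still be
 * observed after the memset(), so the store cannot be removed. */
static void zero_explicit(void *p, size_t n)
{
	memset(p, 0, n);
	__asm__ __volatile__("" : : "r" (p) : "memory");
}

int main(void)
{
	char *secret = malloc(64);

	if (!secret)
		return 1;
	strcpy(secret, "hunter2");
	zero_explicit(secret, 64);	/* wipe before the memory is released */
	free(secret);
	return 0;
}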
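The swap_state.c hunk masks the caller's gfp_mask with GFP_RECLAIM_MASK instead of GFP_KERNEL: ANDing with GFP_KERNEL keeps only the bits that happen to be set in that constant and silently strips every other modifier the caller passed. A small sketch of that masking pitfall, using made-up flag values rather than the real gfp definitions:

#include <stdio.h>

/* Illustrative flag values only -- not the kernel's gfp bits. */
#define FAKE_GFP_IO           0x01u
#define FAKE_GFP_FS           0x02u
#define FAKE_GFP_NOWARN       0x04u
#define FAKE_GFP_KERNEL       (FAKE_GFP_IO | FAKE_GFP_FS)
#define FAKE_GFP_RECLAIM_MASK (FAKE_GFP_IO | FAKE_GFP_FS | FAKE_GFP_NOWARN)

int main(void)
{
	unsigned int caller = FAKE_GFP_IO | FAKE_GFP_NOWARN;

	/* Masking with the "kernel" constant drops the NOWARN modifier. */
	printf("& FAKE_GFP_KERNEL       -> %#x\n", caller & FAKE_GFP_KERNEL);
	/* Masking with the reclaim mask preserves the caller's intent. */
	printf("& FAKE_GFP_RECLAIM_MASK -> %#x\n", caller & FAKE_GFP_RECLAIM_MASK);
	return 0;
}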
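workingset_age_nonresident() above replaces the per-memcg walk with a walk up the lruvec hierarchy via parent_lruvec(), bumping nonresident_age on the starting lruvec and every ancestor. A simplified userspace model of that propagation, with stand-in types instead of the kernel's lruvec and atomic counters:

#include <stdio.h>

/* Stand-in for struct lruvec: just a parent pointer and the counter. */
struct fake_lruvec {
	struct fake_lruvec *parent;
	unsigned long nonresident_age;
};

/* Mirror of the do/while walk: the aging applies to the starting lruvec
 * and to every ancestor, so a parent's clock advances whenever any of
 * its children age pages. */
static void age_nonresident(struct fake_lruvec *lruvec, unsigned long nr_pages)
{
	do {
		lruvec->nonresident_age += nr_pages; /* atomic_long_add() in the kernel */
	} while ((lruvec = lruvec->parent));
}

int main(void)
{
	struct fake_lruvec root = { .parent = NULL, .nonresident_age = 0 };
	struct fake_lruvec child = { .parent = &root, .nonresident_age = 0 };

	age_nonresident(&child, 4);
	printf("child=%lu root=%lu\n", child.nonresident_age, root.nonresident_age);
	return 0;
}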
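The comment in workingset_refault() notes that the unsigned subtraction stays accurate across nonresident_age overflows as long as the counter has not lapped the shadow entry. A tiny demonstration of that wrap-around arithmetic (plain unsigned longs here; the kernel additionally masks the eviction value packed into the shadow entry):

#include <stdio.h>

int main(void)
{
	/* Snapshot taken shortly before the counter wraps around... */
	unsigned long eviction = (unsigned long)-10;
	/* ...and the counter read again after it has wrapped. */
	unsigned long nonresident_age = 5;
	/* Modular arithmetic still yields the true distance of 15. */
	unsigned long refault_distance = nonresident_age - eviction;

	printf("refault distance = %lu\n", refault_distance);
	return 0;
}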