diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 1 | ||||
-rw-r--r-- | mm/Makefile | 3 | ||||
-rw-r--r-- | mm/backing-dev.c | 5 | ||||
-rw-r--r-- | mm/debug.c | 4 | ||||
-rw-r--r-- | mm/fadvise.c | 81 | ||||
-rw-r--r-- | mm/gup_benchmark.c | 3 | ||||
-rw-r--r-- | mm/huge_memory.c | 22 | ||||
-rw-r--r-- | mm/hugetlb.c | 90 | ||||
-rw-r--r-- | mm/kmemleak.c | 9 | ||||
-rw-r--r-- | mm/madvise.c | 2 | ||||
-rw-r--r-- | mm/memcontrol.c | 2 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 3 | ||||
-rw-r--r-- | mm/migrate.c | 62 | ||||
-rw-r--r-- | mm/mmap.c | 2 | ||||
-rw-r--r-- | mm/mremap.c | 30 | ||||
-rw-r--r-- | mm/oom_kill.c | 14 | ||||
-rw-r--r-- | mm/page_alloc.c | 16 | ||||
-rw-r--r-- | mm/percpu.c | 1 | ||||
-rw-r--r-- | mm/readahead.c | 45 | ||||
-rw-r--r-- | mm/rmap.c | 42 | ||||
-rw-r--r-- | mm/shmem.c | 2 | ||||
-rw-r--r-- | mm/util.c | 11 | ||||
-rw-r--r-- | mm/vmacache.c | 38 | ||||
-rw-r--r-- | mm/vmscan.c | 18 | ||||
-rw-r--r-- | mm/vmstat.c | 4 |
25 files changed, 269 insertions, 241 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index a550635ea5c3..de64ea658716 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -637,6 +637,7 @@ config DEFERRED_STRUCT_PAGE_INIT depends on NO_BOOTMEM depends on SPARSEMEM depends on !NEED_PER_CPU_KM + depends on 64BIT help Ordinarily all struct pages are initialised during early boot in a single thread. On very large machines this can take a considerable diff --git a/mm/Makefile b/mm/Makefile index 7c48e0d3d8ab..6485d5745dd7 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -32,7 +32,7 @@ ifdef CONFIG_CROSS_MEMORY_ATTACH mmu-$(CONFIG_MMU) += process_vm_access.o endif -obj-y := filemap.o mempool.o oom_kill.o \ +obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ @@ -49,7 +49,6 @@ else obj-y += bootmem.o endif -obj-$(CONFIG_ADVISE_SYSCALLS) += fadvise.o ifdef CONFIG_MMU obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o endif diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f5981e9d6ae2..8a8bb8796c6c 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -491,6 +491,7 @@ static void cgwb_release_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(work, struct bdi_writeback, release_work); + struct blkcg *blkcg = css_to_blkcg(wb->blkcg_css); mutex_lock(&wb->bdi->cgwb_release_mutex); wb_shutdown(wb); @@ -499,6 +500,9 @@ static void cgwb_release_workfn(struct work_struct *work) css_put(wb->blkcg_css); mutex_unlock(&wb->bdi->cgwb_release_mutex); + /* triggers blkg destruction if cgwb_refcnt becomes zero */ + blkcg_cgwb_put(blkcg); + fprop_local_destroy_percpu(&wb->memcg_completions); percpu_ref_exit(&wb->refcnt); wb_exit(wb); @@ -597,6 +601,7 @@ static int cgwb_create(struct backing_dev_info *bdi, list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list); list_add(&wb->memcg_node, memcg_cgwb_list); list_add(&wb->blkcg_node, blkcg_cgwb_list); + blkcg_cgwb_get(blkcg); css_get(memcg_css); css_get(blkcg_css); } diff --git a/mm/debug.c b/mm/debug.c index 38c926520c97..bd10aad8539a 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -114,7 +114,7 @@ EXPORT_SYMBOL(dump_vma); void dump_mm(const struct mm_struct *mm) { - pr_emerg("mm %px mmap %px seqnum %d task_size %lu\n" + pr_emerg("mm %px mmap %px seqnum %llu task_size %lu\n" #ifdef CONFIG_MMU "get_unmapped_area %px\n" #endif @@ -142,7 +142,7 @@ void dump_mm(const struct mm_struct *mm) "tlb_flush_pending %d\n" "def_flags: %#lx(%pGv)\n", - mm, mm->mmap, mm->vmacache_seqnum, mm->task_size, + mm, mm->mmap, (long long) mm->vmacache_seqnum, mm->task_size, #ifdef CONFIG_MMU mm->get_unmapped_area, #endif diff --git a/mm/fadvise.c b/mm/fadvise.c index 2d8376e3c640..467bcd032037 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -27,9 +27,9 @@ * deactivate the pages and clear PG_Referenced. */ -int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) +static int generic_fadvise(struct file *file, loff_t offset, loff_t len, + int advice) { - struct fd f = fdget(fd); struct inode *inode; struct address_space *mapping; struct backing_dev_info *bdi; @@ -37,22 +37,14 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) pgoff_t start_index; pgoff_t end_index; unsigned long nrpages; - int ret = 0; - - if (!f.file) - return -EBADF; - inode = file_inode(f.file); - if (S_ISFIFO(inode->i_mode)) { - ret = -ESPIPE; - goto out; - } + inode = file_inode(file); + if (S_ISFIFO(inode->i_mode)) + return -ESPIPE; - mapping = f.file->f_mapping; - if (!mapping || len < 0) { - ret = -EINVAL; - goto out; - } + mapping = file->f_mapping; + if (!mapping || len < 0) + return -EINVAL; bdi = inode_to_bdi(mapping->host); @@ -67,9 +59,9 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) /* no bad return value, but ignore advice */ break; default: - ret = -EINVAL; + return -EINVAL; } - goto out; + return 0; } /* @@ -85,21 +77,21 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) switch (advice) { case POSIX_FADV_NORMAL: - f.file->f_ra.ra_pages = bdi->ra_pages; - spin_lock(&f.file->f_lock); - f.file->f_mode &= ~FMODE_RANDOM; - spin_unlock(&f.file->f_lock); + file->f_ra.ra_pages = bdi->ra_pages; + spin_lock(&file->f_lock); + file->f_mode &= ~FMODE_RANDOM; + spin_unlock(&file->f_lock); break; case POSIX_FADV_RANDOM: - spin_lock(&f.file->f_lock); - f.file->f_mode |= FMODE_RANDOM; - spin_unlock(&f.file->f_lock); + spin_lock(&file->f_lock); + file->f_mode |= FMODE_RANDOM; + spin_unlock(&file->f_lock); break; case POSIX_FADV_SEQUENTIAL: - f.file->f_ra.ra_pages = bdi->ra_pages * 2; - spin_lock(&f.file->f_lock); - f.file->f_mode &= ~FMODE_RANDOM; - spin_unlock(&f.file->f_lock); + file->f_ra.ra_pages = bdi->ra_pages * 2; + spin_lock(&file->f_lock); + file->f_mode &= ~FMODE_RANDOM; + spin_unlock(&file->f_lock); break; case POSIX_FADV_WILLNEED: /* First and last PARTIAL page! */ @@ -115,8 +107,7 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) * Ignore return value because fadvise() shall return * success even if filesystem can't retrieve a hint, */ - force_page_cache_readahead(mapping, f.file, start_index, - nrpages); + force_page_cache_readahead(mapping, file, start_index, nrpages); break; case POSIX_FADV_NOREUSE: break; @@ -183,9 +174,32 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) } break; default: - ret = -EINVAL; + return -EINVAL; } -out: + return 0; +} + +int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice) +{ + if (file->f_op->fadvise) + return file->f_op->fadvise(file, offset, len, advice); + + return generic_fadvise(file, offset, len, advice); +} +EXPORT_SYMBOL(vfs_fadvise); + +#ifdef CONFIG_ADVISE_SYSCALLS + +int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) +{ + struct fd f = fdget(fd); + int ret; + + if (!f.file) + return -EBADF; + + ret = vfs_fadvise(f.file, offset, len, advice); + fdput(f); return ret; } @@ -203,3 +217,4 @@ SYSCALL_DEFINE4(fadvise64, int, fd, loff_t, offset, size_t, len, int, advice) } #endif +#endif diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c index 6a473709e9b6..7405c9d89d65 100644 --- a/mm/gup_benchmark.c +++ b/mm/gup_benchmark.c @@ -19,7 +19,8 @@ static int __gup_benchmark_ioctl(unsigned int cmd, struct gup_benchmark *gup) { ktime_t start_time, end_time; - unsigned long i, nr, nr_pages, addr, next; + unsigned long i, nr_pages, addr, next; + int nr; struct page **pages; nr_pages = gup->size / PAGE_SIZE; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c3bc7e9c9a2a..deed97fba979 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -821,11 +821,11 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, * but we need to be consistent with PTEs and architectures that * can't support a 'special' bit. */ - BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && + !pfn_t_devmap(pfn)); BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == (VM_PFNMAP|VM_MIXEDMAP)); BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); - BUG_ON(!pfn_t_devmap(pfn)); if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; @@ -1780,7 +1780,7 @@ static pmd_t move_soft_dirty_pmd(pmd_t pmd) bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, - pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush) + pmd_t *old_pmd, pmd_t *new_pmd) { spinlock_t *old_ptl, *new_ptl; pmd_t pmd; @@ -1811,7 +1811,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); - if (pmd_present(pmd) && pmd_dirty(pmd)) + if (pmd_present(pmd)) force_flush = true; VM_BUG_ON(!pmd_none(*new_pmd)); @@ -1822,12 +1822,10 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, } pmd = move_soft_dirty_pmd(pmd); set_pmd_at(mm, new_addr, new_pmd, pmd); - if (new_ptl != old_ptl) - spin_unlock(new_ptl); if (force_flush) flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE); - else - *need_flush = true; + if (new_ptl != old_ptl) + spin_unlock(new_ptl); spin_unlock(old_ptl); return true; } @@ -2885,9 +2883,6 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, if (!(pvmw->pmd && !pvmw->pte)) return; - mmu_notifier_invalidate_range_start(mm, address, - address + HPAGE_PMD_SIZE); - flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); pmdval = *pvmw->pmd; pmdp_invalidate(vma, address, pvmw->pmd); @@ -2900,9 +2895,6 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, set_pmd_at(mm, address, pvmw->pmd, pmdswp); page_remove_rmap(page, true); put_page(page); - - mmu_notifier_invalidate_range_end(mm, address, - address + HPAGE_PMD_SIZE); } void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) @@ -2931,7 +2923,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) else page_add_file_rmap(new, true); set_pmd_at(mm, mmun_start, pvmw->pmd, pmde); - if (vma->vm_flags & VM_LOCKED) + if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new)) mlock_vma_page(new); update_mmu_cache_pmd(vma, address, pvmw->pmd); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3c21775f196b..5c390f5a5207 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3326,8 +3326,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); - const unsigned long mmun_start = start; /* For mmu_notifiers */ - const unsigned long mmun_end = end; /* For mmu_notifiers */ + unsigned long mmun_start = start; /* For mmu_notifiers */ + unsigned long mmun_end = end; /* For mmu_notifiers */ WARN_ON(!is_vm_hugetlb_page(vma)); BUG_ON(start & ~huge_page_mask(h)); @@ -3339,6 +3339,11 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, */ tlb_remove_check_page_size_change(tlb, sz); tlb_start_vma(tlb, vma); + + /* + * If sharing possible, alert mmu notifiers of worst case. + */ + adjust_range_if_pmd_sharing_possible(vma, &mmun_start, &mmun_end); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); address = start; for (; address < end; address += sz) { @@ -3349,6 +3354,10 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, &address, ptep)) { spin_unlock(ptl); + /* + * We just unmapped a page of PMDs by clearing a PUD. + * The caller's TLB flush range should cover this area. + */ continue; } @@ -3431,12 +3440,23 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, { struct mm_struct *mm; struct mmu_gather tlb; + unsigned long tlb_start = start; + unsigned long tlb_end = end; + + /* + * If shared PMDs were possibly used within this vma range, adjust + * start/end for worst case tlb flushing. + * Note that we can not be sure if PMDs are shared until we try to + * unmap pages. However, we want to make sure TLB flushing covers + * the largest possible range. + */ + adjust_range_if_pmd_sharing_possible(vma, &tlb_start, &tlb_end); mm = vma->vm_mm; - tlb_gather_mmu(&tlb, mm, start, end); + tlb_gather_mmu(&tlb, mm, tlb_start, tlb_end); __unmap_hugepage_range(&tlb, vma, start, end, ref_page); - tlb_finish_mmu(&tlb, start, end); + tlb_finish_mmu(&tlb, tlb_start, tlb_end); } /* @@ -4298,11 +4318,21 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, pte_t pte; struct hstate *h = hstate_vma(vma); unsigned long pages = 0; + unsigned long f_start = start; + unsigned long f_end = end; + bool shared_pmd = false; + + /* + * In the case of shared PMDs, the area to flush could be beyond + * start/end. Set f_start/f_end to cover the maximum possible + * range if PMD sharing is possible. + */ + adjust_range_if_pmd_sharing_possible(vma, &f_start, &f_end); BUG_ON(address >= end); - flush_cache_range(vma, address, end); + flush_cache_range(vma, f_start, f_end); - mmu_notifier_invalidate_range_start(mm, start, end); + mmu_notifier_invalidate_range_start(mm, f_start, f_end); i_mmap_lock_write(vma->vm_file->f_mapping); for (; address < end; address += huge_page_size(h)) { spinlock_t *ptl; @@ -4313,6 +4343,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, if (huge_pmd_unshare(mm, &address, ptep)) { pages++; spin_unlock(ptl); + shared_pmd = true; continue; } pte = huge_ptep_get(ptep); @@ -4348,9 +4379,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare * may have cleared our pud entry and done put_page on the page table: * once we release i_mmap_rwsem, another task can do the final put_page - * and that page table be reused and filled with junk. + * and that page table be reused and filled with junk. If we actually + * did unshare a page of pmds, flush the range corresponding to the pud. */ - flush_hugetlb_tlb_range(vma, start, end); + if (shared_pmd) + flush_hugetlb_tlb_range(vma, f_start, f_end); + else + flush_hugetlb_tlb_range(vma, start, end); /* * No need to call mmu_notifier_invalidate_range() we are downgrading * page table protection not changing it to point to a new page. @@ -4358,7 +4393,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * See Documentation/vm/mmu_notifier.rst */ i_mmap_unlock_write(vma->vm_file->f_mapping); - mmu_notifier_invalidate_range_end(mm, start, end); + mmu_notifier_invalidate_range_end(mm, f_start, f_end); return pages << h->order; } @@ -4545,13 +4580,41 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) /* * check on proper vm_flags and page table alignment */ - if (vma->vm_flags & VM_MAYSHARE && - vma->vm_start <= base && end <= vma->vm_end) + if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end)) return true; return false; } /* + * Determine if start,end range within vma could be mapped by shared pmd. + * If yes, adjust start and end to cover range associated with possible + * shared pmd mappings. + */ +void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, + unsigned long *start, unsigned long *end) +{ + unsigned long check_addr = *start; + + if (!(vma->vm_flags & VM_MAYSHARE)) + return; + + for (check_addr = *start; check_addr < *end; check_addr += PUD_SIZE) { + unsigned long a_start = check_addr & PUD_MASK; + unsigned long a_end = a_start + PUD_SIZE; + + /* + * If sharing is possible, adjust start/end if necessary. + */ + if (range_in_vma(vma, a_start, a_end)) { + if (a_start < *start) + *start = a_start; + if (a_end > *end) + *end = a_end; + } + } +} + +/* * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() * and returns the corresponding pte. While this is not necessary for the * !shared pmd case because we can allocate the pmd later as well, it makes the @@ -4648,6 +4711,11 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) { return 0; } + +void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, + unsigned long *start, unsigned long *end) +{ +} #define want_pmd_share() (0) #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 9a085d525bbc..17dd883198ae 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -2097,6 +2097,11 @@ static int __init kmemleak_late_init(void) kmemleak_initialized = 1; + dentry = debugfs_create_file("kmemleak", 0644, NULL, NULL, + &kmemleak_fops); + if (!dentry) + pr_warn("Failed to create the debugfs kmemleak file\n"); + if (kmemleak_error) { /* * Some error occurred and kmemleak was disabled. There is a @@ -2108,10 +2113,6 @@ static int __init kmemleak_late_init(void) return -ENOMEM; } - dentry = debugfs_create_file("kmemleak", 0644, NULL, NULL, - &kmemleak_fops); - if (!dentry) - pr_warn("Failed to create the debugfs kmemleak file\n"); mutex_lock(&scan_mutex); start_scan_thread(); mutex_unlock(&scan_mutex); diff --git a/mm/madvise.c b/mm/madvise.c index 972a9eaa898b..71d21df2a3f3 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -96,7 +96,7 @@ static long madvise_behavior(struct vm_area_struct *vma, new_flags |= VM_DONTDUMP; break; case MADV_DODUMP: - if (new_flags & VM_SPECIAL) { + if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) { error = -EINVAL; goto out; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4ead5a4817de..e79cb59552d9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1701,8 +1701,6 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int if (mem_cgroup_out_of_memory(memcg, mask, order)) return OOM_SUCCESS; - WARN(1,"Memory cgroup charge failed because of no reclaimable memory! " - "This looks like a misconfiguration or a kernel bug."); return OOM_FAILED; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9eea6e809a4e..38d94b703e9d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1333,7 +1333,8 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) if (__PageMovable(page)) return pfn; if (PageHuge(page)) { - if (page_huge_active(page)) + if (hugepage_migration_supported(page_hstate(page)) && + page_huge_active(page)) return pfn; else pfn = round_up(pfn + 1, diff --git a/mm/migrate.c b/mm/migrate.c index d6a2e89b086a..84381b55b2bd 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -275,6 +275,9 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) mlock_vma_page(new); + if (PageTransHuge(page) && PageMlocked(page)) + clear_page_mlock(page); + /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, pvmw.address, pvmw.pte); } @@ -1411,7 +1414,7 @@ retry: * we encounter them after the rest of the list * is processed. */ - if (PageTransHuge(page)) { + if (PageTransHuge(page) && !PageHuge(page)) { lock_page(page); rc = split_huge_page_to_list(page, from); unlock_page(page); @@ -1855,46 +1858,6 @@ static struct page *alloc_misplaced_dst_page(struct page *page, return newpage; } -/* - * page migration rate limiting control. - * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs - * window of time. Default here says do not migrate more than 1280M per second. - */ -static unsigned int migrate_interval_millisecs __read_mostly = 100; -static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); - -/* Returns true if the node is migrate rate-limited after the update */ -static bool numamigrate_update_ratelimit(pg_data_t *pgdat, - unsigned long nr_pages) -{ - /* - * Rate-limit the amount of data that is being migrated to a node. - * Optimal placement is no good if the memory bus is saturated and - * all the time is being spent migrating! - */ - if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { - spin_lock(&pgdat->numabalancing_migrate_lock); - pgdat->numabalancing_migrate_nr_pages = 0; - pgdat->numabalancing_migrate_next_window = jiffies + - msecs_to_jiffies(migrate_interval_millisecs); - spin_unlock(&pgdat->numabalancing_migrate_lock); - } - if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) { - trace_mm_numa_migrate_ratelimit(current, pgdat->node_id, - nr_pages); - return true; - } - - /* - * This is an unlocked non-atomic update so errors are possible. - * The consequences are failing to migrate when we potentiall should - * have which is not severe enough to warrant locking. If it is ever - * a problem, it can be converted to a per-cpu counter. - */ - pgdat->numabalancing_migrate_nr_pages += nr_pages; - return false; -} - static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) { int page_lru; @@ -1967,14 +1930,6 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, if (page_is_file_cache(page) && PageDirty(page)) goto out; - /* - * Rate-limit the amount of data that is being migrated to a node. - * Optimal placement is no good if the memory bus is saturated and - * all the time is being spent migrating! - */ - if (numamigrate_update_ratelimit(pgdat, 1)) - goto out; - isolated = numamigrate_isolate_page(pgdat, page); if (!isolated) goto out; @@ -2021,14 +1976,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, unsigned long mmun_start = address & HPAGE_PMD_MASK; unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; - /* - * Rate-limit the amount of data that is being migrated to a node. - * Optimal placement is no good if the memory bus is saturated and - * all the time is being spent migrating! - */ - if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR)) - goto out_dropref; - new_page = alloc_pages_node(node, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), HPAGE_PMD_ORDER); @@ -2125,7 +2072,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, out_fail: count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); -out_dropref: ptl = pmd_lock(mm, pmd); if (pmd_same(*pmd, entry)) { entry = pmd_modify(entry, vma->vm_page_prot); diff --git a/mm/mmap.c b/mm/mmap.c index 5f2b2b184c60..f7cd9cb966c0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1410,7 +1410,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (flags & MAP_FIXED_NOREPLACE) { struct vm_area_struct *vma = find_vma(mm, addr); - if (vma && vma->vm_start <= addr) + if (vma && vma->vm_start < addr + len) return -EEXIST; } diff --git a/mm/mremap.c b/mm/mremap.c index 5c2e18505f75..a9617e72e6b7 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -115,7 +115,7 @@ static pte_t move_soft_dirty_pte(pte_t pte) static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, unsigned long old_addr, unsigned long old_end, struct vm_area_struct *new_vma, pmd_t *new_pmd, - unsigned long new_addr, bool need_rmap_locks, bool *need_flush) + unsigned long new_addr, bool need_rmap_locks) { struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; @@ -163,15 +163,17 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, pte = ptep_get_and_clear(mm, old_addr, old_pte); /* - * If we are remapping a dirty PTE, make sure + * If we are remapping a valid PTE, make sure * to flush TLB before we drop the PTL for the - * old PTE or we may race with page_mkclean(). + * PTE. * - * This check has to be done after we removed the - * old PTE from page tables or another thread may - * dirty it after the check and before the removal. + * NOTE! Both old and new PTL matter: the old one + * for racing with page_mkclean(), the new one to + * make sure the physical page stays valid until + * the TLB entry for the old mapping has been + * flushed. */ - if (pte_present(pte) && pte_dirty(pte)) + if (pte_present(pte)) force_flush = true; pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); pte = move_soft_dirty_pte(pte); @@ -179,13 +181,11 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, } arch_leave_lazy_mmu_mode(); + if (force_flush) + flush_tlb_range(vma, old_end - len, old_end); if (new_ptl != old_ptl) spin_unlock(new_ptl); pte_unmap(new_pte - 1); - if (force_flush) - flush_tlb_range(vma, old_end - len, old_end); - else - *need_flush = true; pte_unmap_unlock(old_pte - 1, old_ptl); if (need_rmap_locks) drop_rmap_locks(vma); @@ -198,7 +198,6 @@ unsigned long move_page_tables(struct vm_area_struct *vma, { unsigned long extent, next, old_end; pmd_t *old_pmd, *new_pmd; - bool need_flush = false; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ @@ -229,8 +228,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, if (need_rmap_locks) take_rmap_locks(vma); moved = move_huge_pmd(vma, old_addr, new_addr, - old_end, old_pmd, new_pmd, - &need_flush); + old_end, old_pmd, new_pmd); if (need_rmap_locks) drop_rmap_locks(vma); if (moved) @@ -246,10 +244,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma, if (extent > next - new_addr) extent = next - new_addr; move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma, - new_pmd, new_addr, need_rmap_locks, &need_flush); + new_pmd, new_addr, need_rmap_locks); } - if (need_flush) - flush_tlb_range(vma, old_end-len, old_addr); mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b5b25e4dcbbb..f10aa5360616 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -522,6 +522,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm) tlb_gather_mmu(&tlb, mm, start, end); if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) { + tlb_finish_mmu(&tlb, start, end); ret = false; continue; } @@ -1103,10 +1104,17 @@ bool out_of_memory(struct oom_control *oc) } select_bad_process(oc); - /* Found nothing?!?! Either we hang forever, or we panic. */ - if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) { + /* Found nothing?!?! */ + if (!oc->chosen) { dump_header(oc, NULL); - panic("Out of memory and no killable processes...\n"); + pr_warn("Out of memory and no killable processes...\n"); + /* + * If we got here due to an actual allocation at the + * system level, we cannot survive this and will enter + * an endless loop in the allocator. Bail out now. + */ + if (!is_sysrq_oom(oc) && !is_memcg_oom(oc)) + panic("System is deadlocked on memory\n"); } if (oc->chosen && oc->chosen != (void *)-1UL) oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" : diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 05e983f42316..e2ef1c17942f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6193,17 +6193,6 @@ static unsigned long __init calc_memmap_size(unsigned long spanned_pages, return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; } -#ifdef CONFIG_NUMA_BALANCING -static void pgdat_init_numabalancing(struct pglist_data *pgdat) -{ - spin_lock_init(&pgdat->numabalancing_migrate_lock); - pgdat->numabalancing_migrate_nr_pages = 0; - pgdat->numabalancing_migrate_next_window = jiffies; -} -#else -static void pgdat_init_numabalancing(struct pglist_data *pgdat) {} -#endif - #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void pgdat_init_split_queue(struct pglist_data *pgdat) { @@ -6228,7 +6217,6 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) { pgdat_resize_init(pgdat); - pgdat_init_numabalancing(pgdat); pgdat_init_split_queue(pgdat); pgdat_init_kcompactd(pgdat); @@ -7708,6 +7696,10 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, * handle each tail page individually in migration. */ if (PageHuge(page)) { + + if (!hugepage_migration_supported(page_hstate(page))) + goto unmovable; + iter = round_up(iter + 1, 1<<compound_order(page)) - 1; continue; } diff --git a/mm/percpu.c b/mm/percpu.c index a749d4d96e3e..4b90682623e9 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1212,6 +1212,7 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; + pcpu_mem_free(chunk->md_blocks); pcpu_mem_free(chunk->bound_map); pcpu_mem_free(chunk->alloc_map); pcpu_mem_free(chunk); diff --git a/mm/readahead.c b/mm/readahead.c index a59ea70527b9..4e630143a0ba 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -20,6 +20,7 @@ #include <linux/file.h> #include <linux/mm_inline.h> #include <linux/blk-cgroup.h> +#include <linux/fadvise.h> #include "internal.h" @@ -575,24 +576,6 @@ page_cache_async_readahead(struct address_space *mapping, } EXPORT_SYMBOL_GPL(page_cache_async_readahead); -static ssize_t -do_readahead(struct address_space *mapping, struct file *filp, - pgoff_t index, unsigned long nr) -{ - if (!mapping || !mapping->a_ops) - return -EINVAL; - - /* - * Readahead doesn't make sense for DAX inodes, but we don't want it - * to report a failure either. Instead, we just return success and - * don't do any work. - */ - if (dax_mapping(mapping)) - return 0; - - return force_page_cache_readahead(mapping, filp, index, nr); -} - ssize_t ksys_readahead(int fd, loff_t offset, size_t count) { ssize_t ret; @@ -600,16 +583,22 @@ ssize_t ksys_readahead(int fd, loff_t offset, size_t count) ret = -EBADF; f = fdget(fd); - if (f.file) { - if (f.file->f_mode & FMODE_READ) { - struct address_space *mapping = f.file->f_mapping; - pgoff_t start = offset >> PAGE_SHIFT; - pgoff_t end = (offset + count - 1) >> PAGE_SHIFT; - unsigned long len = end - start + 1; - ret = do_readahead(mapping, f.file, start, len); - } - fdput(f); - } + if (!f.file || !(f.file->f_mode & FMODE_READ)) + goto out; + + /* + * The readahead() syscall is intended to run only on files + * that can execute readahead. If readahead is not possible + * on this file, then we must return -EINVAL. + */ + ret = -EINVAL; + if (!f.file->f_mapping || !f.file->f_mapping->a_ops || + !S_ISREG(file_inode(f.file)->i_mode)) + goto out; + + ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED); +out: + fdput(f); return ret; } diff --git a/mm/rmap.c b/mm/rmap.c index eb477809a5c0..1e79fac3186b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1362,11 +1362,21 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, } /* - * We have to assume the worse case ie pmd for invalidation. Note that - * the page can not be free in this function as call of try_to_unmap() - * must hold a reference on the page. + * For THP, we have to assume the worse case ie pmd for invalidation. + * For hugetlb, it could be much worse if we need to do pud + * invalidation in the case of pmd sharing. + * + * Note that the page can not be free in this function as call of + * try_to_unmap() must hold a reference on the page. */ end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); + if (PageHuge(page)) { + /* + * If sharing is possible, start and end will be adjusted + * accordingly. + */ + adjust_range_if_pmd_sharing_possible(vma, &start, &end); + } mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); while (page_vma_mapped_walk(&pvmw)) { @@ -1409,6 +1419,32 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); address = pvmw.address; + if (PageHuge(page)) { + if (huge_pmd_unshare(mm, &address, pvmw.pte)) { + /* + * huge_pmd_unshare unmapped an entire PMD + * page. There is no way of knowing exactly + * which PMDs may be cached for this mm, so + * we must flush them all. start/end were + * already adjusted above to cover this range. + */ + flush_cache_range(vma, start, end); + flush_tlb_range(vma, start, end); + mmu_notifier_invalidate_range(mm, start, end); + + /* + * The ref count of the PMD page was dropped + * which is part of the way map counting + * is done for shared PMDs. Return 'true' + * here. When there is no other sharing, + * huge_pmd_unshare returns false and we will + * unmap the actual page and drop map count + * to zero. + */ + page_vma_mapped_walk_done(&pvmw); + break; + } + } if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) && diff --git a/mm/shmem.c b/mm/shmem.c index 0376c124b043..446942677cd4 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2227,6 +2227,8 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode mpol_shared_policy_init(&info->policy, NULL); break; } + + lockdep_annotate_inode_mutex_key(inode); } else shmem_free_inode(sb); return inode; diff --git a/mm/util.c b/mm/util.c index d2890a407332..9e3ebd2ef65f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -435,11 +435,14 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) EXPORT_SYMBOL(kvmalloc_node); /** - * kvfree - free memory allocated with kvmalloc - * @addr: pointer returned by kvmalloc + * kvfree() - Free memory. + * @addr: Pointer to allocated memory. * - * If the memory is allocated from vmalloc area it is freed with vfree(). - * Otherwise kfree() is used. + * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc(). + * It is slightly more efficient to use kfree() or vfree() if you are certain + * that you know which one to use. + * + * Context: Any context except NMI. */ void kvfree(const void *addr) { diff --git a/mm/vmacache.c b/mm/vmacache.c index ea517bef7dc5..cdc32a3b02fa 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c @@ -20,44 +20,6 @@ #define VMACACHE_HASH(addr) ((addr >> VMACACHE_SHIFT) & VMACACHE_MASK) /* - * Flush vma caches for threads that share a given mm. - * - * The operation is safe because the caller holds the mmap_sem - * exclusively and other threads accessing the vma cache will - * have mmap_sem held at least for read, so no extra locking - * is required to maintain the vma cache. - */ -void vmacache_flush_all(struct mm_struct *mm) -{ - struct task_struct *g, *p; - - count_vm_vmacache_event(VMACACHE_FULL_FLUSHES); - - /* - * Single threaded tasks need not iterate the entire - * list of process. We can avoid the flushing as well - * since the mm's seqnum was increased and don't have - * to worry about other threads' seqnum. Current's - * flush will occur upon the next lookup. - */ - if (atomic_read(&mm->mm_users) == 1) - return; - - rcu_read_lock(); - for_each_process_thread(g, p) { - /* - * Only flush the vmacache pointers as the - * mm seqnum is already set and curr's will - * be set upon invalidation when the next - * lookup is done. - */ - if (mm == p->mm) - vmacache_flush(p); - } - rcu_read_unlock(); -} - -/* * This task may be accessing a foreign mm via (for example) * get_user_pages()->find_vma(). The vmacache is task-local and this * task's vmacache pertains to a different mm (ie, its own). There is diff --git a/mm/vmscan.c b/mm/vmscan.c index 7e7d25504651..c5ef7240cbcb 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -476,6 +476,17 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, delta = freeable >> priority; delta *= 4; do_div(delta, shrinker->seeks); + + /* + * Make sure we apply some minimal pressure on default priority + * even on small cgroups. Stale objects are not only consuming memory + * by themselves, but can also hold a reference to a dying cgroup, + * preventing it from being reclaimed. A dying cgroup with all + * corresponding structures like per-cpu stats and kmem caches + * can be really big, so it may lead to a significant waste of memory. + */ + delta = max_t(unsigned long long, delta, min(freeable, batch_size)); + total_scan += delta; if (total_scan < 0) { pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", @@ -569,8 +580,8 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { struct memcg_shrinker_map *map; - unsigned long freed = 0; - int ret, i; + unsigned long ret, freed = 0; + int i; if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)) return 0; @@ -666,9 +677,8 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { + unsigned long ret, freed = 0; struct shrinker *shrinker; - unsigned long freed = 0; - int ret; if (!mem_cgroup_is_root(memcg)) return shrink_slab_memcg(gfp_mask, nid, memcg, priority); diff --git a/mm/vmstat.c b/mm/vmstat.c index 8ba0870ecddd..7878da76abf2 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1275,6 +1275,9 @@ const char * const vmstat_text[] = { #ifdef CONFIG_SMP "nr_tlb_remote_flush", "nr_tlb_remote_flush_received", +#else + "", /* nr_tlb_remote_flush */ + "", /* nr_tlb_remote_flush_received */ #endif /* CONFIG_SMP */ "nr_tlb_local_flush_all", "nr_tlb_local_flush_one", @@ -1283,7 +1286,6 @@ const char * const vmstat_text[] = { #ifdef CONFIG_DEBUG_VM_VMACACHE "vmacache_find_calls", "vmacache_find_hits", - "vmacache_full_flushes", #endif #ifdef CONFIG_SWAP "swap_ra", |