summaryrefslogtreecommitdiff
path: root/mm/madvise.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-10-10 17:53:04 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-10-10 17:53:04 -0700
commit27bc50fc90647bbf7b734c3fc306a5e61350da53 (patch)
tree75fc525fbfec8c07a97a7875a89592317bcad4ca /mm/madvise.c
parent70442fc54e6889a2a77f0e9554e8188a1557f00e (diff)
parentbbff39cc6cbcb86ccfacb2dcafc79912a9f9df69 (diff)
downloadlwn-27bc50fc90647bbf7b734c3fc306a5e61350da53.tar.gz
lwn-27bc50fc90647bbf7b734c3fc306a5e61350da53.zip
Merge tag 'mm-stable-2022-10-08' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton: - Yu Zhao's Multi-Gen LRU patches are here. They've been under test in linux-next for a couple of months without, to my knowledge, any negative reports (or any positive ones, come to that). - Also the Maple Tree from Liam Howlett. An overlapping range-based tree for vmas. It it apparently slightly more efficient in its own right, but is mainly targeted at enabling work to reduce mmap_lock contention. Liam has identified a number of other tree users in the kernel which could be beneficially onverted to mapletrees. Yu Zhao has identified a hard-to-hit but "easy to fix" lockdep splat at [1]. This has yet to be addressed due to Liam's unfortunately timed vacation. He is now back and we'll get this fixed up. - Dmitry Vyukov introduces KMSAN: the Kernel Memory Sanitizer. It uses clang-generated instrumentation to detect used-unintialized bugs down to the single bit level. KMSAN keeps finding bugs. New ones, as well as the legacy ones. - Yang Shi adds a userspace mechanism (madvise) to induce a collapse of memory into THPs. - Zach O'Keefe has expanded Yang Shi's madvise(MADV_COLLAPSE) to support file/shmem-backed pages. - userfaultfd updates from Axel Rasmussen - zsmalloc cleanups from Alexey Romanov - cleanups from Miaohe Lin: vmscan, hugetlb_cgroup, hugetlb and memory-failure - Huang Ying adds enhancements to NUMA balancing memory tiering mode's page promotion, with a new way of detecting hot pages. - memcg updates from Shakeel Butt: charging optimizations and reduced memory consumption. - memcg cleanups from Kairui Song. - memcg fixes and cleanups from Johannes Weiner. - Vishal Moola provides more folio conversions - Zhang Yi removed ll_rw_block() :( - migration enhancements from Peter Xu - migration error-path bugfixes from Huang Ying - Aneesh Kumar added ability for a device driver to alter the memory tiering promotion paths. For optimizations by PMEM drivers, DRM drivers, etc. - vma merging improvements from Jakub Matěn. - NUMA hinting cleanups from David Hildenbrand. - xu xin added aditional userspace visibility into KSM merging activity. - THP & KSM code consolidation from Qi Zheng. - more folio work from Matthew Wilcox. - KASAN updates from Andrey Konovalov. - DAMON cleanups from Kaixu Xia. - DAMON work from SeongJae Park: fixes, cleanups. - hugetlb sysfs cleanups from Muchun Song. - Mike Kravetz fixes locking issues in hugetlbfs and in hugetlb core. Link: https://lkml.kernel.org/r/CAOUHufZabH85CeUN-MEMgL8gJGzJEWUrkiM58JkTbBhh-jew0Q@mail.gmail.com [1] * tag 'mm-stable-2022-10-08' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (555 commits) hugetlb: allocate vma lock for all sharable vmas hugetlb: take hugetlb vma_lock when clearing vma_lock->vma pointer hugetlb: fix vma lock handling during split vma and range unmapping mglru: mm/vmscan.c: fix imprecise comments mm/mglru: don't sync disk for each aging cycle mm: memcontrol: drop dead CONFIG_MEMCG_SWAP config symbol mm: memcontrol: use do_memsw_account() in a few more places mm: memcontrol: deprecate swapaccounting=0 mode mm: memcontrol: don't allocate cgroup swap arrays when memcg is disabled mm/secretmem: remove reduntant return value mm/hugetlb: add available_huge_pages() func mm: remove unused inline functions from include/linux/mm_inline.h selftests/vm: add selftest for MADV_COLLAPSE of uffd-minor memory selftests/vm: add file/shmem MADV_COLLAPSE selftest for cleared pmd selftests/vm: add thp collapse shmem testing selftests/vm: add thp collapse file and tmpfs testing selftests/vm: modularize thp collapse memory operations selftests/vm: dedup THP helpers mm/khugepaged: add tracepoint to hpage_collapse_scan_file() mm/madvise: add file and shmem support to MADV_COLLAPSE ...
Diffstat (limited to 'mm/madvise.c')
-rw-r--r--mm/madvise.c60
1 files changed, 33 insertions, 27 deletions
diff --git a/mm/madvise.c b/mm/madvise.c
index 9ff51650f4f0..2baa93ca2310 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -59,6 +59,7 @@ static int madvise_need_mmap_write(int behavior)
case MADV_FREE:
case MADV_POPULATE_READ:
case MADV_POPULATE_WRITE:
+ case MADV_COLLAPSE:
return 0;
default:
/* be safe, default to 1. list exceptions explicitly */
@@ -600,6 +601,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
spinlock_t *ptl;
pte_t *orig_pte, *pte, ptent;
+ struct folio *folio;
struct page *page;
int nr_swap = 0;
unsigned long next;
@@ -644,56 +646,56 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
page = vm_normal_page(vma, addr, ptent);
if (!page || is_zone_device_page(page))
continue;
+ folio = page_folio(page);
/*
- * If pmd isn't transhuge but the page is THP and
+ * If pmd isn't transhuge but the folio is large and
* is owned by only this process, split it and
* deactivate all pages.
*/
- if (PageTransCompound(page)) {
- if (page_mapcount(page) != 1)
+ if (folio_test_large(folio)) {
+ if (folio_mapcount(folio) != 1)
goto out;
- get_page(page);
- if (!trylock_page(page)) {
- put_page(page);
+ folio_get(folio);
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
goto out;
}
pte_unmap_unlock(orig_pte, ptl);
- if (split_huge_page(page)) {
- unlock_page(page);
- put_page(page);
+ if (split_folio(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
goto out;
}
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
pte--;
addr -= PAGE_SIZE;
continue;
}
- VM_BUG_ON_PAGE(PageTransCompound(page), page);
-
- if (PageSwapCache(page) || PageDirty(page)) {
- if (!trylock_page(page))
+ if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
+ if (!folio_trylock(folio))
continue;
/*
- * If page is shared with others, we couldn't clear
- * PG_dirty of the page.
+ * If folio is shared with others, we mustn't clear
+ * the folio's dirty flag.
*/
- if (page_mapcount(page) != 1) {
- unlock_page(page);
+ if (folio_mapcount(folio) != 1) {
+ folio_unlock(folio);
continue;
}
- if (PageSwapCache(page) && !try_to_free_swap(page)) {
- unlock_page(page);
+ if (folio_test_swapcache(folio) &&
+ !folio_free_swap(folio)) {
+ folio_unlock(folio);
continue;
}
- ClearPageDirty(page);
- unlock_page(page);
+ folio_clear_dirty(folio);
+ folio_unlock(folio);
}
if (pte_young(ptent) || pte_dirty(ptent)) {
@@ -711,7 +713,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
set_pte_at(mm, addr, pte, ptent);
tlb_remove_tlb_entry(tlb, pte, addr);
}
- mark_page_lazyfree(page);
+ mark_page_lazyfree(&folio->page);
}
out:
if (nr_swap) {
@@ -1060,6 +1062,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
if (error)
goto out;
break;
+ case MADV_COLLAPSE:
+ return madvise_collapse(vma, prev, start, end);
}
anon_name = anon_vma_name(vma);
@@ -1153,6 +1157,7 @@ madvise_behavior_valid(int behavior)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
case MADV_HUGEPAGE:
case MADV_NOHUGEPAGE:
+ case MADV_COLLAPSE:
#endif
case MADV_DONTDUMP:
case MADV_DODUMP:
@@ -1169,13 +1174,13 @@ madvise_behavior_valid(int behavior)
}
}
-static bool
-process_madvise_behavior_valid(int behavior)
+static bool process_madvise_behavior_valid(int behavior)
{
switch (behavior) {
case MADV_COLD:
case MADV_PAGEOUT:
case MADV_WILLNEED:
+ case MADV_COLLAPSE:
return true;
default:
return false;
@@ -1241,7 +1246,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
if (start >= end)
break;
if (prev)
- vma = prev->vm_next;
+ vma = find_vma(mm, prev->vm_end);
else /* madvise_remove dropped mmap_lock */
vma = find_vma(mm, start);
}
@@ -1342,6 +1347,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
* MADV_NOHUGEPAGE - mark the given range as not worth being backed by
* transparent huge pages so the existing pages will not be
* coalesced into THP and new pages will not be allocated as THP.
+ * MADV_COLLAPSE - synchronously coalesce pages into new THP.
* MADV_DONTDUMP - the application wants to prevent pages in the given range
* from being included in its core dump.
* MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.