From 0ba5e806e14e97a4dd34e21ae2994693bcdd0406 Mon Sep 17 00:00:00 2001 From: Illia Ostapyshyn Date: Fri, 17 May 2024 11:13:48 +0200 Subject: mm/vmscan: update stale references to shrink_page_list Commit 49fd9b6df54e ("mm/vmscan: fix a lot of comments") renamed shrink_page_list() to shrink_folio_list(). Fix up the remaining references to the old name in comments and documentation. Link: https://lkml.kernel.org/r/20240517091348.1185566-1-illia@yshyn.com Signed-off-by: Illia Ostapyshyn Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- Documentation/mm/unevictable-lru.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'Documentation') diff --git a/Documentation/mm/unevictable-lru.rst b/Documentation/mm/unevictable-lru.rst index b6a07a26b10d..2feb2ed51ae2 100644 --- a/Documentation/mm/unevictable-lru.rst +++ b/Documentation/mm/unevictable-lru.rst @@ -191,13 +191,13 @@ have become evictable again (via munlock() for example) and have been "rescued" from the unevictable list. However, there may be situations where we decide, for the sake of expediency, to leave an unevictable folio on one of the regular active/inactive LRU lists for vmscan to deal with. vmscan checks for such -folios in all of the shrink_{active|inactive|page}_list() functions and will +folios in all of the shrink_{active|inactive|folio}_list() functions and will "cull" such folios that it encounters: that is, it diverts those folios to the unevictable list for the memory cgroup and node being scanned. There may be situations where a folio is mapped into a VM_LOCKED VMA, but the folio does not have the mlocked flag set. Such folios will make -it all the way to shrink_active_list() or shrink_page_list() where they +it all the way to shrink_active_list() or shrink_folio_list() where they will be detected when vmscan walks the reverse map in folio_referenced() or try_to_unmap(). The folio is culled to the unevictable list when it is released by the shrinker. @@ -269,7 +269,7 @@ the LRU. Such pages can be "noticed" by memory management in several places: (4) in the fault path and when a VM_LOCKED stack segment is expanded; or - (5) as mentioned above, in vmscan:shrink_page_list() when attempting to + (5) as mentioned above, in vmscan:shrink_folio_list() when attempting to reclaim a page in a VM_LOCKED VMA by folio_referenced() or try_to_unmap(). mlocked pages become unlocked and rescued from the unevictable list when: @@ -548,12 +548,12 @@ Some examples of these unevictable pages on the LRU lists are: (3) pages still mapped into VM_LOCKED VMAs, which should be marked mlocked, but events left mlock_count too low, so they were munlocked too early. -vmscan's shrink_inactive_list() and shrink_page_list() also divert obviously +vmscan's shrink_inactive_list() and shrink_folio_list() also divert obviously unevictable pages found on the inactive lists to the appropriate memory cgroup and node unevictable list. rmap's folio_referenced_one(), called via vmscan's shrink_active_list() or -shrink_page_list(), and rmap's try_to_unmap_one() called via shrink_page_list(), +shrink_folio_list(), and rmap's try_to_unmap_one() called via shrink_folio_list(), check for (3) pages still mapped into VM_LOCKED VMAs, and call mlock_vma_folio() to correct them. Such pages are culled to the unevictable list when released by the shrinker. 
-- cgit v1.2.3 From d39b6af2189095f2c411a0a261d44e0ac4be3ad1 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 27 May 2024 11:48:55 -0400 Subject: mm: drop leftover comment references to pxx_huge() pxx_huge() has been removed in recent commit 9636f055dae1 ("mm/treewide: remove pXd_huge()"), however there are still three comments referencing the API that got overlooked. Remove them. Link: https://lkml.kernel.org/r/20240527154855.528816-1-peterx@redhat.com Signed-off-by: Peter Xu Reported-by: Christophe Leroy Reviewed-by: David Hildenbrand Cc: Jason Gunthorpe Signed-off-by: Andrew Morton --- Documentation/mm/arch_pgtable_helpers.rst | 4 ---- arch/x86/mm/pat/set_memory.c | 4 ++-- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'Documentation') diff --git a/Documentation/mm/arch_pgtable_helpers.rst b/Documentation/mm/arch_pgtable_helpers.rst index ad50ca6f495e..af245161d8e7 100644 --- a/Documentation/mm/arch_pgtable_helpers.rst +++ b/Documentation/mm/arch_pgtable_helpers.rst @@ -90,8 +90,6 @@ PMD Page Table Helpers +---------------------------+--------------------------------------------------+ | pmd_leaf | Tests a leaf mapped PMD | +---------------------------+--------------------------------------------------+ -| pmd_huge | Tests a HugeTLB mapped PMD | -+---------------------------+--------------------------------------------------+ | pmd_trans_huge | Tests a Transparent Huge Page (THP) at PMD | +---------------------------+--------------------------------------------------+ | pmd_present | Tests whether pmd_page() points to valid memory | @@ -169,8 +167,6 @@ PUD Page Table Helpers +---------------------------+--------------------------------------------------+ | pud_leaf | Tests a leaf mapped PUD | +---------------------------+--------------------------------------------------+ -| pud_huge | Tests a HugeTLB mapped PUD | -+---------------------------+--------------------------------------------------+ | pud_trans_huge | Tests a Transparent Huge Page (THP) at PUD | +---------------------------+--------------------------------------------------+ | pud_present | Tests a valid mapped PUD | diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 19fdfbb171ed..8b2164509b4d 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -1120,8 +1120,8 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, lpinc = PMD_SIZE; /* * Clear the PSE flags if the PRESENT flag is not set - * otherwise pmd_present/pmd_huge will return true - * even on a non present pmd. + * otherwise pmd_present() will return true even on a non + * present pmd. */ if (!(pgprot_val(ref_prot) & _PAGE_PRESENT)) pgprot_val(ref_prot) &= ~_PAGE_PSE; -- cgit v1.2.3 From 4b98995530b77a97912230d8e1564ba7738db19c Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 11 Jun 2024 18:11:07 +0800 Subject: mm: shmem: add multi-size THP sysfs interface for anonymous shmem To support the use of mTHP with anonymous shmem, add a new sysfs interface 'shmem_enabled' in the '/sys/kernel/mm/transparent_hugepage/hugepages-kB/' directory for each mTHP to control whether shmem is enabled for that mTHP, with a value similar to the top level 'shmem_enabled', which can be set to: "always", "inherit (to inherit the top level setting)", "within_size", "advise", "never". An 'inherit' option is added to ensure compatibility with these global settings, and the options 'force' and 'deny' are dropped, which are rather testing artifacts from the old ages. 
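As an illustration, a minimal userspace sketch of driving the new per-size knob could look like the following; the hugepages-64kB size and the exact path are assumptions for a kernel with this series applied, not part of the patch itself: ::

  #include <fcntl.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  int main(void)
  {
          /* Example path; the hugepages-64kB size is an assumption. */
          const char *knob =
                  "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/shmem_enabled";
          int fd = open(knob, O_WRONLY);

          if (fd < 0) {
                  perror("open");
                  return 1;
          }
          /* "always", "inherit", "within_size", "advise" or "never" may be written. */
          if (write(fd, "inherit", strlen("inherit")) < 0)
                  perror("write");
          close(fd);
          return 0;
  }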
By default, PMD-sized hugepages have enabled="inherit" and all other hugepage sizes have enabled="never" for '/sys/kernel/mm/transparent_hugepage/hugepages-xxkB/shmem_enabled'. In addition, if top level value is 'force', then only PMD-sized hugepages have enabled="inherit", otherwise configuration will be failed and vice versa. That means now we will avoid using non-PMD sized THP to override the global huge allocation. [baolin.wang@linux.alibaba.com: fix transhuge.rst indentation] Link: https://lkml.kernel.org/r/b189d815-998b-4dfd-ba89-218ff51313f8@linux.alibaba.com [akpm@linux-foundation.org: reflow transhuge.rst addition to 80 cols] [baolin.wang@linux.alibaba.com: move huge_shmem_orders_lock under CONFIG_SYSFS] Link: https://lkml.kernel.org/r/eb34da66-7f12-44f3-a39e-2bcc90c33354@linux.alibaba.com [akpm@linux-foundation.org: huge_memory.c needs mm_types.h] Link: https://lkml.kernel.org/r/ffddfa8b3cb4266ff963099ab78cfd7184c57ac7.1718090413.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Barry Song Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kefeng Wang Cc: Lance Yang Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 25 ++++++++ include/linux/huge_mm.h | 10 ++++ mm/huge_memory.c | 12 ++-- mm/shmem.c | 96 ++++++++++++++++++++++++++++++ 4 files changed, 135 insertions(+), 8 deletions(-) (limited to 'Documentation') diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index d414d3f5592a..e7232b46fe14 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -332,6 +332,31 @@ deny force Force the huge option on for all - very useful for testing; +Shmem can also use "multi-size THP" (mTHP) by adding a new sysfs knob to +control mTHP allocation: +'/sys/kernel/mm/transparent_hugepage/hugepages-kB/shmem_enabled', +and its value for each mTHP is essentially consistent with the global +setting. An 'inherit' option is added to ensure compatibility with these +global settings. Conversely, the options 'force' and 'deny' are dropped, +which are rather testing artifacts from the old ages. + +always + Attempt to allocate huge pages every time we need a new page; + +inherit + Inherit the top-level "shmem_enabled" value. By default, PMD-sized hugepages + have enabled="inherit" and all other hugepage sizes have enabled="never"; + +never + Do not allocate huge pages; + +within_size + Only allocate huge page if it will be fully within i_size. 
+ Also respect fadvise()/madvise() hints; + +advise + Only allocate huge pages if requested with fadvise()/madvise(); + Need of application restart =========================== diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 088d66a54643..6a1edd752968 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -6,6 +6,7 @@ #include #include /* only for vma_is_dax() */ +#include vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -63,6 +64,7 @@ ssize_t single_hugepage_flag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf, enum transparent_hugepage_flag flag); extern struct kobj_attribute shmem_enabled_attr; +extern struct kobj_attribute thpsize_shmem_enabled_attr; /* * Mask of all large folio orders supported for anonymous THP; all orders up to @@ -265,6 +267,14 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders); } +struct thpsize { + struct kobject kobj; + struct list_head node; + int order; +}; + +#define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj) + enum mthp_stat_item { MTHP_STAT_ANON_FAULT_ALLOC, MTHP_STAT_ANON_FAULT_FALLBACK, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cff47b3fc72e..af6f239b0f97 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -449,14 +450,6 @@ static void thpsize_release(struct kobject *kobj); static DEFINE_SPINLOCK(huge_anon_orders_lock); static LIST_HEAD(thpsize_list); -struct thpsize { - struct kobject kobj; - struct list_head node; - int order; -}; - -#define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj) - static ssize_t thpsize_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -517,6 +510,9 @@ static struct kobj_attribute thpsize_enabled_attr = static struct attribute *thpsize_attrs[] = { &thpsize_enabled_attr.attr, +#ifdef CONFIG_SHMEM + &thpsize_shmem_enabled_attr.attr, +#endif NULL, }; diff --git a/mm/shmem.c b/mm/shmem.c index bf22909f1261..d80608f9d6af 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -131,6 +131,13 @@ struct shmem_options { #define SHMEM_SEEN_QUOTA 32 }; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static unsigned long huge_shmem_orders_always __read_mostly; +static unsigned long huge_shmem_orders_madvise __read_mostly; +static unsigned long huge_shmem_orders_inherit __read_mostly; +static unsigned long huge_shmem_orders_within_size __read_mostly; +#endif + #ifdef CONFIG_TMPFS static unsigned long shmem_default_max_blocks(void) { @@ -4672,6 +4679,12 @@ void __init shmem_init(void) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; else shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */ + + /* + * Default to setting PMD-sized THP to inherit the global setting and + * disable all other multi-size THPs. 
+ */ + huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER); #endif return; @@ -4731,6 +4744,11 @@ static ssize_t shmem_enabled_store(struct kobject *kobj, huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY) return -EINVAL; + /* Do not override huge allocation policy with non-PMD sized mTHP */ + if (huge == SHMEM_HUGE_FORCE && + huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER)) + return -EINVAL; + shmem_huge = huge; if (shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; @@ -4738,6 +4756,84 @@ static ssize_t shmem_enabled_store(struct kobject *kobj, } struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled); +static DEFINE_SPINLOCK(huge_shmem_orders_lock); + +static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int order = to_thpsize(kobj)->order; + const char *output; + + if (test_bit(order, &huge_shmem_orders_always)) + output = "[always] inherit within_size advise never"; + else if (test_bit(order, &huge_shmem_orders_inherit)) + output = "always [inherit] within_size advise never"; + else if (test_bit(order, &huge_shmem_orders_within_size)) + output = "always inherit [within_size] advise never"; + else if (test_bit(order, &huge_shmem_orders_madvise)) + output = "always inherit within_size [advise] never"; + else + output = "always inherit within_size advise [never]"; + + return sysfs_emit(buf, "%s\n", output); +} + +static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int order = to_thpsize(kobj)->order; + ssize_t ret = count; + + if (sysfs_streq(buf, "always")) { + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_inherit); + clear_bit(order, &huge_shmem_orders_madvise); + clear_bit(order, &huge_shmem_orders_within_size); + set_bit(order, &huge_shmem_orders_always); + spin_unlock(&huge_shmem_orders_lock); + } else if (sysfs_streq(buf, "inherit")) { + /* Do not override huge allocation policy with non-PMD sized mTHP */ + if (shmem_huge == SHMEM_HUGE_FORCE && + order != HPAGE_PMD_ORDER) + return -EINVAL; + + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_always); + clear_bit(order, &huge_shmem_orders_madvise); + clear_bit(order, &huge_shmem_orders_within_size); + set_bit(order, &huge_shmem_orders_inherit); + spin_unlock(&huge_shmem_orders_lock); + } else if (sysfs_streq(buf, "within_size")) { + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_always); + clear_bit(order, &huge_shmem_orders_inherit); + clear_bit(order, &huge_shmem_orders_madvise); + set_bit(order, &huge_shmem_orders_within_size); + spin_unlock(&huge_shmem_orders_lock); + } else if (sysfs_streq(buf, "madvise")) { + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_always); + clear_bit(order, &huge_shmem_orders_inherit); + clear_bit(order, &huge_shmem_orders_within_size); + set_bit(order, &huge_shmem_orders_madvise); + spin_unlock(&huge_shmem_orders_lock); + } else if (sysfs_streq(buf, "never")) { + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_always); + clear_bit(order, &huge_shmem_orders_inherit); + clear_bit(order, &huge_shmem_orders_within_size); + clear_bit(order, &huge_shmem_orders_madvise); + spin_unlock(&huge_shmem_orders_lock); + } else { + ret = -EINVAL; + } + + return ret; +} + +struct kobj_attribute thpsize_shmem_enabled_attr = + __ATTR(shmem_enabled, 0644, thpsize_shmem_enabled_show, thpsize_shmem_enabled_store); 
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */ #else /* !CONFIG_SHMEM */ -- cgit v1.2.3 From 66f44583f9b617d74ffa2487e75a9c3adf344ddb Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 11 Jun 2024 18:11:10 +0800 Subject: mm: shmem: add mTHP counters for anonymous shmem Add mTHP counters for anonymous shmem. [baolin.wang@linux.alibaba.com: update Documentation/admin-guide/mm/transhuge.rst] Link: https://lkml.kernel.org/r/d86e2e7f-4141-432b-b2ba-c6691f36ef0b@linux.alibaba.com Link: https://lkml.kernel.org/r/4fd9e467d49ae4a747e428bcd821c7d13125ae67.1718090413.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Lance Yang Cc: Barry Song Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kefeng Wang Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 13 +++++++++++++ include/linux/huge_mm.h | 3 +++ mm/huge_memory.c | 6 ++++++ mm/shmem.c | 18 +++++++++++++++--- 4 files changed, 37 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index e7232b46fe14..1f72b00af5d3 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -501,6 +501,19 @@ swpout_fallback Usually because failed to allocate some continuous swap space for the huge page. +file_alloc + is incremented every time a file huge page is successfully + allocated. + +file_fallback + is incremented if a file huge page is attempted to be allocated + but fails and instead falls back to using small pages. + +file_fallback_charge + is incremented if a file huge page cannot be charged and instead + falls back to using small pages even though the allocation was + successful. + As the system ages, allocating huge pages may be expensive as the system uses memory compaction to copy data around memory to free a huge page for use. 
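A small sketch of reading one of the new counters from userspace, assuming it is exposed under the per-size stats directory like the existing mTHP counters (the 64kB size and exact path are assumptions): ::

  #include <stdio.h>

  int main(void)
  {
          /* Example path; the size and the counter's location are assumptions. */
          const char *path =
                  "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/file_alloc";
          unsigned long long count;
          FILE *f = fopen(path, "r");

          if (!f || fscanf(f, "%llu", &count) != 1) {
                  perror(path);
                  return 1;
          }
          printf("file_alloc: %llu\n", count);
          fclose(f);
          return 0;
  }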
There are some counters in ``/proc/vmstat`` to help diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 53b7a137f460..7ad41de5eaea 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -281,6 +281,9 @@ enum mthp_stat_item { MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, MTHP_STAT_SWPOUT, MTHP_STAT_SWPOUT_FALLBACK, + MTHP_STAT_FILE_ALLOC, + MTHP_STAT_FILE_FALLBACK, + MTHP_STAT_FILE_FALLBACK_CHARGE, __MTHP_STAT_COUNT }; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index af6f239b0f97..f55362a73e2d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -556,6 +556,9 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT); DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); +DEFINE_MTHP_STAT_ATTR(file_alloc, MTHP_STAT_FILE_ALLOC); +DEFINE_MTHP_STAT_ATTR(file_fallback, MTHP_STAT_FILE_FALLBACK); +DEFINE_MTHP_STAT_ATTR(file_fallback_charge, MTHP_STAT_FILE_FALLBACK_CHARGE); static struct attribute *stats_attrs[] = { &anon_fault_alloc_attr.attr, @@ -563,6 +566,9 @@ static struct attribute *stats_attrs[] = { &anon_fault_fallback_charge_attr.attr, &swpout_attr.attr, &swpout_fallback_attr.attr, + &file_alloc_attr.attr, + &file_fallback_attr.attr, + &file_fallback_charge_attr.attr, NULL, }; diff --git a/mm/shmem.c b/mm/shmem.c index 224162aa204b..d09c6bf1f28a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1772,6 +1772,9 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, if (pages == HPAGE_PMD_NR) count_vm_event(THP_FILE_FALLBACK); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + count_mthp_stat(order, MTHP_STAT_FILE_FALLBACK); +#endif order = next_order(&suitable_orders, order); } } else { @@ -1791,9 +1794,15 @@ allocated: if (xa_find(&mapping->i_pages, &index, index + pages - 1, XA_PRESENT)) { error = -EEXIST; - } else if (pages == HPAGE_PMD_NR) { - count_vm_event(THP_FILE_FALLBACK); - count_vm_event(THP_FILE_FALLBACK_CHARGE); + } else if (pages > 1) { + if (pages == HPAGE_PMD_NR) { + count_vm_event(THP_FILE_FALLBACK); + count_vm_event(THP_FILE_FALLBACK_CHARGE); + } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + count_mthp_stat(folio_order(folio), MTHP_STAT_FILE_FALLBACK); + count_mthp_stat(folio_order(folio), MTHP_STAT_FILE_FALLBACK_CHARGE); +#endif } goto unlock; } @@ -2167,6 +2176,9 @@ repeat: if (!IS_ERR(folio)) { if (folio_test_pmd_mappable(folio)) count_vm_event(THP_FILE_ALLOC); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + count_mthp_stat(folio_order(folio), MTHP_STAT_FILE_ALLOC); +#endif goto alloced; } if (PTR_ERR(folio) == -EEXIST) -- cgit v1.2.3 From 478fd0d8ec1286cb03cf8ebf6beed46746775c9e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Jun 2024 14:23:57 +0200 Subject: Documentation/admin-guide/mm/pagemap.rst: drop "Using pagemap to do something useful" That example was added in 2008. In 2015, we restricted access to the PFNs in the pagemap to CAP_SYS_ADMIN, making that approach quite less usable. It's 2024 now, and using that racy and low-lewel mechanism to calculate the USS should not be considered a good example anymore. /proc/$pid/smaps and /proc/$pid/smaps_rollup can do a much better job without any of that low-level handling. Let's just drop that example. Link: https://lkml.kernel.org/r/20240607122357.115423-7-david@redhat.com Signed-off-by: David Hildenbrand Cc: Alexey Dobriyan Cc: Jonathan Corbet Cc: Kirill A. 
Shutemov Cc: Lance Yang Cc: Oscar Salvador Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/pagemap.rst | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'Documentation') diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst index f5f065c67615..f2817a801596 100644 --- a/Documentation/admin-guide/mm/pagemap.rst +++ b/Documentation/admin-guide/mm/pagemap.rst @@ -173,27 +173,6 @@ LRU related page flags The page-types tool in the tools/mm directory can be used to query the above flags. -Using pagemap to do something useful -==================================== - -The general procedure for using pagemap to find out about a process' memory -usage goes like this: - - 1. Read ``/proc/pid/maps`` to determine which parts of the memory space are - mapped to what. - 2. Select the maps you are interested in -- all of them, or a particular - library, or the stack or the heap, etc. - 3. Open ``/proc/pid/pagemap`` and seek to the pages you would like to examine. - 4. Read a u64 for each page from pagemap. - 5. Open ``/proc/kpagecount`` and/or ``/proc/kpageflags``. For each PFN you - just read, seek to that entry in the file, and read the data you want. - -For example, to find the "unique set size" (USS), which is the amount of -memory that a process is using that is not shared with any other process, -you can go through every map in the process, find the PFNs, look those up -in kpagecount, and tally up the number of pages that are only referenced -once. - Exceptions for Shared Memory ============================ -- cgit v1.2.3 From e36287c6e12d00f2087dc0c4899d8a9c54e22e1c Mon Sep 17 00:00:00 2001 From: Hyeongtak Ji Date: Fri, 14 Jun 2024 12:00:05 +0900 Subject: mm/damon/sysfs-schemes: add target_nid on sysfs-schemes This patch adds target_nid under /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes// The 'target_nid' can be used as the destination node for DAMOS actions such as DAMOS_MIGRATE_{HOT,COLD} in the follow up patches. [sj@kernel.org: document target_nid file] Link: https://lkml.kernel.org/r/20240618213630.84846-3-sj@kernel.org Link: https://lkml.kernel.org/r/20240614030010.751-4-honggyu.kim@sk.com Signed-off-by: Hyeongtak Ji Signed-off-by: Honggyu Kim Signed-off-by: SeongJae Park Cc: Gregory Price Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Masami Hiramatsu (Google) Cc: Mathieu Desnoyers Cc: Rakie Kim Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-kernel-mm-damon | 6 +++++ include/linux/damon.h | 11 ++++++++- mm/damon/core.c | 5 +++- mm/damon/dbgfs.c | 2 +- mm/damon/lru_sort.c | 3 ++- mm/damon/reclaim.c | 3 ++- mm/damon/sysfs-schemes.c | 33 ++++++++++++++++++++++++- 7 files changed, 57 insertions(+), 6 deletions(-) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index cef6e1d20b18..f1b90cf1249b 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -155,6 +155,12 @@ Contact: SeongJae Park Description: Writing to and reading from this file sets and gets the action of the scheme. +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//target_nid +Date: Jun 2024 +Contact: SeongJae Park +Description: Action's target NUMA node id. Supported by only relevant + actions. 
+ What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//apply_interval_us Date: Sep 2023 Contact: SeongJae Park diff --git a/include/linux/damon.h b/include/linux/damon.h index f7da65e1ac04..21d6b69a015c 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -374,6 +374,7 @@ struct damos_access_pattern { * @apply_interval_us: The time between applying the @action. * @quota: Control the aggressiveness of this scheme. * @wmarks: Watermarks for automated (in)activation of this scheme. + * @target_nid: Destination node if @action is "migrate_{hot,cold}". * @filters: Additional set of &struct damos_filter for &action. * @stat: Statistics of this scheme. * @list: List head for siblings. @@ -389,6 +390,10 @@ struct damos_access_pattern { * monitoring context are inactive, DAMON stops monitoring either, and just * repeatedly checks the watermarks. * + * @target_nid is used to set the migration target node for migrate_hot or + * migrate_cold actions, which means it's only meaningful when @action is either + * "migrate_hot" or "migrate_cold". + * * Before applying the &action to a memory region, &struct damon_operations * implementation could check pages of the region and skip &action to respect * &filters @@ -410,6 +415,9 @@ struct damos { /* public: */ struct damos_quota quota; struct damos_watermarks wmarks; + union { + int target_nid; + }; struct list_head filters; struct damos_stat stat; struct list_head list; @@ -726,7 +734,8 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, enum damos_action action, unsigned long apply_interval_us, struct damos_quota *quota, - struct damos_watermarks *wmarks); + struct damos_watermarks *wmarks, + int target_nid); void damon_add_scheme(struct damon_ctx *ctx, struct damos *s); void damon_destroy_scheme(struct damos *s); diff --git a/mm/damon/core.c b/mm/damon/core.c index 6392f1cc97a3..c0ec5be4f56e 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -354,7 +354,8 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, enum damos_action action, unsigned long apply_interval_us, struct damos_quota *quota, - struct damos_watermarks *wmarks) + struct damos_watermarks *wmarks, + int target_nid) { struct damos *scheme; @@ -381,6 +382,8 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, scheme->wmarks = *wmarks; scheme->wmarks.activated = true; + scheme->target_nid = target_nid; + return scheme; } diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 2461cfe2e968..51a6f1cac385 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -281,7 +281,7 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, pos += parsed; scheme = damon_new_scheme(&pattern, action, 0, "a, - &wmarks); + &wmarks, NUMA_NO_NODE); if (!scheme) goto fail; diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 3de2916a65c3..3775f0f2743d 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -163,7 +163,8 @@ static struct damos *damon_lru_sort_new_scheme( /* under the quota. */ "a, /* (De)activate this according to the watermarks. */ - &damon_lru_sort_wmarks); + &damon_lru_sort_wmarks, + NUMA_NO_NODE); } /* Create a DAMON-based operation scheme for hot memory regions */ diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 9bd341d62b4c..a05ccb41749b 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -177,7 +177,8 @@ static struct damos *damon_reclaim_new_scheme(void) /* under the quota. */ &damon_reclaim_quota, /* (De)activate this according to the watermarks. 
*/ - &damon_reclaim_wmarks); + &damon_reclaim_wmarks, + NUMA_NO_NODE); } static void damon_reclaim_copy_quota_status(struct damos_quota *dst, diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index bea5bc52846a..0632d28b67f8 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -6,6 +6,7 @@ */ #include +#include #include "sysfs-common.h" @@ -1445,6 +1446,7 @@ struct damon_sysfs_scheme { struct damon_sysfs_scheme_filters *filters; struct damon_sysfs_stats *stats; struct damon_sysfs_scheme_regions *tried_regions; + int target_nid; }; /* This should match with enum damos_action */ @@ -1470,6 +1472,7 @@ static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc( scheme->kobj = (struct kobject){}; scheme->action = action; scheme->apply_interval_us = apply_interval_us; + scheme->target_nid = NUMA_NO_NODE; return scheme; } @@ -1692,6 +1695,28 @@ static ssize_t apply_interval_us_store(struct kobject *kobj, return err ? err : count; } +static ssize_t target_nid_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + + return sysfs_emit(buf, "%d\n", scheme->target_nid); +} + +static ssize_t target_nid_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + int err = 0; + + /* TODO: error handling for target_nid range. */ + err = kstrtoint(buf, 0, &scheme->target_nid); + + return err ? err : count; +} + static void damon_sysfs_scheme_release(struct kobject *kobj) { kfree(container_of(kobj, struct damon_sysfs_scheme, kobj)); @@ -1703,9 +1728,13 @@ static struct kobj_attribute damon_sysfs_scheme_action_attr = static struct kobj_attribute damon_sysfs_scheme_apply_interval_us_attr = __ATTR_RW_MODE(apply_interval_us, 0600); +static struct kobj_attribute damon_sysfs_scheme_target_nid_attr = + __ATTR_RW_MODE(target_nid, 0600); + static struct attribute *damon_sysfs_scheme_attrs[] = { &damon_sysfs_scheme_action_attr.attr, &damon_sysfs_scheme_apply_interval_us_attr.attr, + &damon_sysfs_scheme_target_nid_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_scheme); @@ -2031,7 +2060,8 @@ static struct damos *damon_sysfs_mk_scheme( }; scheme = damon_new_scheme(&pattern, sysfs_scheme->action, - sysfs_scheme->apply_interval_us, "a, &wmarks); + sysfs_scheme->apply_interval_us, "a, &wmarks, + sysfs_scheme->target_nid); if (!scheme) return NULL; @@ -2068,6 +2098,7 @@ static void damon_sysfs_update_scheme(struct damos *scheme, scheme->action = sysfs_scheme->action; scheme->apply_interval_us = sysfs_scheme->apply_interval_us; + scheme->target_nid = sysfs_scheme->target_nid; scheme->quota.ms = sysfs_quotas->ms; scheme->quota.sz = sysfs_quotas->sz; -- cgit v1.2.3 From 83d0d46a80340baa9bf7e6039f3ad5d097c1af77 Mon Sep 17 00:00:00 2001 From: Honggyu Kim Date: Fri, 14 Jun 2024 12:00:09 +0900 Subject: Docs/damon: document damos_migrate_{hot,cold} This patch adds damon description for "migrate_hot" and "migrate_cold" actions for both usage and design documents as long as a new "target_nid" knob to set the migration target node. 
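The sysfs-schemes patch above extends damon_new_scheme() with the destination node; a hedged in-kernel sketch of building a migrate_cold scheme targeting NUMA node 1 (the helper name and the pattern, quota and watermark values are placeholders) could be: ::

  #include <linux/damon.h>
  #include <linux/mm.h>

  /* Hypothetical helper; everything except the action and node is a placeholder. */
  static struct damos *example_migrate_cold_scheme(void)
  {
          struct damos_access_pattern pattern = {
                  .min_sz_region = PAGE_SIZE,
                  .max_sz_region = ULONG_MAX,
                  .min_nr_accesses = 0,
                  .max_nr_accesses = 0,
                  .min_age_region = 10,
                  .max_age_region = UINT_MAX,
          };
          struct damos_quota quota = {};
          struct damos_watermarks wmarks = { .metric = DAMOS_WMARK_NONE, };

          /* The final argument is the new target_nid destination node. */
          return damon_new_scheme(&pattern, DAMOS_MIGRATE_COLD, 0,
                                  &quota, &wmarks, 1);
  }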
[sj@kernel.org: trivial fixups for DAMOS_MIGRATE_{HOT,COLD} documentation] Link: https://lkml.kernel.org/r/20240618213630.84846-2-sj@kernel.org Link: https://lkml.kernel.org/r/20240614030010.751-8-honggyu.kim@sk.com Signed-off-by: Honggyu Kim Signed-off-by: SeongJae Park Reviewed-by: SeongJae Park Cc: Gregory Price Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Hyeongtak Ji Cc: Masami Hiramatsu (Google) Cc: Mathieu Desnoyers Cc: Rakie Kim Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 10 +++++++--- Documentation/mm/damon/design.rst | 4 ++++ 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index e58ceb89ea2a..26df6cfa4441 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -78,7 +78,7 @@ comma (","). │ │ │ │ │ │ │ │ ... │ │ │ │ │ │ ... │ │ │ │ │ :ref:`schemes `/nr_schemes - │ │ │ │ │ │ :ref:`0 `/action,apply_interval_us + │ │ │ │ │ │ :ref:`0 `/action,target_nid,apply_interval_us │ │ │ │ │ │ │ :ref:`access_pattern `/ │ │ │ │ │ │ │ │ sz/min,max │ │ │ │ │ │ │ │ nr_accesses/min,max @@ -289,14 +289,18 @@ schemes// ------------ In each scheme directory, five directories (``access_pattern``, ``quotas``, -``watermarks``, ``filters``, ``stats``, and ``tried_regions``) and two files -(``action`` and ``apply_interval``) exist. +``watermarks``, ``filters``, ``stats``, and ``tried_regions``) and three files +(``action``, ``target_nid`` and ``apply_interval``) exist. The ``action`` file is for setting and getting the scheme's :ref:`action `. The keywords that can be written to and read from the file and their meaning are same to those of the list on :ref:`design doc `. +The ``target_nid`` file is for setting the migration target node, which is +only meaningful when the ``action`` is either ``migrate_hot`` or +``migrate_cold``. + The ``apply_interval_us`` file is for setting and getting the scheme's :ref:`apply_interval ` in microseconds. diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 3df387249937..3f12c884eb3a 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -325,6 +325,10 @@ that supports each action are as below. Supported by ``paddr`` operations set. - ``lru_deprio``: Deprioritize the region on its LRU lists. Supported by ``paddr`` operations set. + - ``migrate_hot``: Migrate the regions prioritizing warmer regions. + Supported by ``paddr`` operations set. + - ``migrate_cold``: Migrate the regions prioritizing colder regions. + Supported by ``paddr`` operations set. - ``stat``: Do nothing but count the statistics. Supported by all operations sets. -- cgit v1.2.3 From 2669324b81e5f67d409d3ad988da77c7c2a09045 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 4 Jun 2024 19:48:20 +0800 Subject: mm: remove page_maybe_dma_pinned() After the last user of page_maybe_dma_pinned() is converted to folio_maybe_dma_pinned(), remove page_maybe_dma_pinned() and update the document and comment. 
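Any remaining caller that still holds a struct page can open-code what the removed wrapper did, converting to the folio first; a minimal sketch with a hypothetical function name: ::

  #include <linux/mm.h>

  /* Hypothetical caller-side replacement for the removed wrapper. */
  static bool example_page_maybe_dma_pinned(struct page *page)
  {
          return folio_maybe_dma_pinned(page_folio(page));
  }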
[wangkefeng.wang@huawei.com: fix pin_user_pages.rst underlining] Link: https://lkml.kernel.org/r/61b256c7-4989-44ec-83db-f34a1bd4be2d@huawei.com Link: https://lkml.kernel.org/r/20240604114822.2089819-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: David Hildenbrand Cc: Daniel Vetter Cc: Helge Deller Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- Documentation/core-api/pin_user_pages.rst | 12 ++++++------ include/linux/mm.h | 9 ++------- 2 files changed, 8 insertions(+), 13 deletions(-) (limited to 'Documentation') diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst index 6b5f7e6e7155..57b17978e156 100644 --- a/Documentation/core-api/pin_user_pages.rst +++ b/Documentation/core-api/pin_user_pages.rst @@ -196,20 +196,20 @@ INCORRECT (uses FOLL_GET calls): write to the data within the pages put_page() -page_maybe_dma_pinned(): the whole point of pinning -=================================================== +folio_maybe_dma_pinned(): the whole point of pinning +==================================================== -The whole point of marking pages as "DMA-pinned" or "gup-pinned" is to be able -to query, "is this page DMA-pinned?" That allows code such as page_mkclean() +The whole point of marking folios as "DMA-pinned" or "gup-pinned" is to be able +to query, "is this folio DMA-pinned?" That allows code such as page_mkclean() (and file system writeback code in general) to make informed decisions about -what to do when a page cannot be unmapped due to such pins. +what to do when a folio cannot be unmapped due to such pins. What to do in those cases is the subject of a years-long series of discussions and debates (see the References at the end of this document). It's a TODO item here: fill in the details once that's worked out. Meanwhile, it's safe to say that having this available: :: - static inline bool page_maybe_dma_pinned(struct page *page) + static inline bool folio_maybe_dma_pinned(struct folio *folio) ...is a prerequisite to solving the long-running gup+DMA problem. diff --git a/include/linux/mm.h b/include/linux/mm.h index 4f75fee497f1..cb4aa725252e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1918,8 +1918,8 @@ static inline struct folio *pfn_folio(unsigned long pfn) * * For more information, please see Documentation/core-api/pin_user_pages.rst. * - * Return: True, if it is likely that the page has been "dma-pinned". - * False, if the page is definitely not dma-pinned. + * Return: True, if it is likely that the folio has been "dma-pinned". + * False, if the folio is definitely not dma-pinned. */ static inline bool folio_maybe_dma_pinned(struct folio *folio) { @@ -1938,11 +1938,6 @@ static inline bool folio_maybe_dma_pinned(struct folio *folio) GUP_PIN_COUNTING_BIAS; } -static inline bool page_maybe_dma_pinned(struct page *page) -{ - return folio_maybe_dma_pinned(page_folio(page)); -} - /* * This should most likely only be called during fork() to see whether we * should break the cow immediately for an anon page on the src mm. -- cgit v1.2.3 From a929e0d10f3db1a53668f6b9845db27d7fb63759 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 4 Jun 2024 19:48:22 +0800 Subject: mm: remove page_mkclean() There are no more users of page_mkclean(), remove it and update the document and comment. 
Link: https://lkml.kernel.org/r/20240604114822.2089819-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: David Hildenbrand Cc: Daniel Vetter Cc: Helge Deller Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- Documentation/core-api/pin_user_pages.rst | 8 ++++---- drivers/video/fbdev/core/fb_defio.c | 4 ++-- include/linux/mm.h | 2 +- include/linux/rmap.h | 4 ---- mm/gup.c | 2 +- mm/mremap.c | 2 +- 6 files changed, 9 insertions(+), 13 deletions(-) (limited to 'Documentation') diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst index 57b17978e156..c16ca163b55e 100644 --- a/Documentation/core-api/pin_user_pages.rst +++ b/Documentation/core-api/pin_user_pages.rst @@ -132,7 +132,7 @@ CASE 1: Direct IO (DIO) ----------------------- There are GUP references to pages that are serving as DIO buffers. These buffers are needed for a relatively short time (so they -are not "long term"). No special synchronization with page_mkclean() or +are not "long term"). No special synchronization with folio_mkclean() or munmap() is provided. Therefore, flags to set at the call site are: :: FOLL_PIN @@ -144,7 +144,7 @@ CASE 2: RDMA ------------ There are GUP references to pages that are serving as DMA buffers. These buffers are needed for a long time ("long term"). No special -synchronization with page_mkclean() or munmap() is provided. Therefore, flags +synchronization with folio_mkclean() or munmap() is provided. Therefore, flags to set at the call site are: :: FOLL_PIN | FOLL_LONGTERM @@ -170,7 +170,7 @@ callback, simply remove the range from the device's page tables. Either way, as long as the driver unpins the pages upon mmu notifier callback, then there is proper synchronization with both filesystem and mm -(page_mkclean(), munmap(), etc). Therefore, neither flag needs to be set. +(folio_mkclean(), munmap(), etc). Therefore, neither flag needs to be set. CASE 4: Pinning for struct page manipulation only ------------------------------------------------- @@ -200,7 +200,7 @@ folio_maybe_dma_pinned(): the whole point of pinning ==================================================== The whole point of marking folios as "DMA-pinned" or "gup-pinned" is to be able -to query, "is this folio DMA-pinned?" That allows code such as page_mkclean() +to query, "is this folio DMA-pinned?" That allows code such as folio_mkclean() (and file system writeback code in general) to make informed decisions about what to do when a folio cannot be unmapped due to such pins. diff --git a/drivers/video/fbdev/core/fb_defio.c b/drivers/video/fbdev/core/fb_defio.c index c9c8e294b7e7..d38998714215 100644 --- a/drivers/video/fbdev/core/fb_defio.c +++ b/drivers/video/fbdev/core/fb_defio.c @@ -113,7 +113,7 @@ static vm_fault_t fb_deferred_io_fault(struct vm_fault *vmf) printk(KERN_ERR "no mapping available\n"); BUG_ON(!page->mapping); - page->index = vmf->pgoff; /* for page_mkclean() */ + page->index = vmf->pgoff; /* for folio_mkclean() */ vmf->page = page; return 0; @@ -161,7 +161,7 @@ static vm_fault_t fb_deferred_io_track_page(struct fb_info *info, unsigned long /* * We want the page to remain locked from ->page_mkwrite until - * the PTE is marked dirty to avoid page_mkclean() being called + * the PTE is marked dirty to avoid folio_mkclean() being called * before the PTE is updated, which would leave the page ignored * by defio. 
* Do this by locking the page here and informing the caller diff --git a/include/linux/mm.h b/include/linux/mm.h index cb4aa725252e..101945baffc7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1577,7 +1577,7 @@ static inline void put_page(struct page *page) * issue. * * Locking: the lockless algorithm described in folio_try_get_rcu() - * provides safe operation for get_user_pages(), page_mkclean() and + * provides safe operation for get_user_pages(), folio_mkclean() and * other calls that race to set up page table entries. */ #define GUP_PIN_COUNTING_BIAS (1U << 10) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 694d3d886245..980fa5d75d69 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -802,8 +802,4 @@ static inline int folio_mkclean(struct folio *folio) } #endif /* CONFIG_MMU */ -static inline int page_mkclean(struct page *page) -{ - return folio_mkclean(page_folio(page)); -} #endif /* _LINUX_RMAP_H */ diff --git a/mm/gup.c b/mm/gup.c index a01a0c96da31..6ff9f95a99a7 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -378,7 +378,7 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, * 1) This code sees the page as already dirty, so it * skips the call to set_page_dirty(). That could happen * because clear_page_dirty_for_io() called - * page_mkclean(), followed by set_page_dirty(). + * folio_mkclean(), followed by set_page_dirty(). * However, now the page is going to get written back, * which meets the original intention of setting it * dirty, so all is well: clear_page_dirty_for_io() goes diff --git a/mm/mremap.c b/mm/mremap.c index 5f96bc5ee918..e7ae140fc640 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -198,7 +198,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, * PTE. * * NOTE! Both old and new PTL matter: the old one - * for racing with page_mkclean(), the new one to + * for racing with folio_mkclean(), the new one to * make sure the physical page stays valid until * the TLB entry for the old mapping has been * flushed. -- cgit v1.2.3 From 55ccad6fc1a03c814c26f0e6b35db50feda2c59e Mon Sep 17 00:00:00 2001 From: Shubhang Kaushik OS Date: Mon, 10 Jun 2024 17:22:58 +0000 Subject: vmalloc: modify the alloc_vmap_area() error message for better diagnostics 'vmap allocation for size %lu failed: use vmalloc= to increase size' The above warning is seen in the kernel functionality for allocation of the restricted virtual memory range till exhaustion. This message is misleading because 'vmalloc=' is supported on arm32, x86 platforms and is not a valid kernel parameter on a number of other platforms (in particular its not supported on arm64, alpha, loongarch, arc, csky, hexagon, microblaze, mips, nios2, openrisc, parisc, m64k, powerpc, riscv, sh, um, xtensa, s390, sparc). With the update, the output gets modified to include the function parameters along with the start and end of the virtual memory range allowed. 
The warning message after fix on kernel version 6.10.0-rc1+: vmalloc_node_range for size 33619968 failed: Address range restricted between 0xffff800082640000 - 0xffff800084650000 Backtrace with the misleading error message: vmap allocation for size 33619968 failed: use vmalloc= to increase size insmod: vmalloc error: size 33554432, vm_struct allocation failed, mode:0xcc0(GFP_KERNEL), nodemask=(null),cpuset=/,mems_allowed=0 CPU: 46 PID: 1977 Comm: insmod Tainted: G E 6.10.0-rc1+ #79 Hardware name: INGRASYS Yushan Server iSystem TEMP-S000141176+10/Yushan Motherboard, BIOS 2.10.20230517 (SCP: xxx) yyyy/mm/dd Call trace: dump_backtrace+0xa0/0x128 show_stack+0x20/0x38 dump_stack_lvl+0x78/0x90 dump_stack+0x18/0x28 warn_alloc+0x12c/0x1b8 __vmalloc_node_range_noprof+0x28c/0x7e0 custom_init+0xb4/0xfff8 [test_driver] do_one_initcall+0x60/0x290 do_init_module+0x68/0x250 load_module+0x236c/0x2428 init_module_from_file+0x8c/0xd8 __arm64_sys_finit_module+0x1b4/0x388 invoke_syscall+0x78/0x108 el0_svc_common.constprop.0+0x48/0xf0 do_el0_svc+0x24/0x38 el0_svc+0x3c/0x130 el0t_64_sync_handler+0x100/0x130 el0t_64_sync+0x190/0x198 [Shubhang@os.amperecomputing.com: v5] Link: https://lkml.kernel.org/r/CH2PR01MB5894B0182EA0B28DF2EFB916F5C72@CH2PR01MB5894.prod.exchangelabs.com Link: https://lkml.kernel.org/r/MN2PR01MB59025CC02D1D29516527A693F5C62@MN2PR01MB5902.prod.exchangelabs.com Signed-off-by: Shubhang Kaushik Reviewed-by: Christoph Lameter (Ampere) Cc: Christoph Lameter Cc: Guo Ren Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Uladzislau Rezki (Sony) Cc: Xiongwei Song Signed-off-by: Andrew Morton --- Documentation/admin-guide/kernel-parameters.txt | 9 ++++++--- mm/vmalloc.c | 4 ++-- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'Documentation') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 27ec49af1bf2..bc55fb55cd26 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -7220,9 +7220,12 @@ vmalloc=nn[KMG] [KNL,BOOT,EARLY] Forces the vmalloc area to have an exact size of . This can be used to increase - the minimum size (128MB on x86). It can also be - used to decrease the size and leave more room - for directly mapped kernel RAM. + the minimum size (128MB on x86, arm32 platforms). + It can also be used to decrease the size and leave more room + for directly mapped kernel RAM. Note that this parameter does + not exist on many other platforms (including arm64, alpha, + loongarch, arc, csky, hexagon, microblaze, mips, nios2, openrisc, + parisc, m64k, powerpc, riscv, sh, um, xtensa, s390, sparc). vmcp_cma=nn[MG] [KNL,S390,EARLY] Sets the memory size reserved for contiguous memory diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4d9e1cbba8c2..b4c42da9f390 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2056,8 +2056,8 @@ overflow: } if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) - pr_warn("vmap allocation for size %lu failed: use vmalloc= to increase size\n", - size); + pr_warn("vmalloc_node_range for size %lu failed: Address range restricted to %#lx - %#lx\n", + size, vstart, vend); kmem_cache_free(vmap_area_cachep, va); return ERR_PTR(-EBUSY); -- cgit v1.2.3 From 3fe17dd0969850bb6b80ee343ba75d3d8eb151e1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 21 Jun 2024 09:36:25 -0700 Subject: Docs/mm/damon/maintainer-profile: introduce HacKerMaiL Patch series "Docs/mm/damon/maintaier-profile: document a mailing tool and community meetup series", v2. 
There is a mailing tool that developed and maintained by DAMON maintainer aiming to support DAMON community. Also there are DAMON community meetup series. Both are known to have rooms of improvements in terms of their visibility. Document those on the maintainer's profile document. This patch (of 2): Since DAMON was merged into mainline, I periodically received some questions around DAMON's mailing lists based workflow. The workflow is not different from the normal ones that well documented, but it is also true that it is not always easy and familiar for everyone. I personally overcame it by developing and using a simple tool, named HacKerMaiL (hkml)[1]. Based on my experience, I believe it is matured enough to be used for simple workflows like that of DAMON. Actually some DAMON contributors and Linux kernel developers other than myself told me they are using the tool. As DAMON maintainer, I also believe helping new DAMON community members onboarding to the worklow is one of the most important parts of my responsibilities. For the reason, the tool is announced[2] to support DAMON community. To further increasing the visibility of the fact, document the tool and the support plan on DAMON maintainer's profile. [1] https://github.com/damonitor/hackermail [2] https://github.com/damonitor/hackermail/commit/3909dad91301 Link: https://lkml.kernel.org/r/20240621163626.74815-1-sj@kernel.org Link: https://lkml.kernel.org/r/20240621163626.74815-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Randy Dunlap Signed-off-by: Andrew Morton --- Documentation/mm/damon/maintainer-profile.rst | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'Documentation') diff --git a/Documentation/mm/damon/maintainer-profile.rst b/Documentation/mm/damon/maintainer-profile.rst index 8213cf61d38a..c7db0572faa9 100644 --- a/Documentation/mm/damon/maintainer-profile.rst +++ b/Documentation/mm/damon/maintainer-profile.rst @@ -53,6 +53,22 @@ Mon-Fri) in PT (Pacific Time). The response to patches will occasionally be slow. Do not hesitate to send a ping if you have not heard back within a week of sending a patch. +Mailing tool +------------ + +Like many other Linux kernel subsystems, DAMON uses the mailing lists +(damon@lists.linux.dev and linux-mm@kvack.org) as the major communication +channel. There is a simple tool called HacKerMaiL (``hkml``) [8]_ , which is +for people who are not very familiar with the mailing lists based +communication. The tool could be particularly helpful for DAMON community +members since it is developed and maintained by DAMON maintainer. The tool is +also officially announced to support DAMON and general Linux kernel development +workflow. + +In other words, ``hkml`` [8]_ is a mailing tool for DAMON community, which +DAMON maintainer is committed to support. Please feel free to try and report +issues or feature requests for the tool to the maintainer. + .. [1] https://git.kernel.org/akpm/mm/h/mm-unstable .. [2] https://git.kernel.org/sj/h/damon/next @@ -61,3 +77,4 @@ of sending a patch. .. [5] https://github.com/awslabs/damon-tests/blob/master/corr/tests/kunit.sh .. [6] https://github.com/awslabs/damon-tests/tree/master/corr .. [7] https://github.com/awslabs/damon-tests/tree/master/perf +.. 
[8] https://github.com/damonitor/hackermail -- cgit v1.2.3 From 437881bc1c6b2edc269857874690b359f1ad1985 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 21 Jun 2024 09:36:26 -0700 Subject: Docs/mm/damon/maintainer-profile: document DAMON community meetups DAMON bi-weekly community meetup series has continued since 2022-08-15 for community members who prefer synchronous chat over asynchronous mails. Recently I got some feedbacks about the series from a few people. They told me the series is helpful for understanding of the project and particiapting to the development, but it could be further better in terms of the visibility. Based on that, I started sending meeting reminder for every occurrence. For people who don't subscribe the mailing list, however, adding an announcement on the official document could be helpful. Document the series on DAMON maintainer's profile for the purpose. Link: https://lkml.kernel.org/r/20240621163626.74815-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Randy Dunlap Signed-off-by: Andrew Morton --- Documentation/mm/damon/maintainer-profile.rst | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'Documentation') diff --git a/Documentation/mm/damon/maintainer-profile.rst b/Documentation/mm/damon/maintainer-profile.rst index c7db0572faa9..feccf6a0f6c3 100644 --- a/Documentation/mm/damon/maintainer-profile.rst +++ b/Documentation/mm/damon/maintainer-profile.rst @@ -69,6 +69,24 @@ In other words, ``hkml`` [8]_ is a mailing tool for DAMON community, which DAMON maintainer is committed to support. Please feel free to try and report issues or feature requests for the tool to the maintainer. +Community meetup +---------------- + +DAMON community is maintaining two bi-weekly meetup series for community +members who prefer synchronous conversations over mails. + +The first one is for any discussion between every community member. No +reservation is needed. + +The seconds one is for discussions on specific topics between restricted +members including the maintainer. The maintainer shares the available time +slots, and attendees should reserve one of those at least 24 hours before the +time slot, by reaching out to the maintainer. + +Schedules and available reservation time slots are available at the Google doc +[9]_ . DAMON maintainer will also provide periodic reminder to the mailing +list (damon@lists.linux.dev). + .. [1] https://git.kernel.org/akpm/mm/h/mm-unstable .. [2] https://git.kernel.org/sj/h/damon/next @@ -78,3 +96,4 @@ issues or feature requests for the tool to the maintainer. .. [6] https://github.com/awslabs/damon-tests/tree/master/corr .. [7] https://github.com/awslabs/damon-tests/tree/master/perf .. [8] https://github.com/damonitor/hackermail +.. [9] https://docs.google.com/document/d/1v43Kcj3ly4CYqmAkMaZzLiM2GEnWfgdGbZAH3mi2vpM/edit?usp=sharing -- cgit v1.2.3 From ec3e837d8fd96c68599b2861dd412094b7bc335c Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 21 Jun 2024 13:34:55 +0200 Subject: kmsan: allow disabling KMSAN checks for the current task Like for KASAN, it's useful to temporarily disable KMSAN checks around, e.g., redzone accesses. Introduce kmsan_disable_current() and kmsan_enable_current(), which are similar to their KASAN counterparts. Make them reentrant in order to handle memory allocations in interrupt context. Repurpose the allow_reporting field for this. 
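A short sketch of the intended usage pattern, keeping the disabled region minimal; the function and parameter names are invented for illustration: ::

  #include <linux/kmsan.h>
  #include <linux/types.h>

  /* Hypothetical helper, for illustration only. */
  static u8 example_peek_redzone_byte(const u8 *redzone)
  {
          u8 val;

          kmsan_disable_current();
          val = *redzone; /* deliberately read possibly uninitialized data */
          kmsan_enable_current();

          return val;
  }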
Link: https://lkml.kernel.org/r/20240621113706.315500-12-iii@linux.ibm.com Signed-off-by: Ilya Leoshkevich Reviewed-by: Alexander Potapenko Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Heiko Carstens Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Cc: Marco Elver Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Pekka Enberg Cc: Roman Gushchin Cc: Steven Rostedt (Google) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/dev-tools/kmsan.rst | 11 +++++++++-- include/linux/kmsan.h | 24 ++++++++++++++++++++++++ include/linux/kmsan_types.h | 2 +- mm/kmsan/core.c | 1 - mm/kmsan/hooks.c | 18 +++++++++++++++--- mm/kmsan/report.c | 7 ++++--- tools/objtool/check.c | 2 ++ 7 files changed, 55 insertions(+), 10 deletions(-) (limited to 'Documentation') diff --git a/Documentation/dev-tools/kmsan.rst b/Documentation/dev-tools/kmsan.rst index 323eedad53cd..6a48d96c5c85 100644 --- a/Documentation/dev-tools/kmsan.rst +++ b/Documentation/dev-tools/kmsan.rst @@ -110,6 +110,13 @@ in the Makefile. Think of this as applying ``__no_sanitize_memory`` to every function in the file or directory. Most users won't need KMSAN_SANITIZE, unless their code gets broken by KMSAN (e.g. runs at early boot time). +KMSAN checks can also be temporarily disabled for the current task using +``kmsan_disable_current()`` and ``kmsan_enable_current()`` calls. Each +``kmsan_enable_current()`` call must be preceded by a +``kmsan_disable_current()`` call; these call pairs may be nested. One needs to +be careful with these calls, keeping the regions short and preferring other +ways to disable instrumentation, where possible. + Support ======= @@ -338,11 +345,11 @@ Per-task KMSAN state ~~~~~~~~~~~~~~~~~~~~ Every task_struct has an associated KMSAN task state that holds the KMSAN -context (see above) and a per-task flag disallowing KMSAN reports:: +context (see above) and a per-task counter disallowing KMSAN reports:: struct kmsan_context { ... - bool allow_reporting; + unsigned int depth; struct kmsan_context_state cstate; ... } diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index fe6c2212bdb1..14b5ea6d3a43 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -239,6 +239,22 @@ void kmsan_unpoison_entry_regs(const struct pt_regs *regs); */ void *kmsan_get_metadata(void *addr, bool is_origin); +/** + * kmsan_enable_current(): Enable KMSAN for the current task. + * + * Each kmsan_enable_current() current call must be preceded by a + * kmsan_disable_current() call. These call pairs may be nested. + */ +void kmsan_enable_current(void); + +/** + * kmsan_disable_current(): Disable KMSAN for the current task. + * + * Each kmsan_disable_current() current call must be followed by a + * kmsan_enable_current() call. These call pairs may be nested. 
+ */ +void kmsan_disable_current(void); + #else static inline void kmsan_init_shadow(void) @@ -338,6 +354,14 @@ static inline void kmsan_unpoison_entry_regs(const struct pt_regs *regs) { } +static inline void kmsan_enable_current(void) +{ +} + +static inline void kmsan_disable_current(void) +{ +} + #endif #endif /* _LINUX_KMSAN_H */ diff --git a/include/linux/kmsan_types.h b/include/linux/kmsan_types.h index 929287981afe..dfc59918b3c0 100644 --- a/include/linux/kmsan_types.h +++ b/include/linux/kmsan_types.h @@ -31,7 +31,7 @@ struct kmsan_context_state { struct kmsan_ctx { struct kmsan_context_state cstate; int kmsan_in_runtime; - bool allow_reporting; + unsigned int depth; }; #endif /* _LINUX_KMSAN_TYPES_H */ diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c index 95f859e38c53..81b22220711a 100644 --- a/mm/kmsan/core.c +++ b/mm/kmsan/core.c @@ -43,7 +43,6 @@ void kmsan_internal_task_create(struct task_struct *task) struct thread_info *info = current_thread_info(); __memset(ctx, 0, sizeof(*ctx)); - ctx->allow_reporting = true; kmsan_internal_unpoison_memory(info, sizeof(*info), false); } diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index b408714f9ba3..267d0afa2e8b 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -39,12 +39,10 @@ void kmsan_task_create(struct task_struct *task) void kmsan_task_exit(struct task_struct *task) { - struct kmsan_ctx *ctx = &task->kmsan_ctx; - if (!kmsan_enabled || kmsan_in_runtime()) return; - ctx->allow_reporting = false; + kmsan_disable_current(); } void kmsan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags) @@ -424,3 +422,17 @@ void kmsan_check_memory(const void *addr, size_t size) REASON_ANY); } EXPORT_SYMBOL(kmsan_check_memory); + +void kmsan_enable_current(void) +{ + KMSAN_WARN_ON(current->kmsan_ctx.depth == 0); + current->kmsan_ctx.depth--; +} +EXPORT_SYMBOL(kmsan_enable_current); + +void kmsan_disable_current(void) +{ + current->kmsan_ctx.depth++; + KMSAN_WARN_ON(current->kmsan_ctx.depth == 0); +} +EXPORT_SYMBOL(kmsan_disable_current); diff --git a/mm/kmsan/report.c b/mm/kmsan/report.c index c79d3b0d2d0d..92e73ec61435 100644 --- a/mm/kmsan/report.c +++ b/mm/kmsan/report.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include @@ -158,12 +159,12 @@ void kmsan_report(depot_stack_handle_t origin, void *address, int size, if (!kmsan_enabled) return; - if (!current->kmsan_ctx.allow_reporting) + if (current->kmsan_ctx.depth) return; if (!origin) return; - current->kmsan_ctx.allow_reporting = false; + kmsan_disable_current(); ua_flags = user_access_save(); raw_spin_lock(&kmsan_report_lock); pr_err("=====================================================\n"); @@ -216,5 +217,5 @@ void kmsan_report(depot_stack_handle_t origin, void *address, int size, if (panic_on_kmsan) panic("kmsan.panic set ...\n"); user_access_restore(ua_flags); - current->kmsan_ctx.allow_reporting = true; + kmsan_enable_current(); } diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 0a33d9195b7a..01237d167223 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1202,6 +1202,8 @@ static const char *uaccess_safe_builtin[] = { "__sanitizer_cov_trace_switch", /* KMSAN */ "kmsan_copy_to_user", + "kmsan_disable_current", + "kmsan_enable_current", "kmsan_report", "kmsan_unpoison_entry_regs", "kmsan_unpoison_memory", -- cgit v1.2.3 From 68cd9050d871e4db5433420b5ceb32f5512d18bc Mon Sep 17 00:00:00 2001 From: Dan Schatzberg Date: Wed, 3 Jan 2024 08:48:37 -0800 Subject: mm: add swappiness= arg to memory.reclaim Allow proactive reclaimers to submit 
an additional swappiness= argument to memory.reclaim. This overrides the global or per-memcg swappiness setting for that reclaim attempt. For example: echo "2M swappiness=0" > /sys/fs/cgroup/memory.reclaim will perform reclaim on the rootcg with a swappiness setting of 0 (no swap) regardless of the vm.swappiness sysctl setting. Userspace proactive reclaimers use the memory.reclaim interface to trigger reclaim. The memory.reclaim interface does not allow for any way to affect the balance of file vs anon during proactive reclaim. The only approach is to adjust the vm.swappiness setting. However, there are a few reasons we look to control the balance of file vs anon during proactive reclaim, separately from reactive reclaim: * Swapout should be limited to manage SSD write endurance. In near-OOM situations we are fine with lots of swap-out to avoid OOMs. As these are typically rare events, they have relatively little impact on write endurance. However, proactive reclaim runs continuously and so its impact on SSD write endurance is more significant. Therefore it is desirable to control swap-out for proactive reclaim separately from reactive reclaim. * Some userspace OOM killers like systemd-oomd[1] support OOM killing on swap exhaustion. This makes sense if the swap exhaustion is triggered due to reactive reclaim but less so if it is triggered due to proactive reclaim (e.g. one could see OOMs when free memory is ample but anon is just particularly cold). Therefore, it's desirable to have proactive reclaim reduce or stop swap-out before the threshold at which OOM killing occurs. In the case of Meta's Senpai proactive reclaimer, we adjust vm.swappiness before writes to memory.reclaim[2]. This has been in production for nearly two years and has addressed our needs to control proactive vs reactive reclaim behavior but is still not ideal for a number of reasons: * vm.swappiness is a global setting; adjusting it can race/interfere with other system administration that wishes to control vm.swappiness. In our case, we need to disable Senpai before adjusting vm.swappiness. * vm.swappiness is stateful - so a crash or restart of Senpai can leave a misconfigured setting. This requires some additional management to record the "desired" setting and ensure Senpai always adjusts to it. With this patch, we avoid these downsides of adjusting vm.swappiness globally.
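To illustrate the intended use from a proactive reclaimer (a hypothetical userspace sketch added by the editor; the cgroup path, request size and error handling are example assumptions, not part of this patch):

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    /* One proactive-reclaim step: ask for 64M of reclaim with swap disabled for this attempt. */
    static int proactive_reclaim_step(const char *cgroup_dir)
    {
        const char *request = "64M swappiness=0";
        char path[4096];
        ssize_t written;
        int fd;

        snprintf(path, sizeof(path), "%s/memory.reclaim", cgroup_dir);
        fd = open(path, O_WRONLY);
        if (fd < 0)
            return -1;
        /* The kernel returns -EAGAIN (the write fails) if less than requested was reclaimed. */
        written = write(fd, request, strlen(request));
        close(fd);
        return written < 0 ? -1 : 0;
    }

A daemon would call something like proactive_reclaim_step("/sys/fs/cgroup/workload.slice") periodically, leaving vm.swappiness untouched for reactive reclaim.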
[1]https://www.freedesktop.org/software/systemd/man/latest/systemd-oomd.service.html [2]https://github.com/facebookincubator/oomd/blob/main/src/oomd/plugins/Senpai.cpp#L585-L598 Link: https://lkml.kernel.org/r/20240103164841.2800183-3-schatzberg.dan@gmail.com Signed-off-by: Dan Schatzberg Suggested-by: Yosry Ahmed Acked-by: Michal Hocko Acked-by: David Rientjes Acked-by: Chris Li Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Shakeel Butt Cc: Tejun Heo Cc: Yue Zhao Cc: Zefan Li Cc: Nhat Pham Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 18 ++++++----- include/linux/swap.h | 3 +- mm/memcontrol-v1.c | 4 +-- mm/memcontrol.c | 53 +++++++++++++++++++++++++++------ mm/vmscan.c | 25 ++++++++++++++-- 5 files changed, 81 insertions(+), 22 deletions(-) (limited to 'Documentation') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 8fbb0519d556..3ef815845003 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1299,17 +1299,10 @@ PAGE_SIZE multiple when read back. This is a simple interface to trigger memory reclaim in the target cgroup. - This file accepts a single key, the number of bytes to reclaim. - No nested keys are currently supported. - Example:: echo "1G" > memory.reclaim - The interface can be later extended with nested keys to - configure the reclaim behavior. For example, specify the - type of memory to reclaim from (anon, file, ..). - Please note that the kernel can over or under reclaim from the target cgroup. If less bytes are reclaimed than the specified amount, -EAGAIN is returned. @@ -1321,6 +1314,17 @@ PAGE_SIZE multiple when read back. This means that the networking layer will not adapt based on reclaim induced by memory.reclaim. +The following nested keys are defined. + + ========== ================================ + swappiness Swappiness value to reclaim with + ========== ================================ + + Specifying a swappiness value instructs the kernel to perform + the reclaim with that swappiness value. Note that this has the + same semantics as vm.swappiness applied to memcg reclaim with + all the existing limitations and potential future extensions. + memory.peak A read-only single value file which exists on non-root cgroups. diff --git a/include/linux/swap.h b/include/linux/swap.h index e97db0fa7659..3df75d62a835 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -409,7 +409,8 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - unsigned int reclaim_options); + unsigned int reclaim_options, + int *swappiness); extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 98da8d0af4d0..a3ea4a5e8c96 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -2241,7 +2241,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg, } if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, - memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) { + memsw ? 
0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) { ret = -EBUSY; break; } @@ -2273,7 +2273,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) return -EINTR; if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, - MEMCG_RECLAIM_MAY_SWAP)) + MEMCG_RECLAIM_MAY_SWAP, NULL)) nr_retries--; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 53a8491634b9..6fb1c933e03a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -1921,7 +1922,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg, psi_memstall_enter(&pflags); nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, - MEMCG_RECLAIM_MAY_SWAP); + MEMCG_RECLAIM_MAY_SWAP, + NULL); psi_memstall_leave(&pflags); } while ((memcg = parent_mem_cgroup(memcg)) && !mem_cgroup_is_root(memcg)); @@ -2227,7 +2229,7 @@ retry: psi_memstall_enter(&pflags); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, - gfp_mask, reclaim_options); + gfp_mask, reclaim_options, NULL); psi_memstall_leave(&pflags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) @@ -4097,7 +4099,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, } reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP); + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL); if (!reclaimed && !nr_retries--) break; @@ -4146,7 +4148,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (nr_reclaims) { if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP)) + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL)) nr_reclaims--; continue; } @@ -4276,19 +4278,50 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of, return nbytes; } +enum { + MEMORY_RECLAIM_SWAPPINESS = 0, + MEMORY_RECLAIM_NULL, +}; + +static const match_table_t tokens = { + { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"}, + { MEMORY_RECLAIM_NULL, NULL }, +}; + static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); unsigned int nr_retries = MAX_RECLAIM_RETRIES; unsigned long nr_to_reclaim, nr_reclaimed = 0; + int swappiness = -1; unsigned int reclaim_options; - int err; + char *old_buf, *start; + substring_t args[MAX_OPT_ARGS]; buf = strstrip(buf); - err = page_counter_memparse(buf, "", &nr_to_reclaim); - if (err) - return err; + + old_buf = buf; + nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE; + if (buf == old_buf) + return -EINVAL; + + buf = strstrip(buf); + + while ((start = strsep(&buf, " ")) != NULL) { + if (!strlen(start)) + continue; + switch (match_token(start, tokens, args)) { + case MEMORY_RECLAIM_SWAPPINESS: + if (match_int(&args[0], &swappiness)) + return -EINVAL; + if (swappiness < MIN_SWAPPINESS || swappiness > MAX_SWAPPINESS) + return -EINVAL; + break; + default: + return -EINVAL; + } + } reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; while (nr_reclaimed < nr_to_reclaim) { @@ -4308,7 +4341,9 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, lru_add_drain_all(); reclaimed = try_to_free_mem_cgroup_pages(memcg, - batch_size, GFP_KERNEL, reclaim_options); + batch_size, GFP_KERNEL, + reclaim_options, + swappiness == -1 ? 
NULL : &swappiness); if (!reclaimed && !nr_retries--) return -EAGAIN; diff --git a/mm/vmscan.c b/mm/vmscan.c index 91f99558650d..3d4c681c6d40 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -92,6 +92,11 @@ struct scan_control { unsigned long anon_cost; unsigned long file_cost; +#ifdef CONFIG_MEMCG + /* Swappiness value for proactive reclaim. Always use sc_swappiness()! */ + int *proactive_swappiness; +#endif + /* Can active folios be deactivated as part of reclaim? */ #define DEACTIVATE_ANON 1 #define DEACTIVATE_FILE 2 @@ -236,6 +241,13 @@ static bool writeback_throttling_sane(struct scan_control *sc) #endif return false; } + +static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg) +{ + if (sc->proactive && sc->proactive_swappiness) + return *sc->proactive_swappiness; + return mem_cgroup_swappiness(memcg); +} #else static bool cgroup_reclaim(struct scan_control *sc) { @@ -251,6 +263,11 @@ static bool writeback_throttling_sane(struct scan_control *sc) { return true; } + +static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg) +{ + return READ_ONCE(vm_swappiness); +} #endif static void set_task_reclaim_state(struct task_struct *task, @@ -2351,7 +2368,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct mem_cgroup *memcg = lruvec_memcg(lruvec); unsigned long anon_cost, file_cost, total_cost; - int swappiness = mem_cgroup_swappiness(memcg); + int swappiness = sc_swappiness(sc, memcg); u64 fraction[ANON_AND_FILE]; u64 denominator = 0; /* gcc */ enum scan_balance scan_balance; @@ -2632,7 +2649,7 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) return 0; - return mem_cgroup_swappiness(memcg); + return sc_swappiness(sc, memcg); } static int get_nr_gens(struct lruvec *lruvec, int type) @@ -6549,12 +6566,14 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - unsigned int reclaim_options) + unsigned int reclaim_options, + int *swappiness) { unsigned long nr_reclaimed; unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), + .proactive_swappiness = swappiness, .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), .reclaim_idx = MAX_NR_ZONES - 1, -- cgit v1.2.3 From 44195d1eba826a8af0afbcfa69ab4cc26f6ead7f Mon Sep 17 00:00:00 2001 From: Jiaqi Yan Date: Wed, 26 Jun 2024 05:08:18 +0000 Subject: docs: mm: add enable_soft_offline sysctl Add the documentation for soft offline behaviors / costs, and what the new enable_soft_offline sysctl is for. 
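For illustration only (an editor-added sketch, not part of this patch), a RAS daemon that prefers to keep its hugepage capacity could flip the knob from C; the helper simply writes the documented 0/1 values to the sysctl file under /proc/sys/vm:

    #include <fcntl.h>
    #include <unistd.h>

    /* Write "0" (soft-offline requests get EOPNOTSUPP) or "1" (default: honored). */
    static int set_enable_soft_offline(int enable)
    {
        char val = enable ? '1' : '0';
        int fd = open("/proc/sys/vm/enable_soft_offline", O_WRONLY);
        ssize_t ret;

        if (fd < 0)
            return -1;
        ret = write(fd, &val, 1);
        close(fd);
        return ret == 1 ? 0 : -1;
    }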
[jiaqiyan@google.com: fix kerneldoc warnings] Link: https://lkml.kernel.org/r/CACw3F52=GxTCDw-PqFh3-GDM-fo3GbhGdu0hedxYXOTT4TQSTg@mail.gmail.com [jiaqiyan@google.com: there are more blank lines needed] Link: https://lkml.kernel.org/r/CACw3F52_obAB742XeDRNun4BHBYtrxtbvp5NkUincXdaob0j1g@mail.gmail.com Link: https://lkml.kernel.org/r/20240626050818.2277273-5-jiaqiyan@google.com Signed-off-by: Jiaqi Yan Acked-by: Oscar Salvador Acked-by: Miaohe Lin Acked-by: David Rientjes Cc: Frank van der Linden Cc: Jane Chu Cc: Jonathan Corbet Cc: Lance Yang Cc: Muchun Song Cc: Naoya Horiguchi Cc: Randy Dunlap Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/admin-guide/sysctl/vm.rst | 38 +++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'Documentation') diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index e86c968a7a0e..f48eaa98d22d 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -36,6 +36,7 @@ Currently, these files are in /proc/sys/vm: - dirtytime_expire_seconds - dirty_writeback_centisecs - drop_caches +- enable_soft_offline - extfrag_threshold - highmem_is_dirtyable - hugetlb_shm_group @@ -267,6 +268,43 @@ used:: These are informational only. They do not mean that anything is wrong with your system. To disable them, echo 4 (bit 2) into drop_caches. +enable_soft_offline +=================== +Correctable memory errors are very common on servers. Soft-offline is kernel's +solution for memory pages having (excessive) corrected memory errors. + +For different types of page, soft-offline has different behaviors / costs. + +- For a raw error page, soft-offline migrates the in-use page's content to + a new raw page. + +- For a page that is part of a transparent hugepage, soft-offline splits the + transparent hugepage into raw pages, then migrates only the raw error page. + As a result, user is transparently backed by 1 less hugepage, impacting + memory access performance. + +- For a page that is part of a HugeTLB hugepage, soft-offline first migrates + the entire HugeTLB hugepage, during which a free hugepage will be consumed + as migration target. Then the original hugepage is dissolved into raw + pages without compensation, reducing the capacity of the HugeTLB pool by 1. + +It is user's call to choose between reliability (staying away from fragile +physical memory) vs performance / capacity implications in transparent and +HugeTLB cases. + +For all architectures, enable_soft_offline controls whether to soft offline +memory pages. When set to 1, kernel attempts to soft offline the pages +whenever it thinks needed. When set to 0, kernel returns EOPNOTSUPP to +the request to soft offline the pages. Its default value is 1. + +It is worth mentioning that after setting enable_soft_offline to 0, the +following requests to soft offline pages will not be performed: + +- Request to soft offline pages from RAS Correctable Errors Collector. + +- On ARM, the request to soft offline pages from GHES driver. + +- On PARISC, the request to soft offline pages from Page Deallocation Table. extfrag_threshold ================= -- cgit v1.2.3 From efeacc2cb6b7a0f44757d7d50069189d592c2d5c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 1 Jul 2024 12:26:58 -0700 Subject: Docs/mm/damon/design: fix two typos Patch series "Docs/damon: minor fixups and improvements". 
Fixup typos, clarify regions merging operation design with recent change, add access pattern snapshot example use case, and improve readability of the design document and subsystem documents index by reorganizing/wordsmithing and adding links to other sections and/or documents for easy browsing. This patch (of 9): Fix two typos. The first one is just a simple typo: s/accurach/accuracy/ The second one is made by the author being out of their mind. 'Region Based Sampling' section of the doc is mistakenly calling the access frequency counter of region as 'nr_regions'. Fix it with the correct name, 'nr_accesses'. Link: https://lkml.kernel.org/r/20240701192706.51415-1-sj@kernel.org Link: https://lkml.kernel.org/r/20240701192706.51415-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 3f12c884eb3a..6beb245cbb62 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -25,7 +25,7 @@ DAMON subsystem is configured with three layers including - Operations Set: Implements fundamental operations for DAMON that depends on the given monitoring target address-space and available set of software/hardware primitives, -- Core: Implements core logics including monitoring overhead/accurach control +- Core: Implements core logics including monitoring overhead/accuracy control and access-aware system operations on top of the operations set layer, and - Modules: Implements kernel modules for various purposes that provides interfaces for the user space, on top of the core layer. @@ -192,7 +192,7 @@ one page in the region is required to be checked. Thus, for each ``sampling interval``, DAMON randomly picks one page in each region, waits for one ``sampling interval``, checks whether the page is accessed meanwhile, and increases the access frequency counter of the region if so. The counter is -called ``nr_regions`` of the region. Therefore, the monitoring overhead is +called ``nr_accesses`` of the region. Therefore, the monitoring overhead is controllable by setting the number of regions. DAMON allows users to set the minimum and the maximum number of regions for the trade-off. -- cgit v1.2.3 From 2081610d98abefdfbbcfd710d65309bccd2e2dfe Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 1 Jul 2024 12:26:59 -0700 Subject: Docs/mm/damon/design: clarify regions merging operation DAMON design document is not explaining how min_nr_regions limit is kept, and what happens if the number of regions exceeds max_nr_regions. Add more clarification for those. Link: https://lkml.kernel.org/r/20240701192706.51415-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'Documentation') diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 6beb245cbb62..fe08a3796e60 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -209,11 +209,18 @@ the data access pattern can be dynamically changed. This will result in low monitoring quality. To keep the assumption as much as possible, DAMON adaptively merges and splits each region based on their access frequency. 
-For each ``aggregation interval``, it compares the access frequencies of -adjacent regions and merges those if the frequency difference is small. Then, -after it reports and clears the aggregated access frequency of each region, it -splits each region into two or three regions if the total number of regions -will not exceed the user-specified maximum number of regions after the split. +For each ``aggregation interval``, it compares the access frequencies +(``nr_accesses``) of adjacent regions. If the difference is small, and if the +sum of the two regions' sizes is smaller than the size of total regions divided +by the ``minimum number of regions``, DAMON merges the two regions. If the +resulting number of total regions is still higher than ``maximum number of +regions``, it repeats the merging with increasing access frequencies difference +threshold until the upper-limit of the number of regions is met, or the +threshold becomes higher than possible maximum value (``aggregation interval`` +divided by ``sampling interval``). Then, after it reports and clears the +aggregated access frequency of each region, it splits each region into two or +three regions if the total number of regions will not exceed the user-specified +maximum number of regions after the split. In this way, DAMON provides its best-effort quality and minimal overhead while keeping the bounds users set for their trade-off. -- cgit v1.2.3 From 752f18b9d9c23791b0e9fc70054b972eb176796b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 1 Jul 2024 12:27:00 -0700 Subject: Docs/admin-guide/mm/damon/start: add access pattern snapshot example The DAMON user-space tool (damo) provides an access pattern snapshot feature, which is expected to be frequently used for real time access pattern analysis. The snapshot output also shows what DAMON provides on its own, including the 'age' information. In contrast, the recorded access patterns, which are shown as an example usage in the quick start section, show what users can make from what DAMON provided. They include information that is generated outside of DAMON and make the 'age' concept a bit unclear. Hence the snapshot output is easier for understanding the raw realtime output of DAMON. Add the snapshot usage example to the quick start section. Link: https://lkml.kernel.org/r/20240701192706.51415-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/start.rst | 46 +++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 4 deletions(-) (limited to 'Documentation') diff --git a/Documentation/admin-guide/mm/damon/start.rst b/Documentation/admin-guide/mm/damon/start.rst index 7aa0071ff1c3..054010a7f3d8 100644 --- a/Documentation/admin-guide/mm/damon/start.rst +++ b/Documentation/admin-guide/mm/damon/start.rst @@ -34,18 +34,56 @@ detail) of DAMON, you should ensure :doc:`sysfs ` is mounted. +Snapshot Data Access Patterns +============================= + +The commands below show the memory access pattern of a program at the moment of +the execution.
:: + + $ git clone https://github.com/sjp38/masim; cd masim; make + $ sudo damo start "./masim ./configs/stairs.cfg --quiet" + $ sudo ./damo show + 0 addr [85.541 TiB , 85.541 TiB ) (57.707 MiB ) access 0 % age 10.400 s + 1 addr [85.541 TiB , 85.542 TiB ) (413.285 MiB) access 0 % age 11.400 s + 2 addr [127.649 TiB , 127.649 TiB) (57.500 MiB ) access 0 % age 1.600 s + 3 addr [127.649 TiB , 127.649 TiB) (32.500 MiB ) access 0 % age 500 ms + 4 addr [127.649 TiB , 127.649 TiB) (9.535 MiB ) access 100 % age 300 ms + 5 addr [127.649 TiB , 127.649 TiB) (8.000 KiB ) access 60 % age 0 ns + 6 addr [127.649 TiB , 127.649 TiB) (6.926 MiB ) access 0 % age 1 s + 7 addr [127.998 TiB , 127.998 TiB) (120.000 KiB) access 0 % age 11.100 s + 8 addr [127.998 TiB , 127.998 TiB) (8.000 KiB ) access 40 % age 100 ms + 9 addr [127.998 TiB , 127.998 TiB) (4.000 KiB ) access 0 % age 11 s + total size: 577.590 MiB + $ sudo ./damo stop + +The first command of the above example downloads and builds an artificial +memory access generator program called ``masim``. The second command asks DAMO +to execute the artificial generator process start via the given command and +make DAMON monitors the generator process. The third command retrieves the +current snapshot of the monitored access pattern of the process from DAMON and +shows the pattern in a human readable format. + +Each line of the output shows which virtual address range (``addr [XX, XX)``) +of the process is how frequently (``access XX %``) accessed for how long time +(``age XX``). For example, the fifth region of ~9 MiB size is being most +frequently accessed for last 300 milliseconds. Finally, the fourth command +stops DAMON. + +Note that DAMON can monitor not only virtual address spaces but multiple types +of address spaces including the physical address space. + + Recording Data Access Patterns ============================== The commands below record the memory access patterns of a program and save the monitoring results to a file. :: - $ git clone https://github.com/sjp38/masim - $ cd masim; make; ./masim ./configs/zigzag.cfg & + $ ./masim ./configs/zigzag.cfg & $ sudo damo record -o damon.data $(pidof masim) -The first two lines of the commands download an artificial memory access -generator program and run it in the background. The generator will repeatedly +The line of the commands run the artificial memory access +generator program again. The generator will repeatedly access two 100 MiB sized memory regions one by one. You can substitute this with your real workload. The last line asks ``damo`` to record the access pattern in the ``damon.data`` file. -- cgit v1.2.3 From 316df988caf8cbb18a96ad7a02c9e65815acb059 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 1 Jul 2024 12:27:01 -0700 Subject: Docs/mm/damon/design: add links from overall architecture to sections of details DAMON design document briefly explains the overall layers architecture first, and then provides detailed explanations of each layer with dedicated sections. Letting readers go directly to the detailed sections for specific layers could help easy browsing of the not-very-short document. Add links from the overall summary to the sections of details. 
Link: https://lkml.kernel.org/r/20240701192706.51415-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'Documentation') diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index fe08a3796e60..991839200f80 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -22,13 +22,15 @@ Overall Architecture DAMON subsystem is configured with three layers including -- Operations Set: Implements fundamental operations for DAMON that depends on - the given monitoring target address-space and available set of - software/hardware primitives, -- Core: Implements core logics including monitoring overhead/accuracy control - and access-aware system operations on top of the operations set layer, and -- Modules: Implements kernel modules for various purposes that provides - interfaces for the user space, on top of the core layer. +- :ref:`Operations Set `: Implements fundamental + operations for DAMON that depends on the given monitoring target + address-space and available set of software/hardware primitives, +- :ref:`Core `: Implements core logics including monitoring + overhead/accuracy control and access-aware system operations on top of the + operations set layer, and +- :ref:`Modules `: Implements kernel modules for various + purposes that provides interfaces for the user space, on top of the core + layer. .. _damon_design_configurable_operations_set: @@ -140,6 +142,8 @@ conflict with the reclaim logic using ``PG_idle`` and ``PG_young`` page flags, as Idle page tracking does. +.. _damon_core_logic: + Core Logics =========== @@ -512,6 +516,8 @@ interface, namely ``include/linux/damon.h``. Please refer to the API :doc:`document ` for details of the interface. +.. _damon_modules: + Modules ======= -- cgit v1.2.3 From 92117a6e05f25cd87a01b20c841946d4b368e528 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 1 Jul 2024 12:27:02 -0700 Subject: Docs/mm/damon/design: move 'Configurable Operations Set' section into 'Operations Set Layer' section 'Configurable Operations Set' section is for providing a description of the pluggability of the operations set layer. Just after that, 'Operations Set Layer' section, which is dedicated for the entire things of the layer, follows. The layout is odd, and some descriptions are duplicated. Move 'Configurable Operations Set' section into 'Operations Set Layer' and re-write some of the detailed descriptions. Link: https://lkml.kernel.org/r/20240701192706.51415-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 47 ++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 25 deletions(-) (limited to 'Documentation') diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 991839200f80..f7029bc840ce 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -33,30 +33,6 @@ DAMON subsystem is configured with three layers including layer. -.. _damon_design_configurable_operations_set: - -Configurable Operations Set ---------------------------- - -For data access monitoring and additional low level work, DAMON needs a set of -implementations for specific operations that are dependent on and optimized for -the given target address space. 
On the other hand, the accuracy and overhead -tradeoff mechanism, which is the core logic of DAMON, is in the pure logic -space. DAMON separates the two parts in different layers, namely DAMON -Operations Set and DAMON Core Logics Layers, respectively. It further defines -the interface between the layers to allow various operations sets to be -configured with the core logic. - -Due to this design, users can extend DAMON for any address space by configuring -the core logic to use the appropriate operations set. If any appropriate set -is unavailable, users can implement one on their own. - -For example, physical memory, virtual memory, swap space, those for specific -processes, NUMA nodes, files, and backing memory devices would be supportable. -Also, if some architectures or devices supporting special optimized access -check primitives, those will be easily configurable. - - Programmable Modules -------------------- @@ -72,11 +48,32 @@ used by the user space end users. Operations Set Layer ==================== -The monitoring operations are defined in two parts: +.. _damon_design_configurable_operations_set: + +For data access monitoring and additional low level work, DAMON needs a set of +implementations for specific operations that are dependent on and optimized for +the given target address space. For example, below two operations for access +monitoring are address-space dependent. 1. Identification of the monitoring target address range for the address space. 2. Access check of specific address range in the target space. +DAMON consolidates these implementations in a layer called DAMON Operations +Set, and defines the interface between it and the upper layer. The upper layer +is dedicated for DAMON's core logics including the mechanism for control of the +monitoring accuracy and the overhead. + +Hence, DAMON can easily be extended for any address space and/or available +hardware features by configuring the core logic to use the appropriate +operations set. If there is no available operations set for a given purpose, a +new operations set can be implemented following the interface between the +layers. + +For example, physical memory, virtual memory, swap space, those for specific +processes, NUMA nodes, files, and backing memory devices would be supportable. +Also, if some architectures or devices support special optimized access check +features, those will be easily configurable. + DAMON currently provides below three operation sets. Below two subsections describe how those work. -- cgit v1.2.3 From ce15e8fe054cc5977ccb192d508e50ca4583cd1b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 1 Jul 2024 12:27:03 -0700 Subject: Docs/mm/damon/design: remove 'Programmable Modules' section in favor of 'Modules' section 'Programmable Modules' section provides high level descriptions of the DAMON API-based kernel modules layer. But 'Modules' section, which is at the end of the document, provides every detail about the layer including that of 'Programmable Modules' section. Since the brief summary of the layers at the beginning of the document has a link to the 'Modules' section, browsing to the section is not that difficult. Remove 'Programmable Modules' section in favor of 'Modules' section and reduce the duplication.
Link: https://lkml.kernel.org/r/20240701192706.51415-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'Documentation') diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index f7029bc840ce..39e686c6862d 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -33,16 +33,6 @@ DAMON subsystem is configured with three layers including layer. -Programmable Modules --------------------- - -Core layer of DAMON is implemented as a framework, and exposes its application -programming interface to all kernel space components such as subsystems and -modules. For common use cases of DAMON, DAMON subsystem provides kernel -modules that built on top of the core layer using the API, which can be easily -used by the user space end users. - - .. _damon_operations_set: Operations Set Layer -- cgit v1.2.3 From d31f5626a0e11eea6f12c72970d550d4837bbdfe Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 1 Jul 2024 12:27:04 -0700 Subject: Docs/mm/damon/design: add links to sections of DAMON sysfs interface usage doc Readers of the design document would wonder how they can configure and use specific DAMON features. Add links to sections of DAMON sysfs interface usage document that provides the answers for easier browsing. Link: https://lkml.kernel.org/r/20240701192706.51415-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 48 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'Documentation') diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 39e686c6862d..89f5330e003f 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -16,6 +16,9 @@ called DAMON ``context``. DAMON executes each context with a kernel thread called ``kdamond``. Multiple kdamonds could run in parallel, for different types of monitoring. +To know how user-space can do the configurations and start/stop DAMON, refer to +:ref:`DAMON sysfs interface ` documentation. + Overall Architecture ==================== @@ -71,6 +74,10 @@ describe how those work. - fvaddr: Monitor fixed virtual address ranges - paddr: Monitor the physical address space of the system +To know how user-space can do the configuration via :ref:`DAMON sysfs interface +`, refer to :ref:`operations ` file part of the +documentation. + .. _damon_design_vaddr_target_regions_construction: @@ -143,6 +150,10 @@ monitoring attributes, ``sampling interval``, ``aggregation interval``, ``update interval``, ``minimum number of regions``, and ``maximum number of regions``. +To know how user-space can set the attributes via :ref:`DAMON sysfs interface +`, refer to :ref:`monitoring_attrs ` +part of the documentation. + Access Frequency Monitoring ~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -246,6 +257,11 @@ and applies it to monitoring operations-related data structures such as the abstracted monitoring target memory area only for each of a user-specified time interval (``update interval``). +User-space can get the monitoring results via DAMON sysfs interface and/or +tracepoints. For more details, please refer to the documentations for +:ref:`DAMOS tried regions ` and :ref:`tracepoint`, +respectively. + .. 
_damon_design_damos: @@ -286,6 +302,10 @@ the access pattern of interest, and applies the user-desired operation actions to the regions, for every user-specified time interval called ``apply_interval``. +To know how user-space can set ``apply_interval`` via :ref:`DAMON sysfs +interface `, refer to :ref:`apply_interval_us ` +part of the documentation. + .. _damon_design_damos_action: @@ -334,6 +354,10 @@ Applying the actions except ``stat`` to a region is considered as changing the region's characteristics. Hence, DAMOS resets the age of regions when any such actions are applied to those. +To know how user-space can set the action via :ref:`DAMON sysfs interface +`, refer to :ref:`action ` part of the +documentation. + .. _damon_design_damos_access_pattern: @@ -347,6 +371,10 @@ interest by setting minimum and maximum values of the three properties. If a region's three properties are in the ranges, DAMOS classifies it as one of the regions that the scheme is having an interest in. +To know how user-space can set the access pattern via :ref:`DAMON sysfs +interface `, refer to :ref:`access_pattern +` part of the documentation. + .. _damon_design_damos_quotas: @@ -366,6 +394,10 @@ feature called quotas. It lets users specify an upper limit of time that DAMOS can use for applying the action, and/or a maximum bytes of memory regions that the action can be applied within a user-specified time duration. +To know how user-space can set the basic quotas via :ref:`DAMON sysfs interface +`, refer to :ref:`quotas ` part of the +documentation. + .. _damon_design_damos_quotas_prioritization: @@ -393,6 +425,10 @@ information to the underlying mechanism. Nevertheless, how and even whether the weight will be respected are up to the underlying prioritization mechanism implementation. +To know how user-space can set the prioritization weights via :ref:`DAMON sysfs +interface `, refer to :ref:`weights ` part of +the documentation. + .. _damon_design_damos_quotas_auto_tuning: @@ -422,6 +458,10 @@ Currently, two ``target_metric`` are provided. DAMOS does the measurement on its own, so only ``target_value`` need to be set by users at the initial time. In other words, DAMOS does self-feedback. +To know how user-space can set the tuning goal metric, the target value, and/or +the current value via :ref:`DAMON sysfs interface `, refer to +:ref:`quota goals ` part of the documentation. + .. _damon_design_damos_watermarks: @@ -444,6 +484,10 @@ is activated. If all schemes are deactivated by the watermarks, the monitoring is also deactivated. In this case, the DAMON worker thread only periodically checks the watermarks and therefore incurs nearly zero overhead. +To know how user-space can set the watermarks via :ref:`DAMON sysfs interface +`, refer to :ref:`watermarks ` part of the +documentation. + .. _damon_design_damos_filters: @@ -490,6 +534,10 @@ Below types of filters are currently supported. - Applied to pages that belonging to a given DAMON monitoring target. - Handled by the core logic. +To know how user-space can set the watermarks via :ref:`DAMON sysfs interface +`, refer to :ref:`filters ` part of the +documentation. + Application Programming Interface --------------------------------- -- cgit v1.2.3 From 5fb9f0448fd9888e276fa5872fc37f27ae3f8acc Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 1 Jul 2024 12:27:05 -0700 Subject: Docs/mm/damon/index: add links to design DAMON subsystem documents index page provides a short intro of DAMON core concepts. 
Add links to sections of the design document to let users easily browse to the details. Link: https://lkml.kernel.org/r/20240701192706.51415-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 1 + Documentation/mm/damon/index.rst | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'Documentation') diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 89f5330e003f..8730c246ceaa 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -141,6 +141,7 @@ as Idle page tracking does. Core Logics =========== +.. _damon_design_monitoring: Monitoring ---------- diff --git a/Documentation/mm/damon/index.rst b/Documentation/mm/damon/index.rst index 5e0a50583500..3d3b345d8889 100644 --- a/Documentation/mm/damon/index.rst +++ b/Documentation/mm/damon/index.rst @@ -6,7 +6,7 @@ DAMON: Data Access MONitor DAMON is a Linux kernel subsystem that provides a framework for data access monitoring and the monitoring results based system operations. The core -monitoring mechanisms of DAMON (refer to :doc:`design` for the detail) make it +monitoring :ref:`mechanisms ` of DAMON make it - *accurate* (the monitoring output is useful enough for DRAM level memory management; It might not appropriate for CPU Cache levels, though), @@ -21,10 +21,11 @@ users who have special information about their workloads can write personalized applications for better understanding and optimizations of their workloads and systems. -For easier development of such systems, DAMON provides a feature called DAMOS -(DAMon-based Operation Schemes) in addition to the monitoring. Using the -feature, DAMON users in both kernel and user spaces can do access-aware system -operations with no code but simple configurations. +For easier development of such systems, DAMON provides a feature called +:ref:`DAMOS ` (DAMon-based Operation Schemes) in addition +to the monitoring. Using the feature, DAMON users in both kernel and user +spaces can do access-aware system operations with no code but simple +configurations. .. toctree:: :maxdepth: 2 -- cgit v1.2.3 From f6a6de245fdb1dfb4307b0a80ce7fa35ba2c35a6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 1 Jul 2024 12:27:06 -0700 Subject: Docs/mm/damon/index: add links to admin-guide doc Readers of DAMON subsystem documents index would want to further learn how they can use DAMON from the user-space. Add the link to the admin guide. Link: https://lkml.kernel.org/r/20240701192706.51415-10-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/mm/damon/index.rst | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'Documentation') diff --git a/Documentation/mm/damon/index.rst b/Documentation/mm/damon/index.rst index 3d3b345d8889..dafd6d028924 100644 --- a/Documentation/mm/damon/index.rst +++ b/Documentation/mm/damon/index.rst @@ -16,16 +16,16 @@ monitoring :ref:`mechanisms ` of DAMON make it of the size of target workloads). Using this framework, therefore, the kernel can operate system in an -access-aware fashion. Because the features are also exposed to the user space, -users who have special information about their workloads can write personalized -applications for better understanding and optimizations of their workloads and -systems. +access-aware fashion. 
Because the features are also exposed to the :doc:`user +space `, users who have special information about +their workloads can write personalized applications for better understanding +and optimizations of their workloads and systems. For easier development of such systems, DAMON provides a feature called :ref:`DAMOS ` (DAMon-based Operation Schemes) in addition -to the monitoring. Using the feature, DAMON users in both kernel and user -spaces can do access-aware system operations with no code but simple -configurations. +to the monitoring. Using the feature, DAMON users in both kernel and :doc:`user +spaces ` can do access-aware system operations +with no code but simple configurations. .. toctree:: :maxdepth: 2 @@ -34,3 +34,6 @@ configurations. design api maintainer-profile + +To utilize and control DAMON from the user-space, please refer to the +administration :doc:`guide `. -- cgit v1.2.3 From c10cb9148e5175e65326f81930ecebd0147e2721 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 27 Jun 2024 10:08:56 -0700 Subject: docs/procfs: call out ioctl()-based PROCMAP_QUERY command existence Call out PROCMAP_QUERY ioctl() existence in the section describing /proc/PID/maps file in documentation. We refer user to UAPI header for low-level details of this programmatic interface. Link: https://lkml.kernel.org/r/20240627170900.1672542-5-andrii@kernel.org Signed-off-by: Andrii Nakryiko Acked-by: Liam R. Howlett Cc: Alexey Dobriyan Cc: Al Viro Cc: Christian Brauner Cc: Greg Kroah-Hartman Cc: Mike Rapoport (IBM) Cc: Suren Baghdasaryan Cc: Andi Kleen Cc: Arnd Bergmann Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.rst | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'Documentation') diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 82d142de3461..e834779d9611 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -443,6 +443,15 @@ is not associated with a file: or if empty, the mapping is anonymous. +Starting with 6.11 kernel, /proc/PID/maps provides an alternative +ioctl()-based API that gives ability to flexibly and efficiently query and +filter individual VMAs. This interface is binary and is meant for more +efficient and easy programmatic use. `struct procmap_query`, defined in +linux/fs.h UAPI header, serves as an input/output argument to the +`PROCMAP_QUERY` ioctl() command. See comments in linus/fs.h UAPI header for +details on query semantics, supported flags, data returned, and general API +usage information. + The /proc/PID/smaps is an extension based on maps, showing the memory consumption for each of the process's mappings. For each mapping (aka Virtual Memory Area, or VMA) there is a series of lines such as the following:: -- cgit v1.2.3 From 9b89e018990de47c72ef8b2ca29204f88fda8f05 Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Fri, 28 Jun 2024 21:07:50 +0800 Subject: mm: add docs for per-order mTHP split counters This commit introduces documentation for mTHP split counters in transhuge.rst. 
[ioworker0@gmail.com: improve the doc as suggested by Ryan] Link: https://lkml.kernel.org/r/20240704012905.42971-3-ioworker0@gmail.com [ioworker0@gmail.com: tweak Documentation/admin-guide/mm/transhuge.rst] Link: https://lkml.kernel.org/r/20240707013659.1151-1-ioworker0@gmail.com Link: https://lkml.kernel.org/r/20240628130750.73097-3-ioworker0@gmail.com Signed-off-by: Mingzhe Yang Signed-off-by: Lance Yang Reviewed-by: Barry Song Reviewed-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Bang Li Cc: Baolin Wang Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'Documentation') diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 1f72b00af5d3..a1bc9b24e29a 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -369,10 +369,6 @@ also applies to the regions registered in khugepaged. Monitoring usage ================ -.. note:: - Currently the below counters only record events relating to - PMD-sized THP. Events relating to other THP sizes are not included. - The number of PMD-sized anonymous transparent huge pages currently used by the system is available by reading the AnonHugePages field in ``/proc/meminfo``. To identify what applications are using PMD-sized anonymous transparent huge @@ -514,6 +510,21 @@ file_fallback_charge falls back to using small pages even though the allocation was successful. +split + is incremented every time a huge page is successfully split into + smaller orders. This can happen for a variety of reasons but a + common reason is that a huge page is old and is being reclaimed. + +split_failed + is incremented if kernel fails to split huge + page. This can happen if the page was pinned by somebody. + +split_deferred + is incremented when a huge page is put onto split queue. + This happens when a huge page is partially unmapped and splitting + it would free up some memory. Pages on split queue are going to + be split under memory pressure, if splitting is possible. + As the system ages, allocating huge pages may be expensive as the system uses memory compaction to copy data around memory to free a huge page for use. There are some counters in ``/proc/vmstat`` to help -- cgit v1.2.3 From 00f58104202c472e487f0866fbd38832523fd4f9 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 4 Jul 2024 10:10:50 +0100 Subject: mm: fix khugepaged activation policy Since the introduction of mTHP, the documentation has stated that khugepaged would be enabled when any mTHP size is enabled, and disabled when all mTHP sizes are disabled. There are 2 problems with this; 1. this is not what was implemented by the code and 2. this is not the desirable behavior. Desirable behavior is for khugepaged to be enabled when any PMD-sized THP is enabled, anon or file. (Note that file THP is still controlled by the top-level control so we must always consider that, as well as the PMD-size mTHP control for anon). khugepaged only supports collapsing to PMD-sized THP so there is no value in enabling it when PMD-sized THP is disabled. So let's change the code and documentation to reflect this policy. Further, per-size enabled control modification events were not previously forwarded to khugepaged to give it an opportunity to start or stop.
Consequently the following was resulting in khugepaged erroneously not being activated: echo never > /sys/kernel/mm/transparent_hugepage/enabled echo always > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled [ryan.roberts@arm.com: v3] Link: https://lkml.kernel.org/r/20240705102849.2479686-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20240705102849.2479686-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20240704091051.2411934-1-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Fixes: 3485b88390b0 ("mm: thp: introduce multi-size THP sysfs interface") Closes: https://lore.kernel.org/linux-mm/7a0bbe69-1e3d-4263-b206-da007791a5c4@redhat.com/ Acked-by: David Hildenbrand Cc: Baolin Wang Cc: Barry Song Cc: Jonathan Corbet Cc: Lance Yang Cc: Yang Shi Cc: Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 11 +++++----- include/linux/huge_mm.h | 12 ----------- mm/huge_memory.c | 7 +++++++ mm/khugepaged.c | 33 +++++++++++++++++++++++------- 4 files changed, 38 insertions(+), 25 deletions(-) (limited to 'Documentation') diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index a1bc9b24e29a..fe237825b95c 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -202,12 +202,11 @@ PMD-mappable transparent hugepage:: cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size -khugepaged will be automatically started when one or more hugepage -sizes are enabled (either by directly setting "always" or "madvise", -or by setting "inherit" while the top-level enabled is set to "always" -or "madvise"), and it'll be automatically shutdown when the last -hugepage size is disabled (either by directly setting "never", or by -setting "inherit" while the top-level enabled is set to "never"). +khugepaged will be automatically started when PMD-sized THP is enabled +(either of the per-size anon control or the top-level control are set +to "always" or "madvise"), and it'll be automatically shutdown when +PMD-sized THP is disabled (when both the per-size anon control and the +top-level control are "never") Khugepaged controls ------------------- diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index cee3c5da8f0e..acb6ac24a07e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -128,18 +128,6 @@ static inline bool hugepage_global_always(void) (1< 0) { + int err; + + err = start_stop_khugepaged(); + if (err) + ret = err; + } return ret; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 409f67a817f1..a5ec03ef8722 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -413,6 +413,26 @@ static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) test_bit(MMF_DISABLE_THP, &mm->flags); } +static bool hugepage_pmd_enabled(void) +{ + /* + * We cover both the anon and the file-backed case here; file-backed + * hugepages, when configured in, are determined by the global control. + * Anon pmd-sized hugepages are determined by the pmd-size control.
+ */ + if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && + hugepage_global_enabled()) + return true; + if (test_bit(PMD_ORDER, &huge_anon_orders_always)) + return true; + if (test_bit(PMD_ORDER, &huge_anon_orders_madvise)) + return true; + if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) && + hugepage_global_enabled()) + return true; + return false; +} + void __khugepaged_enter(struct mm_struct *mm) { struct khugepaged_mm_slot *mm_slot; @@ -449,7 +469,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags) { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && - hugepage_flags_enabled()) { + hugepage_pmd_enabled()) { if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS, PMD_ORDER)) __khugepaged_enter(vma->vm_mm); @@ -2462,8 +2482,7 @@ breakouterloop_mmap_lock: static int khugepaged_has_work(void) { - return !list_empty(&khugepaged_scan.mm_head) && - hugepage_flags_enabled(); + return !list_empty(&khugepaged_scan.mm_head) && hugepage_pmd_enabled(); } static int khugepaged_wait_event(void) @@ -2536,7 +2555,7 @@ static void khugepaged_wait_work(void) return; } - if (hugepage_flags_enabled()) + if (hugepage_pmd_enabled()) wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); } @@ -2567,7 +2586,7 @@ static void set_recommended_min_free_kbytes(void) int nr_zones = 0; unsigned long recommended_min; - if (!hugepage_flags_enabled()) { + if (!hugepage_pmd_enabled()) { calculate_min_free_kbytes(); goto update_wmarks; } @@ -2617,7 +2636,7 @@ int start_stop_khugepaged(void) int err = 0; mutex_lock(&khugepaged_mutex); - if (hugepage_flags_enabled()) { + if (hugepage_pmd_enabled()) { if (!khugepaged_thread) khugepaged_thread = kthread_run(khugepaged, NULL, "khugepaged"); @@ -2643,7 +2662,7 @@ fail: void khugepaged_min_free_kbytes_update(void) { mutex_lock(&khugepaged_mutex); - if (hugepage_flags_enabled() && khugepaged_thread) + if (hugepage_pmd_enabled() && khugepaged_thread) set_recommended_min_free_kbytes(); mutex_unlock(&khugepaged_mutex); } -- cgit v1.2.3 From 4c8763e84aae4d04d94b35aca9f7db6a8930ad77 Mon Sep 17 00:00:00 2001 From: Ran Xiaokai Date: Fri, 5 Jul 2024 10:43:43 +0000 Subject: kpageflags: detect isolated KPF_THP folios When folio is isolated, the PG_lru bit is cleared. So the PG_lru check in stable_page_flags() will miss this kind of isolated folios. Use folio_test_large_rmappable() instead to also include isolated folios. Since pagecache supports large folios and the introduction of mTHP, the semantics of KPF_THP have been expanded, now it indicates not only PMD-sized THP. Update related documentation to clearly state that KPF_THP indicates multiple order THPs. 
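As an illustration of what a monitoring tool sees after this change (an editor-added, hypothetical sketch; the KPF_THP bit number 22 matches the pagemap documentation updated below, while the use of /proc/kpageflags and the helper itself are assumptions, not part of this patch), a root-privileged reader can test KPF_THP for a given PFN:

    #include <fcntl.h>
    #include <stdint.h>
    #include <unistd.h>

    #define KPF_THP 22	/* "Contiguous pages which construct THP of any size" */

    /* Returns 1 if the PFN belongs to a THP (now including isolated folios), 0 if not, -1 on error. */
    static int pfn_is_thp(uint64_t pfn)
    {
        uint64_t flags;
        int fd = open("/proc/kpageflags", O_RDONLY);

        if (fd < 0)
            return -1;
        if (pread(fd, &flags, sizeof(flags), pfn * sizeof(flags)) != sizeof(flags)) {
            close(fd);
            return -1;
        }
        close(fd);
        return (flags >> KPF_THP) & 1;
    }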
[ran.xiaokai@zte.com.cn: directly use is_zero_folio(), per David] Link: https://lkml.kernel.org/r/20240708062601.165215-1-ranxiaokai627@163.com Link: https://lkml.kernel.org/r/20240705104343.112680-1-ranxiaokai627@163.com Signed-off-by: Ran Xiaokai Acked-by: David Hildenbrand Cc: Andrei Vagin Cc: Jonathan Corbet Cc: Muhammad Usama Anjum Cc: Ryan Roberts Cc: Svetly Todorov Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/pagemap.rst | 4 ++-- fs/proc/page.c | 21 +++++++++------------ 2 files changed, 11 insertions(+), 14 deletions(-) (limited to 'Documentation') diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst index f2817a801596..caba0f52dd36 100644 --- a/Documentation/admin-guide/mm/pagemap.rst +++ b/Documentation/admin-guide/mm/pagemap.rst @@ -118,7 +118,7 @@ Short descriptions to the page flags 21 - KSM Identical memory pages dynamically shared between one or more processes. 22 - THP - Contiguous pages which construct transparent hugepages. + Contiguous pages which construct THP of any size and mapped by any granularity. 23 - OFFLINE The page is logically offline. 24 - ZERO_PAGE @@ -231,7 +231,7 @@ Following flags about pages are currently supported: - ``PAGE_IS_PRESENT`` - Page is present in the memory - ``PAGE_IS_SWAPPED`` - Page is in swapped - ``PAGE_IS_PFNZERO`` - Page has zero PFN -- ``PAGE_IS_HUGE`` - Page is THP or Hugetlb backed +- ``PAGE_IS_HUGE`` - Page is PMD-mapped THP or Hugetlb backed - ``PAGE_IS_SOFT_DIRTY`` - Page is soft-dirty The ``struct pm_scan_arg`` is used as the argument of the IOCTL. diff --git a/fs/proc/page.c b/fs/proc/page.c index e8440db8cfbf..b7a5c84b5819 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -147,19 +147,16 @@ u64 stable_page_flags(const struct page *page) u |= 1 << KPF_COMPOUND_TAIL; if (folio_test_hugetlb(folio)) u |= 1 << KPF_HUGE; - /* - * We need to check PageLRU/PageAnon - * to make sure a given page is a thp, not a non-huge compound page. - */ - else if (folio_test_large(folio)) { - if ((k & (1 << PG_lru)) || is_anon) - u |= 1 << KPF_THP; - else if (is_huge_zero_folio(folio)) { - u |= 1 << KPF_ZERO_PAGE; - u |= 1 << KPF_THP; - } - } else if (is_zero_pfn(page_to_pfn(page))) + else if (folio_test_large(folio) && + folio_test_large_rmappable(folio)) { + /* Note: we indicate any THPs here, not just PMD-sized ones */ + u |= 1 << KPF_THP; + } else if (is_huge_zero_folio(folio)) { u |= 1 << KPF_ZERO_PAGE; + u |= 1 << KPF_THP; + } else if (is_zero_folio(folio)) { + u |= 1 << KPF_ZERO_PAGE; + } /* * Caveats on high order pages: PG_buddy and PG_slab will only be set -- cgit v1.2.3 From 63d9866ab01ffd0d0835d5564107283a4afc0a38 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 10 Jul 2024 10:55:01 +0100 Subject: mm: shmem: rename mTHP shmem counters The legacy PMD-sized THP counters at /proc/vmstat include thp_file_alloc, thp_file_fallback and thp_file_fallback_charge, which rather confusingly refer to shmem THP and do not include any other types of file pages. This is inconsistent since in most other places in the kernel, THP counters are explicitly separated for anon, shmem and file flavours. However, we are stuck with it since it constitutes a user ABI. Recently, commit 66f44583f9b6 ("mm: shmem: add mTHP counters for anonymous shmem") added equivalent mTHP stats for shmem, keeping the same "file_" prefix in the names. But in future, we may want to add extra stats to cover actual file pages, at which point, it would all become very confusing. 
So let's take the opportunity to rename these new counters "shmem_" before the change makes it upstream and the ABI becomes immutable. While we are at it, let's improve the documentation for the legacy counters to make it clear that they count shmem pages only. Link: https://lkml.kernel.org/r/20240710095503.3193901-1-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: Baolin Wang Reviewed-by: Lance Yang Reviewed-by: Zi Yan Reviewed-by: Barry Song Acked-by: David Hildenbrand Cc: Daniel Gomez Cc: Hugh Dickins Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 29 ++++++++++++++++------------- include/linux/huge_mm.h | 6 +++--- mm/huge_memory.c | 12 ++++++------ mm/shmem.c | 8 ++++---- 4 files changed, 29 insertions(+), 26 deletions(-) (limited to 'Documentation') diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index fe237825b95c..058485daf186 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -412,20 +412,23 @@ thp_collapse_alloc_failed the allocation. thp_file_alloc - is incremented every time a file huge page is successfully - allocated. + is incremented every time a shmem huge page is successfully + allocated (Note that despite being named after "file", the counter + measures only shmem). thp_file_fallback - is incremented if a file huge page is attempted to be allocated - but fails and instead falls back to using small pages. + is incremented if a shmem huge page is attempted to be allocated + but fails and instead falls back to using small pages. (Note that + despite being named after "file", the counter measures only shmem). thp_file_fallback_charge - is incremented if a file huge page cannot be charged and instead + is incremented if a shmem huge page cannot be charged and instead falls back to using small pages even though the allocation was - successful. + successful. (Note that despite being named after "file", the + counter measures only shmem). thp_file_mapped - is incremented every time a file huge page is mapped into + is incremented every time a file or shmem huge page is mapped into user address space. thp_split_page @@ -496,16 +499,16 @@ swpout_fallback Usually because failed to allocate some continuous swap space for the huge page. -file_alloc - is incremented every time a file huge page is successfully +shmem_alloc + is incremented every time a shmem huge page is successfully allocated. -file_fallback - is incremented if a file huge page is attempted to be allocated +shmem_fallback + is incremented if a shmem huge page is attempted to be allocated but fails and instead falls back to using small pages. -file_fallback_charge - is incremented if a file huge page cannot be charged and instead +shmem_fallback_charge + is incremented if a shmem huge page cannot be charged and instead falls back to using small pages even though the allocation was successful. 
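The renamed per-size counters sit alongside the legacy /proc/vmstat names described above. The sketch below is not part of this patch; it reads both sets, assuming a 2 MiB PMD size (so the per-size directory is hugepages-2048kB), the per-size stats directory documented in transhuge.rst, and a kernel that already carries this rename.

#include <stdio.h>
#include <string.h>

static void print_file(const char *path)
{
	char buf[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		printf("%s: unavailable\n", path);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%s: %s", path, buf);
	fclose(f);
}

int main(void)
{
	static const char *names[] = { "shmem_alloc", "shmem_fallback",
				       "shmem_fallback_charge" };
	char path[256];
	char line[256];
	FILE *vmstat;

	/* per-size mTHP counters, renamed from file_* by this patch */
	for (size_t i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
		snprintf(path, sizeof(path),
			 "/sys/kernel/mm/transparent_hugepage/hugepages-2048kB/stats/%s",
			 names[i]);
		print_file(path);
	}

	/* legacy PMD-only counters keep their thp_file_* names in /proc/vmstat */
	vmstat = fopen("/proc/vmstat", "r");
	if (!vmstat)
		return 1;
	while (fgets(line, sizeof(line), vmstat))
		if (!strncmp(line, "thp_file_", 9))
			fputs(line, stdout);
	fclose(vmstat);
	return 0;
}

On kernels without the rename, the per-size files still carry the file_* names, which is exactly the ambiguity this patch removes before the ABI freezes.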
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index acb6ac24a07e..cff002be83eb 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -269,9 +269,9 @@ enum mthp_stat_item { MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, MTHP_STAT_SWPOUT, MTHP_STAT_SWPOUT_FALLBACK, - MTHP_STAT_FILE_ALLOC, - MTHP_STAT_FILE_FALLBACK, - MTHP_STAT_FILE_FALLBACK_CHARGE, + MTHP_STAT_SHMEM_ALLOC, + MTHP_STAT_SHMEM_FALLBACK, + MTHP_STAT_SHMEM_FALLBACK_CHARGE, MTHP_STAT_SPLIT, MTHP_STAT_SPLIT_FAILED, MTHP_STAT_SPLIT_DEFERRED, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9ec64aa2be94..f9696c94e211 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -568,9 +568,9 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT); DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); -DEFINE_MTHP_STAT_ATTR(file_alloc, MTHP_STAT_FILE_ALLOC); -DEFINE_MTHP_STAT_ATTR(file_fallback, MTHP_STAT_FILE_FALLBACK); -DEFINE_MTHP_STAT_ATTR(file_fallback_charge, MTHP_STAT_FILE_FALLBACK_CHARGE); +DEFINE_MTHP_STAT_ATTR(shmem_alloc, MTHP_STAT_SHMEM_ALLOC); +DEFINE_MTHP_STAT_ATTR(shmem_fallback, MTHP_STAT_SHMEM_FALLBACK); +DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE); DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT); DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED); DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED); @@ -581,9 +581,9 @@ static struct attribute *stats_attrs[] = { &anon_fault_fallback_charge_attr.attr, &swpout_attr.attr, &swpout_fallback_attr.attr, - &file_alloc_attr.attr, - &file_fallback_attr.attr, - &file_fallback_charge_attr.attr, + &shmem_alloc_attr.attr, + &shmem_fallback_attr.attr, + &shmem_fallback_charge_attr.attr, &split_attr.attr, &split_failed_attr.attr, &split_deferred_attr.attr, diff --git a/mm/shmem.c b/mm/shmem.c index 921d59c3d669..f24dfbd387ba 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1777,7 +1777,7 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, if (pages == HPAGE_PMD_NR) count_vm_event(THP_FILE_FALLBACK); #ifdef CONFIG_TRANSPARENT_HUGEPAGE - count_mthp_stat(order, MTHP_STAT_FILE_FALLBACK); + count_mthp_stat(order, MTHP_STAT_SHMEM_FALLBACK); #endif order = next_order(&suitable_orders, order); } @@ -1804,8 +1804,8 @@ allocated: count_vm_event(THP_FILE_FALLBACK_CHARGE); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE - count_mthp_stat(folio_order(folio), MTHP_STAT_FILE_FALLBACK); - count_mthp_stat(folio_order(folio), MTHP_STAT_FILE_FALLBACK_CHARGE); + count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK); + count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK_CHARGE); #endif } goto unlock; @@ -2181,7 +2181,7 @@ repeat: if (folio_test_pmd_mappable(folio)) count_vm_event(THP_FILE_ALLOC); #ifdef CONFIG_TRANSPARENT_HUGEPAGE - count_mthp_stat(folio_order(folio), MTHP_STAT_FILE_ALLOC); + count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_ALLOC); #endif goto alloced; } -- cgit v1.2.3
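For readers following the enum changes above, here is a simplified model of the per-(order, item) accounting that count_mthp_stat() performs for entries such as MTHP_STAT_SHMEM_ALLOC. It is an illustration only, not the kernel implementation (which keeps per-CPU counters and sums them when a stats file is read); the identifiers are model names invented for this sketch.

#include <stdio.h>

enum stat_item_model {
	STAT_SHMEM_ALLOC,
	STAT_SHMEM_FALLBACK,
	STAT_SHMEM_FALLBACK_CHARGE,
	__STAT_COUNT,
};

#define MAX_ORDER_MODEL 10	/* orders 0..9 for a 4K base page; PMD order is 9 */

static unsigned long stats[MAX_ORDER_MODEL][__STAT_COUNT];

static void count_stat(int order, enum stat_item_model item)
{
	stats[order][item]++;	/* the kernel uses per-CPU adds instead */
}

int main(void)
{
	count_stat(9, STAT_SHMEM_ALLOC);	/* a PMD-sized (order-9) shmem folio */
	count_stat(4, STAT_SHMEM_FALLBACK);	/* a 64K (order-4) attempt fell back */

	for (int order = 0; order < MAX_ORDER_MODEL; order++)
		if (stats[order][STAT_SHMEM_ALLOC] || stats[order][STAT_SHMEM_FALLBACK])
			printf("order %d: alloc=%lu fallback=%lu\n", order,
			       stats[order][STAT_SHMEM_ALLOC],
			       stats[order][STAT_SHMEM_FALLBACK]);
	return 0;
}

Keeping one counter per order is what lets each hugepages-<size>kB/stats directory report its own shmem_alloc, shmem_fallback and shmem_fallback_charge values independently of the PMD-sized legacy counters.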