diff options
Diffstat (limited to 'mm')
45 files changed, 1230 insertions, 1048 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 761f5021ba51..034d87953600 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -762,6 +762,9 @@ config ARCH_HAS_CURRENT_STACK_POINTER register alias named "current_stack_pointer", this config can be selected. +config ARCH_HAS_FILTER_PGPROT + bool + config ARCH_HAS_PTE_DEVMAP bool diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 907fefde2572..4b8eab4b3f45 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -203,7 +203,7 @@ EXPORT_SYMBOL_GPL(balloon_page_dequeue); #ifdef CONFIG_BALLOON_COMPACTION -bool balloon_page_isolate(struct page *page, isolate_mode_t mode) +static bool balloon_page_isolate(struct page *page, isolate_mode_t mode) { struct balloon_dev_info *b_dev_info = balloon_page_device(page); @@ -217,7 +217,7 @@ bool balloon_page_isolate(struct page *page, isolate_mode_t mode) return true; } -void balloon_page_putback(struct page *page) +static void balloon_page_putback(struct page *page) { struct balloon_dev_info *b_dev_info = balloon_page_device(page); unsigned long flags; @@ -230,7 +230,7 @@ void balloon_page_putback(struct page *page) /* move_to_new_page() counterpart for a ballooned page */ -int balloon_page_migrate(struct address_space *mapping, +static int balloon_page_migrate(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode) { diff --git a/mm/damon/core.c b/mm/damon/core.c index c1e0fed4e877..5ce8d7c867f0 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1019,12 +1019,15 @@ static int kdamond_wait_activation(struct damon_ctx *ctx) struct damos *s; unsigned long wait_time; unsigned long min_wait_time = 0; + bool init_wait_time = false; while (!kdamond_need_stop(ctx)) { damon_for_each_scheme(s, ctx) { wait_time = damos_wmark_wait_us(s); - if (!min_wait_time || wait_time < min_wait_time) + if (!init_wait_time || wait_time < min_wait_time) { + init_wait_time = true; min_wait_time = wait_time; + } } if (!min_wait_time) return 0; diff 
--git a/mm/debug.c b/mm/debug.c index eeb7ea3ca292..bef329bf28f0 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -261,5 +261,4 @@ void page_init_poison(struct page *page, size_t size) if (page_init_poisoning) memset(page, PAGE_POISON_PATTERN, size); } -EXPORT_SYMBOL_GPL(page_init_poison); #endif /* CONFIG_DEBUG_VM */ diff --git a/mm/filemap.c b/mm/filemap.c index d2e6a79fe69d..3a5ffb5587cd 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -152,25 +152,25 @@ static void filemap_unaccount_folio(struct address_space *mapping, VM_BUG_ON_FOLIO(folio_mapped(folio), folio); if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) { - int mapcount; - pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n", current->comm, folio_pfn(folio)); dump_page(&folio->page, "still mapped when deleted"); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); - mapcount = page_mapcount(&folio->page); - if (mapping_exiting(mapping) && - folio_ref_count(folio) >= mapcount + 2) { - /* - * All vmas have already been torn down, so it's - * a good bet that actually the folio is unmapped, - * and we'd prefer not to leak it: if we're wrong, - * some other bad page check should catch it later. - */ - page_mapcount_reset(&folio->page); - folio_ref_sub(folio, mapcount); + if (mapping_exiting(mapping) && !folio_test_large(folio)) { + int mapcount = page_mapcount(&folio->page); + + if (folio_ref_count(folio) >= mapcount + 2) { + /* + * All vmas have already been torn down, so it's + * a good bet that actually the page is unmapped + * and we'd rather not leak it: if we're wrong, + * another bad page check should catch it later. + */ + page_mapcount_reset(&folio->page); + folio_ref_sub(folio, mapcount); + } } } @@ -193,16 +193,20 @@ static void filemap_unaccount_folio(struct address_space *mapping, /* * At this point folio must be either written or cleaned by * truncate. Dirty folio here signals a bug and loss of - * unwritten data. + * unwritten data - on ordinary filesystems. 
+ * + * But it's harmless on in-memory filesystems like tmpfs; and can + * occur when a driver which did get_user_pages() sets page dirty + * before putting it, while the inode is being finally evicted. * - * This fixes dirty accounting after removing the folio entirely + * Below fixes dirty accounting after removing the folio entirely * but leaves the dirty flag set: it has no effect for truncated * folio and anyway will be cleared before returning folio to * buddy allocator. */ - if (WARN_ON_ONCE(folio_test_dirty(folio))) - folio_account_cleaned(folio, mapping, - inode_to_wb(mapping->host)); + if (WARN_ON_ONCE(folio_test_dirty(folio) && + mapping_can_writeback(mapping))) + folio_account_cleaned(folio, inode_to_wb(mapping->host)); } /* @@ -1185,24 +1189,17 @@ static void folio_wake_bit(struct folio *folio, int bit_nr) } /* - * It is possible for other pages to have collided on the waitqueue - * hash, so in that case check for a page match. That prevents a long- - * term waiter + * It's possible to miss clearing waiters here, when we woke our page + * waiters, but the hashed waitqueue has waiters for other pages on it. + * That's okay, it's a rare case. The next waker will clear it. * - * It is still possible to miss a case here, when we woke page waiters - * and removed them from the waitqueue, but there are still other - * page waiters. + * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE, + * other), the flag may be cleared in the course of freeing the page; + * but that is not required for correctness. */ - if (!waitqueue_active(q) || !key.page_match) { + if (!waitqueue_active(q) || !key.page_match) folio_clear_waiters(folio); - /* - * It's possible to miss clearing Waiters here, when we woke - * our page waiters, but the hashed waitqueue has waiters for - * other pages on it. - * - * That's okay, it's a rare case. The next waker will clear it. 
- */ - } + spin_unlock_irqrestore(&q->lock, flags); } @@ -2541,7 +2538,7 @@ static int filemap_create_folio(struct file *file, * the page cache as the locked folio would then be enough to * synchronize with hole punching. But there are code paths * such as filemap_update_page() filling in partially uptodate - * pages or ->readpages() that need to hold invalidate_lock + * pages or ->readahead() that need to hold invalidate_lock * while mapping blocks for IO so let's hold the lock here as * well to keep locking rules simple. */ @@ -3755,9 +3752,10 @@ out: } EXPORT_SYMBOL(generic_file_direct_write); -ssize_t generic_perform_write(struct file *file, - struct iov_iter *i, loff_t pos) +ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i) { + struct file *file = iocb->ki_filp; + loff_t pos = iocb->ki_pos; struct address_space *mapping = file->f_mapping; const struct address_space_operations *a_ops = mapping->a_ops; long status = 0; @@ -3782,7 +3780,7 @@ again: * same page as we're writing to, without it being marked * up-to-date. 
*/ - if (unlikely(fault_in_iov_iter_readable(i, bytes))) { + if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) { status = -EFAULT; break; } @@ -3887,7 +3885,8 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (written < 0 || !iov_iter_count(from) || IS_DAX(inode)) goto out; - status = generic_perform_write(file, from, pos = iocb->ki_pos); + pos = iocb->ki_pos; + status = generic_perform_write(iocb, from); /* * If generic_perform_write() returned a synchronous error * then we want to return the number of bytes which were @@ -3919,7 +3918,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) */ } } else { - written = generic_perform_write(file, from, iocb->ki_pos); + written = generic_perform_write(iocb, from); if (likely(written > 0)) iocb->ki_pos += written; } @@ -1404,6 +1404,7 @@ long populate_vma_page_range(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; unsigned long nr_pages = (end - start) / PAGE_SIZE; int gup_flags; + long ret; VM_BUG_ON(!PAGE_ALIGNED(start)); VM_BUG_ON(!PAGE_ALIGNED(end)); @@ -1438,8 +1439,10 @@ long populate_vma_page_range(struct vm_area_struct *vma, * We made sure addr is within a VMA, so the following will * not result in a stack expansion that recurses back here. 
*/ - return __get_user_pages(mm, start, nr_pages, gup_flags, + ret = __get_user_pages(mm, start, nr_pages, gup_flags, NULL, NULL, locked); + lru_add_drain(); + return ret; } /* @@ -1471,6 +1474,7 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, struct mm_struct *mm = vma->vm_mm; unsigned long nr_pages = (end - start) / PAGE_SIZE; int gup_flags; + long ret; VM_BUG_ON(!PAGE_ALIGNED(start)); VM_BUG_ON(!PAGE_ALIGNED(end)); @@ -1498,8 +1502,10 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, if (check_vma_flags(vma, gup_flags)) return -EINVAL; - return __get_user_pages(mm, start, nr_pages, gup_flags, + ret = __get_user_pages(mm, start, nr_pages, gup_flags, NULL, NULL, locked); + lru_add_drain(); + return ret; } /* diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 005fab2f3b73..2fe38212e07c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -40,6 +40,9 @@ #include <asm/pgalloc.h> #include "internal.h" +#define CREATE_TRACE_POINTS +#include <trace/events/thp.h> + /* * By default, transparent hugepage support is disabled in order to avoid * risking an increased memory footprint for applications that are not @@ -530,7 +533,7 @@ void prep_transhuge_page(struct page *page) set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); } -bool is_transparent_hugepage(struct page *page) +static inline bool is_transparent_hugepage(struct page *page) { if (!PageCompound(page)) return false; @@ -539,7 +542,6 @@ bool is_transparent_hugepage(struct page *page) return is_huge_zero_page(page) || page[1].compound_dtor == TRANSHUGE_PAGE_DTOR; } -EXPORT_SYMBOL_GPL(is_transparent_hugepage); static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, @@ -1301,7 +1303,6 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) page = pmd_page(orig_pmd); VM_BUG_ON_PAGE(!PageHead(page), page); - /* Lock page for reuse_swap_page() */ if (!trylock_page(page)) { get_page(page); 
spin_unlock(vmf->ptl); @@ -1317,10 +1318,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) } /* - * We can only reuse the page if nobody else maps the huge page or it's - * part. + * See do_wp_page(): we can only map the page writable if there are + * no additional references. Note that we always drain the LRU + * pagevecs immediately after adding a THP. */ - if (reuse_swap_page(page)) { + if (page_count(page) > 1 + PageSwapCache(page) * thp_nr_pages(page)) + goto unlock_fallback; + if (PageSwapCache(page)) + try_to_free_swap(page); + if (page_count(page) == 1) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); @@ -1331,6 +1337,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) return VM_FAULT_WRITE; } +unlock_fallback: unlock_page(page); spin_unlock(vmf->ptl); fallback: @@ -2126,8 +2133,6 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, { spinlock_t *ptl; struct mmu_notifier_range range; - bool do_unlock_folio = false; - pmd_t _pmd; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address & HPAGE_PMD_MASK, @@ -2146,42 +2151,12 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, goto out; } -repeat: - if (pmd_trans_huge(*pmd)) { - if (!folio) { - folio = page_folio(pmd_page(*pmd)); - /* - * An anonymous page must be locked, to ensure that a - * concurrent reuse_swap_page() sees stable mapcount; - * but reuse_swap_page() is not used on shmem or file, - * and page lock must not be taken when zap_pmd_range() - * calls __split_huge_pmd() while i_mmap_lock is held. 
- */ - if (folio_test_anon(folio)) { - if (unlikely(!folio_trylock(folio))) { - folio_get(folio); - _pmd = *pmd; - spin_unlock(ptl); - folio_lock(folio); - spin_lock(ptl); - if (unlikely(!pmd_same(*pmd, _pmd))) { - folio_unlock(folio); - folio_put(folio); - folio = NULL; - goto repeat; - } - folio_put(folio); - } - do_unlock_folio = true; - } - } - } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) - goto out; - __split_huge_pmd_locked(vma, pmd, range.start, freeze); + if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) || + is_pmd_migration_entry(*pmd)) + __split_huge_pmd_locked(vma, pmd, range.start, freeze); + out: spin_unlock(ptl); - if (do_unlock_folio) - folio_unlock(folio); /* * No need to double call mmu_notifier->invalidate_range() callback. * They are 3 cases to consider inside __split_huge_pmd_locked(): @@ -2476,54 +2451,6 @@ static void __split_huge_page(struct page *page, struct list_head *list, } } -/* - * This calculates accurately how many mappings a transparent hugepage - * has (unlike page_mapcount() which isn't fully accurate). This full - * accuracy is primarily needed to know if copy-on-write faults can - * reuse the page and change the mapping to read-write instead of - * copying them. At the same time this returns the total_mapcount too. - * - * The function returns the highest mapcount any one of the subpages - * has. If the return value is one, even if different processes are - * mapping different subpages of the transparent hugepage, they can - * all reuse it, because each process is reusing a different subpage. - * - * The total_mapcount is instead counting all virtual mappings of the - * subpages. If the total_mapcount is equal to "one", it tells the - * caller all mappings belong to the same "mm" and in turn the - * anon_vma of the transparent hugepage can become the vma->anon_vma - * local one as no other process may be mapping any of the subpages. 
- * - * It would be more accurate to replace page_mapcount() with - * page_trans_huge_mapcount(), however we only use - * page_trans_huge_mapcount() in the copy-on-write faults where we - * need full accuracy to avoid breaking page pinning, because - * page_trans_huge_mapcount() is slower than page_mapcount(). - */ -int page_trans_huge_mapcount(struct page *page) -{ - int i, ret; - - /* hugetlbfs shouldn't call it */ - VM_BUG_ON_PAGE(PageHuge(page), page); - - if (likely(!PageTransCompound(page))) - return atomic_read(&page->_mapcount) + 1; - - page = compound_head(page); - - ret = 0; - for (i = 0; i < thp_nr_pages(page); i++) { - int mapcount = atomic_read(&page[i]._mapcount) + 1; - ret = max(ret, mapcount); - } - - if (PageDoubleMap(page)) - ret -= 1; - - return ret + compound_mapcount(page); -} - /* Racy check whether the huge page can be split */ bool can_split_folio(struct folio *folio, int *pextra_pins) { @@ -3131,6 +3058,7 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, set_pmd_at(mm, address, pvmw->pmd, pmdswp); page_remove_rmap(page, vma, true); put_page(page); + trace_set_migration_pmd(address, pmd_val(pmdswp)); } void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) @@ -3163,5 +3091,6 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) /* No need to invalidate - it was non-present before */ update_mmu_cache_pmd(vma, address, pvmw->pmd); + trace_remove_migration_pmd(address, pmd_val(pmde)); } #endif diff --git a/mm/internal.h b/mm/internal.h index 58dc6adc19c5..cf16280ce132 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -456,7 +456,8 @@ static inline void munlock_vma_page(struct page *page, } void mlock_new_page(struct page *page); bool need_mlock_page_drain(int cpu); -void mlock_page_drain(int cpu); +void mlock_page_drain_local(void); +void mlock_page_drain_remote(int cpu); extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); @@ -539,7 +540,8 @@ static inline 
void munlock_vma_page(struct page *page, struct vm_area_struct *vma, bool compound) { } static inline void mlock_new_page(struct page *page) { } static inline bool need_mlock_page_drain(int cpu) { return false; } -static inline void mlock_page_drain(int cpu) { } +static inline void mlock_page_drain_local(void) { } +static inline void mlock_page_drain_remote(int cpu) { } static inline void vunmap_range_noflush(unsigned long start, unsigned long end) { } diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index adcd9acaef61..1f84df9c302e 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -35,7 +35,7 @@ CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) -obj-$(CONFIG_KASAN) := common.o report.o +obj-y := common.o report.o obj-$(CONFIG_KASAN_GENERIC) += init.o generic.o report_generic.o shadow.o quarantine.o obj-$(CONFIG_KASAN_HW_TAGS) += hw_tags.o report_hw_tags.o tags.o report_tags.o obj-$(CONFIG_KASAN_SW_TAGS) += init.o report_sw_tags.o shadow.o sw_tags.o tags.o report_tags.o diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 92196562687b..d9079ec11f31 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -387,7 +387,7 @@ static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip) } /* - * The object will be poisoned by kasan_free_pages() or + * The object will be poisoned by kasan_poison_pages() or * kasan_slab_free_mempool(). */ @@ -538,7 +538,7 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size, return NULL; /* - * The object has already been unpoisoned by kasan_alloc_pages() for + * The object has already been unpoisoned by kasan_unpoison_pages() for * alloc_pages() or by kasan_krealloc() for krealloc(). 
*/ diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 7355cb534e4f..07a76c46daa5 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -32,6 +32,12 @@ enum kasan_arg_mode { KASAN_ARG_MODE_ASYMM, }; +enum kasan_arg_vmalloc { + KASAN_ARG_VMALLOC_DEFAULT, + KASAN_ARG_VMALLOC_OFF, + KASAN_ARG_VMALLOC_ON, +}; + enum kasan_arg_stacktrace { KASAN_ARG_STACKTRACE_DEFAULT, KASAN_ARG_STACKTRACE_OFF, @@ -40,18 +46,28 @@ enum kasan_arg_stacktrace { static enum kasan_arg kasan_arg __ro_after_init; static enum kasan_arg_mode kasan_arg_mode __ro_after_init; -static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init; +static enum kasan_arg_vmalloc kasan_arg_vmalloc __initdata; +static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata; -/* Whether KASAN is enabled at all. */ +/* + * Whether KASAN is enabled at all. + * The value remains false until KASAN is initialized by kasan_init_hw_tags(). + */ DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled); EXPORT_SYMBOL(kasan_flag_enabled); -/* Whether the selected mode is synchronous/asynchronous/asymmetric.*/ +/* + * Whether the selected mode is synchronous, asynchronous, or asymmetric. + * Defaults to KASAN_MODE_SYNC. + */ enum kasan_mode kasan_mode __ro_after_init; EXPORT_SYMBOL_GPL(kasan_mode); +/* Whether to enable vmalloc tagging. */ +DEFINE_STATIC_KEY_TRUE(kasan_flag_vmalloc); + /* Whether to collect alloc/free stack traces. 
*/ -DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace); +DEFINE_STATIC_KEY_TRUE(kasan_flag_stacktrace); /* kasan=off/on */ static int __init early_kasan_flag(char *arg) @@ -89,6 +105,23 @@ static int __init early_kasan_mode(char *arg) } early_param("kasan.mode", early_kasan_mode); +/* kasan.vmalloc=off/on */ +static int __init early_kasan_flag_vmalloc(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strcmp(arg, "off")) + kasan_arg_vmalloc = KASAN_ARG_VMALLOC_OFF; + else if (!strcmp(arg, "on")) + kasan_arg_vmalloc = KASAN_ARG_VMALLOC_ON; + else + return -EINVAL; + + return 0; +} +early_param("kasan.vmalloc", early_kasan_flag_vmalloc); + /* kasan.stacktrace=off/on */ static int __init early_kasan_flag_stacktrace(char *arg) { @@ -116,7 +149,10 @@ static inline const char *kasan_mode_info(void) return "sync"; } -/* kasan_init_hw_tags_cpu() is called for each CPU. */ +/* + * kasan_init_hw_tags_cpu() is called for each CPU. + * Not marked as __init as a CPU can be hot-plugged after boot. + */ void kasan_init_hw_tags_cpu(void) { /* @@ -124,7 +160,11 @@ void kasan_init_hw_tags_cpu(void) * as this function is only called for MTE-capable hardware. */ - /* If KASAN is disabled via command line, don't initialize it. */ + /* + * If KASAN is disabled via command line, don't initialize it. + * When this function is called, kasan_flag_enabled is not yet + * set by kasan_init_hw_tags(). Thus, check kasan_arg instead. + */ if (kasan_arg == KASAN_ARG_OFF) return; @@ -132,12 +172,7 @@ void kasan_init_hw_tags_cpu(void) * Enable async or asymm modes only when explicitly requested * through the command line. */ - if (kasan_arg_mode == KASAN_ARG_MODE_ASYNC) - hw_enable_tagging_async(); - else if (kasan_arg_mode == KASAN_ARG_MODE_ASYMM) - hw_enable_tagging_asymm(); - else - hw_enable_tagging_sync(); + kasan_enable_tagging(); } /* kasan_init_hw_tags() is called once on boot CPU. 
*/ @@ -151,86 +186,168 @@ void __init kasan_init_hw_tags(void) if (kasan_arg == KASAN_ARG_OFF) return; - /* Enable KASAN. */ - static_branch_enable(&kasan_flag_enabled); - switch (kasan_arg_mode) { case KASAN_ARG_MODE_DEFAULT: - /* - * Default to sync mode. - */ - fallthrough; + /* Default is specified by kasan_mode definition. */ + break; case KASAN_ARG_MODE_SYNC: - /* Sync mode enabled. */ kasan_mode = KASAN_MODE_SYNC; break; case KASAN_ARG_MODE_ASYNC: - /* Async mode enabled. */ kasan_mode = KASAN_MODE_ASYNC; break; case KASAN_ARG_MODE_ASYMM: - /* Asymm mode enabled. */ kasan_mode = KASAN_MODE_ASYMM; break; } + switch (kasan_arg_vmalloc) { + case KASAN_ARG_VMALLOC_DEFAULT: + /* Default is specified by kasan_flag_vmalloc definition. */ + break; + case KASAN_ARG_VMALLOC_OFF: + static_branch_disable(&kasan_flag_vmalloc); + break; + case KASAN_ARG_VMALLOC_ON: + static_branch_enable(&kasan_flag_vmalloc); + break; + } + switch (kasan_arg_stacktrace) { case KASAN_ARG_STACKTRACE_DEFAULT: - /* Default to enabling stack trace collection. */ - static_branch_enable(&kasan_flag_stacktrace); + /* Default is specified by kasan_flag_stacktrace definition. */ break; case KASAN_ARG_STACKTRACE_OFF: - /* Do nothing, kasan_flag_stacktrace keeps its default value. */ + static_branch_disable(&kasan_flag_stacktrace); break; case KASAN_ARG_STACKTRACE_ON: static_branch_enable(&kasan_flag_stacktrace); break; } - pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, stacktrace=%s)\n", + /* KASAN is now initialized, enable it. */ + static_branch_enable(&kasan_flag_enabled); + + pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, vmalloc=%s, stacktrace=%s)\n", kasan_mode_info(), + kasan_vmalloc_enabled() ? "on" : "off", kasan_stack_collection_enabled() ? 
"on" : "off"); } -void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags) +#ifdef CONFIG_KASAN_VMALLOC + +static void unpoison_vmalloc_pages(const void *addr, u8 tag) { + struct vm_struct *area; + int i; + /* - * This condition should match the one in post_alloc_hook() in - * page_alloc.c. + * As hardware tag-based KASAN only tags VM_ALLOC vmalloc allocations + * (see the comment in __kasan_unpoison_vmalloc), all of the pages + * should belong to a single area. */ - bool init = !want_init_on_free() && want_init_on_alloc(flags); - - if (flags & __GFP_SKIP_KASAN_POISON) - SetPageSkipKASanPoison(page); + area = find_vm_area((void *)addr); + if (WARN_ON(!area)) + return; - if (flags & __GFP_ZEROTAGS) { - int i; + for (i = 0; i < area->nr_pages; i++) { + struct page *page = area->pages[i]; - for (i = 0; i != 1 << order; ++i) - tag_clear_highpage(page + i); - } else { - kasan_unpoison_pages(page, order, init); + page_kasan_tag_set(page, tag); } } -void kasan_free_pages(struct page *page, unsigned int order) +void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, + kasan_vmalloc_flags_t flags) { + u8 tag; + unsigned long redzone_start, redzone_size; + + if (!kasan_vmalloc_enabled()) + return (void *)start; + + if (!is_vmalloc_or_module_addr(start)) + return (void *)start; + + /* + * Skip unpoisoning and assigning a pointer tag for non-VM_ALLOC + * mappings as: + * + * 1. Unlike the software KASAN modes, hardware tag-based KASAN only + * supports tagging physical memory. Therefore, it can only tag a + * single mapping of normal physical pages. + * 2. Hardware tag-based KASAN can only tag memory mapped with special + * mapping protection bits, see arch_vmalloc_pgprot_modify(). + * As non-VM_ALLOC mappings can be mapped outside of vmalloc code, + * providing these bits would require tracking all non-VM_ALLOC + * mappers. 
+ * + * Thus, for VM_ALLOC mappings, hardware tag-based KASAN only tags + * the first virtual mapping, which is created by vmalloc(). + * Tagging the page_alloc memory backing that vmalloc() allocation is + * skipped, see ___GFP_SKIP_KASAN_UNPOISON. + * + * For non-VM_ALLOC allocations, page_alloc memory is tagged as usual. + */ + if (!(flags & KASAN_VMALLOC_VM_ALLOC)) + return (void *)start; + + /* + * Don't tag executable memory. + * The kernel doesn't tolerate having the PC register tagged. + */ + if (!(flags & KASAN_VMALLOC_PROT_NORMAL)) + return (void *)start; + + tag = kasan_random_tag(); + start = set_tag(start, tag); + + /* Unpoison and initialize memory up to size. */ + kasan_unpoison(start, size, flags & KASAN_VMALLOC_INIT); + + /* + * Explicitly poison and initialize the in-page vmalloc() redzone. + * Unlike software KASAN modes, hardware tag-based KASAN doesn't + * unpoison memory when populating shadow for vmalloc() space. + */ + redzone_start = round_up((unsigned long)start + size, + KASAN_GRANULE_SIZE); + redzone_size = round_up(redzone_start, PAGE_SIZE) - redzone_start; + kasan_poison((void *)redzone_start, redzone_size, KASAN_TAG_INVALID, + flags & KASAN_VMALLOC_INIT); + /* - * This condition should match the one in free_pages_prepare() in - * page_alloc.c. + * Set per-page tag flags to allow accessing physical memory for the + * vmalloc() mapping through page_address(vmalloc_to_page()). */ - bool init = want_init_on_free(); + unpoison_vmalloc_pages(start, tag); - kasan_poison_pages(page, order, init); + return (void *)start; +} + +void __kasan_poison_vmalloc(const void *start, unsigned long size) +{ + /* + * No tagging here. + * The physical pages backing the vmalloc() allocation are poisoned + * through the usual page_alloc paths. 
+ */ } +#endif + #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) -void kasan_enable_tagging_sync(void) +void kasan_enable_tagging(void) { - hw_enable_tagging_sync(); + if (kasan_arg_mode == KASAN_ARG_MODE_ASYNC) + hw_enable_tagging_async(); + else if (kasan_arg_mode == KASAN_ARG_MODE_ASYMM) + hw_enable_tagging_asymm(); + else + hw_enable_tagging_sync(); } -EXPORT_SYMBOL_GPL(kasan_enable_tagging_sync); +EXPORT_SYMBOL_GPL(kasan_enable_tagging); void kasan_force_async_fault(void) { diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index c17fa8d26ffe..d79b83d673b1 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -12,7 +12,8 @@ #include <linux/static_key.h> #include "../slab.h" -DECLARE_STATIC_KEY_FALSE(kasan_flag_stacktrace); +DECLARE_STATIC_KEY_TRUE(kasan_flag_vmalloc); +DECLARE_STATIC_KEY_TRUE(kasan_flag_stacktrace); enum kasan_mode { KASAN_MODE_SYNC, @@ -22,6 +23,11 @@ enum kasan_mode { extern enum kasan_mode kasan_mode __ro_after_init; +static inline bool kasan_vmalloc_enabled(void) +{ + return static_branch_likely(&kasan_flag_vmalloc); +} + static inline bool kasan_stack_collection_enabled(void) { return static_branch_unlikely(&kasan_flag_stacktrace); @@ -71,17 +77,19 @@ static inline bool kasan_sync_fault_possible(void) #define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ #define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ #define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ -#define KASAN_KMALLOC_FREETRACK 0xFA /* object was freed and has free track set */ +#define KASAN_VMALLOC_INVALID 0xF8 /* unallocated space in vmapped page */ #else #define KASAN_FREE_PAGE KASAN_TAG_INVALID #define KASAN_PAGE_REDZONE KASAN_TAG_INVALID #define KASAN_KMALLOC_REDZONE KASAN_TAG_INVALID #define KASAN_KMALLOC_FREE KASAN_TAG_INVALID -#define KASAN_KMALLOC_FREETRACK KASAN_TAG_INVALID +#define KASAN_VMALLOC_INVALID KASAN_TAG_INVALID /* only for SW_TAGS */ #endif +#ifdef CONFIG_KASAN_GENERIC + +#define 
KASAN_KMALLOC_FREETRACK 0xFA /* object was freed and has free track set */ #define KASAN_GLOBAL_REDZONE 0xF9 /* redzone for global variable */ -#define KASAN_VMALLOC_INVALID 0xF8 /* unallocated space in vmapped page */ /* * Stack redzone shadow values @@ -110,6 +118,8 @@ static inline bool kasan_sync_fault_possible(void) #define KASAN_ABI_VERSION 1 #endif +#endif /* CONFIG_KASAN_GENERIC */ + /* Metadata layout customization. */ #define META_BYTES_PER_BLOCK 1 #define META_BLOCKS_PER_ROW 16 @@ -117,9 +127,15 @@ static inline bool kasan_sync_fault_possible(void) #define META_MEM_BYTES_PER_ROW (META_BYTES_PER_ROW * KASAN_GRANULE_SIZE) #define META_ROWS_AROUND_ADDR 2 -struct kasan_access_info { - const void *access_addr; - const void *first_bad_addr; +enum kasan_report_type { + KASAN_REPORT_ACCESS, + KASAN_REPORT_INVALID_FREE, +}; + +struct kasan_report_info { + enum kasan_report_type type; + void *access_addr; + void *first_bad_addr; size_t access_size; bool is_write; unsigned long ip; @@ -204,6 +220,14 @@ struct kasan_free_meta { #endif }; +#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) +/* Used in KUnit-compatible KASAN tests. 
*/ +struct kunit_kasan_status { + bool report_found; + bool sync_fault; +}; +#endif + struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, const void *object); #ifdef CONFIG_KASAN_GENERIC @@ -221,7 +245,8 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr) static inline bool addr_has_metadata(const void *addr) { - return (addr >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); + return (kasan_reset_tag(addr) >= + kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); } /** @@ -251,10 +276,10 @@ static inline void kasan_print_tags(u8 addr_tag, const void *addr) { } #endif void *kasan_find_first_bad_addr(void *addr, size_t size); -const char *kasan_get_bug_type(struct kasan_access_info *info); +const char *kasan_get_bug_type(struct kasan_report_info *info); void kasan_metadata_fetch_row(char *buffer, void *row); -#if defined(CONFIG_KASAN_GENERIC) && defined(CONFIG_KASAN_STACK) +#if defined(CONFIG_KASAN_STACK) void kasan_print_address_stack_frame(const void *addr); #else static inline void kasan_print_address_stack_frame(const void *addr) { } @@ -340,12 +365,12 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) -void kasan_enable_tagging_sync(void); +void kasan_enable_tagging(void); void kasan_force_async_fault(void); #else /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */ -static inline void kasan_enable_tagging_sync(void) { } +static inline void kasan_enable_tagging(void) { } static inline void kasan_force_async_fault(void) { } #endif /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */ @@ -467,6 +492,13 @@ static inline bool kasan_arch_is_ready(void) { return true; } #error kasan_arch_is_ready only works in KASAN generic outline mode! 
#endif +#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) || IS_ENABLED(CONFIG_KASAN_MODULE_TEST) + +bool kasan_save_enable_multi_shot(void); +void kasan_restore_multi_shot(bool enabled); + +#endif + /* * Exported functions for interfaces called from assembly or from generated * code. Declarations here to avoid warning about missing declarations. diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 3ad9624dcc56..199d77cce21a 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -13,6 +13,7 @@ #include <linux/ftrace.h> #include <linux/init.h> #include <linux/kernel.h> +#include <linux/lockdep.h> #include <linux/mm.h> #include <linux/printk.h> #include <linux/sched.h> @@ -64,6 +65,40 @@ static int __init early_kasan_fault(char *arg) } early_param("kasan.fault", early_kasan_fault); +static int __init kasan_set_multi_shot(char *str) +{ + set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); + return 1; +} +__setup("kasan_multi_shot", kasan_set_multi_shot); + +/* + * Used to suppress reports within kasan_disable/enable_current() critical + * sections, which are used for marking accesses to slab metadata. + */ +static bool report_suppressed(void) +{ +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + if (current->kasan_depth) + return true; +#endif + return false; +} + +/* + * Used to avoid reporting more than one KASAN bug unless kasan_multi_shot + * is enabled. Note that KASAN tests effectively enable kasan_multi_shot + * for their duration. 
+ */ +static bool report_enabled(void) +{ + if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) + return true; + return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); +} + +#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) || IS_ENABLED(CONFIG_KASAN_MODULE_TEST) + bool kasan_save_enable_multi_shot(void) { return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); @@ -77,61 +112,87 @@ void kasan_restore_multi_shot(bool enabled) } EXPORT_SYMBOL_GPL(kasan_restore_multi_shot); -static int __init kasan_set_multi_shot(char *str) -{ - set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); - return 1; -} -__setup("kasan_multi_shot", kasan_set_multi_shot); +#endif -static void print_error_description(struct kasan_access_info *info) +#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) +static void update_kunit_status(bool sync) { - pr_err("BUG: KASAN: %s in %pS\n", - kasan_get_bug_type(info), (void *)info->ip); - if (info->access_size) - pr_err("%s of size %zu at addr %px by task %s/%d\n", - info->is_write ? "Write" : "Read", info->access_size, - info->access_addr, current->comm, task_pid_nr(current)); - else - pr_err("%s at addr %px by task %s/%d\n", - info->is_write ? "Write" : "Read", - info->access_addr, current->comm, task_pid_nr(current)); + struct kunit *test; + struct kunit_resource *resource; + struct kunit_kasan_status *status; + + test = current->kunit_test; + if (!test) + return; + + resource = kunit_find_named_resource(test, "kasan_status"); + if (!resource) { + kunit_set_failure(test); + return; + } + + status = (struct kunit_kasan_status *)resource->data; + WRITE_ONCE(status->report_found, true); + WRITE_ONCE(status->sync_fault, sync); + + kunit_put_resource(resource); } +#else +static void update_kunit_status(bool sync) { } +#endif static DEFINE_SPINLOCK(report_lock); -static void start_report(unsigned long *flags) +static void start_report(unsigned long *flags, bool sync) { - /* - * Make sure we don't end up in loop. 
- */ + /* Respect the /proc/sys/kernel/traceoff_on_warning interface. */ + disable_trace_on_warning(); + /* Update status of the currently running KASAN test. */ + update_kunit_status(sync); + /* Do not allow LOCKDEP mangling KASAN reports. */ + lockdep_off(); + /* Make sure we don't end up in loop. */ kasan_disable_current(); spin_lock_irqsave(&report_lock, *flags); pr_err("==================================================================\n"); } -static void end_report(unsigned long *flags, unsigned long addr) +static void end_report(unsigned long *flags, void *addr) { - if (!kasan_async_fault_possible()) - trace_error_report_end(ERROR_DETECTOR_KASAN, addr); + if (addr) + trace_error_report_end(ERROR_DETECTOR_KASAN, + (unsigned long)addr); pr_err("==================================================================\n"); - add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&report_lock, *flags); - if (panic_on_warn && !test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) { - /* - * This thread may hit another WARN() in the panic path. - * Resetting this prevents additional WARN() from panicking the - * system on this thread. Other threads are blocked by the - * panic_mutex in panic(). - */ - panic_on_warn = 0; + if (panic_on_warn && !test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) panic("panic_on_warn set ...\n"); - } if (kasan_arg_fault == KASAN_ARG_FAULT_PANIC) panic("kasan.fault=panic set ...\n"); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + lockdep_on(); kasan_enable_current(); } +static void print_error_description(struct kasan_report_info *info) +{ + if (info->type == KASAN_REPORT_INVALID_FREE) { + pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", + (void *)info->ip); + return; + } + + pr_err("BUG: KASAN: %s in %pS\n", + kasan_get_bug_type(info), (void *)info->ip); + if (info->access_size) + pr_err("%s of size %zu at addr %px by task %s/%d\n", + info->is_write ? 
"Write" : "Read", info->access_size, + info->access_addr, current->comm, task_pid_nr(current)); + else + pr_err("%s at addr %px by task %s/%d\n", + info->is_write ? "Write" : "Read", + info->access_addr, current->comm, task_pid_nr(current)); +} + static void print_track(struct kasan_track *track, const char *prefix) { pr_err("%s by task %u:\n", prefix, track->pid); @@ -170,9 +231,6 @@ static void describe_object_addr(struct kmem_cache *cache, void *object, " which belongs to the cache %s of size %d\n", object, cache->name, cache->object_size); - if (!addr) - return; - if (access_addr < object_addr) { rel_type = "to the left"; rel_bytes = object_addr - access_addr; @@ -261,19 +319,43 @@ static void print_address_description(void *addr, u8 tag) void *object = nearest_obj(cache, slab, addr); describe_object(cache, object, addr, tag); + pr_err("\n"); } if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) { pr_err("The buggy address belongs to the variable:\n"); pr_err(" %pS\n", addr); + pr_err("\n"); + } + + if (object_is_on_stack(addr)) { + /* + * Currently, KASAN supports printing frame information only + * for accesses to the task's own stack. 
+ */ + kasan_print_address_stack_frame(addr); + pr_err("\n"); + } + + if (is_vmalloc_addr(addr)) { + struct vm_struct *va = find_vm_area(addr); + + if (va) { + pr_err("The buggy address belongs to the virtual mapping at\n" + " [%px, %px) created by:\n" + " %pS\n", + va->addr, va->addr + va->size, va->caller); + pr_err("\n"); + + page = vmalloc_to_page(addr); + } } if (page) { - pr_err("The buggy address belongs to the page:\n"); + pr_err("The buggy address belongs to the physical page:\n"); dump_page(page, "kasan: bad access detected"); + pr_err("\n"); } - - kasan_print_address_stack_frame(addr); } static bool meta_row_is_guilty(const void *row, const void *addr) @@ -332,138 +414,110 @@ static void print_memory_metadata(const void *addr) } } -static bool report_enabled(void) +static void print_report(struct kasan_report_info *info) { -#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) - if (current->kasan_depth) - return false; -#endif - if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) - return true; - return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); -} + void *tagged_addr = info->access_addr; + void *untagged_addr = kasan_reset_tag(tagged_addr); + u8 tag = get_tag(tagged_addr); -#if IS_ENABLED(CONFIG_KUNIT) -static void kasan_update_kunit_status(struct kunit *cur_test) -{ - struct kunit_resource *resource; - struct kunit_kasan_expectation *kasan_data; - - resource = kunit_find_named_resource(cur_test, "kasan_data"); + print_error_description(info); + if (addr_has_metadata(untagged_addr)) + kasan_print_tags(tag, info->first_bad_addr); + pr_err("\n"); - if (!resource) { - kunit_set_failure(cur_test); - return; + if (addr_has_metadata(untagged_addr)) { + print_address_description(untagged_addr, tag); + print_memory_metadata(info->first_bad_addr); + } else { + dump_stack_lvl(KERN_ERR); } - - kasan_data = (struct kunit_kasan_expectation *)resource->data; - WRITE_ONCE(kasan_data->report_found, true); - kunit_put_resource(resource); } -#endif 
/* IS_ENABLED(CONFIG_KUNIT) */ -void kasan_report_invalid_free(void *object, unsigned long ip) +void kasan_report_invalid_free(void *ptr, unsigned long ip) { unsigned long flags; - u8 tag = get_tag(object); - - object = kasan_reset_tag(object); + struct kasan_report_info info; -#if IS_ENABLED(CONFIG_KUNIT) - if (current->kunit_test) - kasan_update_kunit_status(current->kunit_test); -#endif /* IS_ENABLED(CONFIG_KUNIT) */ + /* + * Do not check report_suppressed(), as an invalid-free cannot be + * caused by accessing slab metadata and thus should not be + * suppressed by kasan_disable/enable_current() critical sections. + */ + if (unlikely(!report_enabled())) + return; - start_report(&flags); - pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip); - kasan_print_tags(tag, object); - pr_err("\n"); - print_address_description(object, tag); - pr_err("\n"); - print_memory_metadata(object); - end_report(&flags, (unsigned long)object); -} + start_report(&flags, true); -#ifdef CONFIG_KASAN_HW_TAGS -void kasan_report_async(void) -{ - unsigned long flags; + info.type = KASAN_REPORT_INVALID_FREE; + info.access_addr = ptr; + info.first_bad_addr = kasan_reset_tag(ptr); + info.access_size = 0; + info.is_write = false; + info.ip = ip; -#if IS_ENABLED(CONFIG_KUNIT) - if (current->kunit_test) - kasan_update_kunit_status(current->kunit_test); -#endif /* IS_ENABLED(CONFIG_KUNIT) */ + print_report(&info); - start_report(&flags); - pr_err("BUG: KASAN: invalid-access\n"); - pr_err("Asynchronous mode enabled: no access details available\n"); - pr_err("\n"); - dump_stack_lvl(KERN_ERR); - end_report(&flags, 0); + end_report(&flags, ptr); } -#endif /* CONFIG_KASAN_HW_TAGS */ -static void __kasan_report(unsigned long addr, size_t size, bool is_write, - unsigned long ip) +/* + * kasan_report() is the only reporting function that uses + * user_access_save/restore(): kasan_report_invalid_free() cannot be called + * from a UACCESS region, and kasan_report_async() is not used on 
x86. + */ +bool kasan_report(unsigned long addr, size_t size, bool is_write, + unsigned long ip) { - struct kasan_access_info info; - void *tagged_addr; - void *untagged_addr; - unsigned long flags; - -#if IS_ENABLED(CONFIG_KUNIT) - if (current->kunit_test) - kasan_update_kunit_status(current->kunit_test); -#endif /* IS_ENABLED(CONFIG_KUNIT) */ - - disable_trace_on_warning(); + bool ret = true; + void *ptr = (void *)addr; + unsigned long ua_flags = user_access_save(); + unsigned long irq_flags; + struct kasan_report_info info; + + if (unlikely(report_suppressed()) || unlikely(!report_enabled())) { + ret = false; + goto out; + } - tagged_addr = (void *)addr; - untagged_addr = kasan_reset_tag(tagged_addr); + start_report(&irq_flags, true); - info.access_addr = tagged_addr; - if (addr_has_metadata(untagged_addr)) - info.first_bad_addr = - kasan_find_first_bad_addr(tagged_addr, size); - else - info.first_bad_addr = untagged_addr; + info.type = KASAN_REPORT_ACCESS; + info.access_addr = ptr; + info.first_bad_addr = kasan_find_first_bad_addr(ptr, size); info.access_size = size; info.is_write = is_write; info.ip = ip; - start_report(&flags); + print_report(&info); - print_error_description(&info); - if (addr_has_metadata(untagged_addr)) - kasan_print_tags(get_tag(tagged_addr), info.first_bad_addr); - pr_err("\n"); + end_report(&irq_flags, ptr); - if (addr_has_metadata(untagged_addr)) { - print_address_description(untagged_addr, get_tag(tagged_addr)); - pr_err("\n"); - print_memory_metadata(info.first_bad_addr); - } else { - dump_stack_lvl(KERN_ERR); - } +out: + user_access_restore(ua_flags); - end_report(&flags, addr); + return ret; } -bool kasan_report(unsigned long addr, size_t size, bool is_write, - unsigned long ip) +#ifdef CONFIG_KASAN_HW_TAGS +void kasan_report_async(void) { - unsigned long flags = user_access_save(); - bool ret = false; - - if (likely(report_enabled())) { - __kasan_report(addr, size, is_write, ip); - ret = true; - } + unsigned long flags; - 
user_access_restore(flags); + /* + * Do not check report_suppressed(), as kasan_disable/enable_current() + * critical sections do not affect Hardware Tag-Based KASAN. + */ + if (unlikely(!report_enabled())) + return; - return ret; + start_report(&flags, false); + pr_err("BUG: KASAN: invalid-access\n"); + pr_err("Asynchronous fault: no details available\n"); + pr_err("\n"); + dump_stack_lvl(KERN_ERR); + end_report(&flags, NULL); } +#endif /* CONFIG_KASAN_HW_TAGS */ #ifdef CONFIG_KASAN_INLINE /* diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 139615ef326b..efc5e79a103f 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -34,12 +34,16 @@ void *kasan_find_first_bad_addr(void *addr, size_t size) { void *p = addr; + if (!addr_has_metadata(p)) + return p; + while (p < addr + size && !(*(u8 *)kasan_mem_to_shadow(p))) p += KASAN_GRANULE_SIZE; + return p; } -static const char *get_shadow_bug_type(struct kasan_access_info *info) +static const char *get_shadow_bug_type(struct kasan_report_info *info) { const char *bug_type = "unknown-crash"; u8 *shadow_addr; @@ -91,7 +95,7 @@ static const char *get_shadow_bug_type(struct kasan_access_info *info) return bug_type; } -static const char *get_wild_bug_type(struct kasan_access_info *info) +static const char *get_wild_bug_type(struct kasan_report_info *info) { const char *bug_type = "unknown-crash"; @@ -105,7 +109,7 @@ static const char *get_wild_bug_type(struct kasan_access_info *info) return bug_type; } -const char *kasan_get_bug_type(struct kasan_access_info *info) +const char *kasan_get_bug_type(struct kasan_report_info *info) { /* * If access_size is a negative number, then it has reason to be @@ -180,7 +184,7 @@ static void print_decoded_frame_descr(const char *frame_descr) return; pr_err("\n"); - pr_err("this frame has %lu %s:\n", num_objects, + pr_err("This frame has %lu %s:\n", num_objects, num_objects == 1 ? 
"object" : "objects"); while (num_objects--) { @@ -211,6 +215,7 @@ static void print_decoded_frame_descr(const char *frame_descr) } } +/* Returns true only if the address is on the current task's stack. */ static bool __must_check get_address_stack_frame_info(const void *addr, unsigned long *offset, const char **frame_descr, @@ -224,13 +229,6 @@ static bool __must_check get_address_stack_frame_info(const void *addr, BUILD_BUG_ON(IS_ENABLED(CONFIG_STACK_GROWSUP)); - /* - * NOTE: We currently only support printing frame information for - * accesses to the task's own stack. - */ - if (!object_is_on_stack(addr)) - return false; - aligned_addr = round_down((unsigned long)addr, sizeof(long)); mem_ptr = round_down(aligned_addr, KASAN_GRANULE_SIZE); shadow_ptr = kasan_mem_to_shadow((void *)aligned_addr); @@ -269,17 +267,17 @@ void kasan_print_address_stack_frame(const void *addr) const char *frame_descr; const void *frame_pc; + if (WARN_ON(!object_is_on_stack(addr))) + return; + + pr_err("The buggy address belongs to stack of task %s/%d\n", + current->comm, task_pid_nr(current)); + if (!get_address_stack_frame_info(addr, &offset, &frame_descr, &frame_pc)) return; - /* - * get_address_stack_frame_info only returns true if the given addr is - * on the current task's stack. - */ - pr_err("\n"); - pr_err("addr %px is located in stack of task %s/%d at offset %lu in frame:\n", - addr, current->comm, task_pid_nr(current), offset); + pr_err(" and is located at offset %lu in frame:\n", offset); pr_err(" %pS\n", frame_pc); if (!frame_descr) diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c index 5dbbbb930e7a..f3d3be614e4b 100644 --- a/mm/kasan/report_hw_tags.c +++ b/mm/kasan/report_hw_tags.c @@ -17,6 +17,7 @@ void *kasan_find_first_bad_addr(void *addr, size_t size) { + /* Return the same value regardless of whether addr_has_metadata(). 
*/ return kasan_reset_tag(addr); } diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c index d2298c357834..7a26397297ed 100644 --- a/mm/kasan/report_sw_tags.c +++ b/mm/kasan/report_sw_tags.c @@ -16,6 +16,7 @@ #include <linux/mm.h> #include <linux/printk.h> #include <linux/sched.h> +#include <linux/sched/task_stack.h> #include <linux/slab.h> #include <linux/stackdepot.h> #include <linux/stacktrace.h> @@ -35,8 +36,12 @@ void *kasan_find_first_bad_addr(void *addr, size_t size) void *p = kasan_reset_tag(addr); void *end = p + size; + if (!addr_has_metadata(p)) + return p; + while (p < end && tag == *(u8 *)kasan_mem_to_shadow(p)) p += KASAN_GRANULE_SIZE; + return p; } @@ -51,3 +56,14 @@ void kasan_print_tags(u8 addr_tag, const void *addr) pr_err("Pointer tag: [%02x], memory tag: [%02x]\n", addr_tag, *shadow); } + +#ifdef CONFIG_KASAN_STACK +void kasan_print_address_stack_frame(const void *addr) +{ + if (WARN_ON(!object_is_on_stack(addr))) + return; + + pr_err("The buggy address belongs to stack of task %s/%d\n", + current->comm, task_pid_nr(current)); +} +#endif diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index 1b41de88c53e..e25d2166e813 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -7,7 +7,7 @@ #include "kasan.h" #include "../slab.h" -const char *kasan_get_bug_type(struct kasan_access_info *info) +const char *kasan_get_bug_type(struct kasan_report_info *info) { #ifdef CONFIG_KASAN_TAGS_IDENTIFY struct kasan_alloc_meta *alloc_meta; diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 94136f84b449..a4f07de21771 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -345,27 +345,6 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) return 0; } -/* - * Poison the shadow for a vmalloc region. Called as part of the - * freeing process at the time the region is freed. 
- */ -void kasan_poison_vmalloc(const void *start, unsigned long size) -{ - if (!is_vmalloc_or_module_addr(start)) - return; - - size = round_up(size, KASAN_GRANULE_SIZE); - kasan_poison(start, size, KASAN_VMALLOC_INVALID, false); -} - -void kasan_unpoison_vmalloc(const void *start, unsigned long size) -{ - if (!is_vmalloc_or_module_addr(start)) - return; - - kasan_unpoison(start, size, false); -} - static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, void *unused) { @@ -496,9 +475,48 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, } } +void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, + kasan_vmalloc_flags_t flags) +{ + /* + * Software KASAN modes unpoison both VM_ALLOC and non-VM_ALLOC + * mappings, so the KASAN_VMALLOC_VM_ALLOC flag is ignored. + * Software KASAN modes can't optimize zeroing memory by combining it + * with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored. + */ + + if (!is_vmalloc_or_module_addr(start)) + return (void *)start; + + /* + * Don't tag executable memory with the tag-based mode. + * The kernel doesn't tolerate having the PC register tagged. + */ + if (IS_ENABLED(CONFIG_KASAN_SW_TAGS) && + !(flags & KASAN_VMALLOC_PROT_NORMAL)) + return (void *)start; + + start = set_tag(start, kasan_random_tag()); + kasan_unpoison(start, size, false); + return (void *)start; +} + +/* + * Poison the shadow for a vmalloc region. Called as part of the + * freeing process at the time the region is freed. 
+ */ +void __kasan_poison_vmalloc(const void *start, unsigned long size) +{ + if (!is_vmalloc_or_module_addr(start)) + return; + + size = round_up(size, KASAN_GRANULE_SIZE); + kasan_poison(start, size, KASAN_VMALLOC_INVALID, false); +} + #else /* CONFIG_KASAN_VMALLOC */ -int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) +int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask) { void *ret; size_t scaled_size; @@ -534,7 +552,7 @@ int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) return -ENOMEM; } -void kasan_free_shadow(const struct vm_struct *vm) +void kasan_free_module_shadow(const struct vm_struct *vm) { if (vm->flags & VM_KASAN) vfree(kasan_mem_to_shadow(vm->addr)); diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 2f9fdfde1941..a203747ad2c0 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -566,6 +566,8 @@ static unsigned long kfence_init_pool(void) * enters __slab_free() slow-path. */ for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { + struct slab *slab = page_slab(&pages[i]); + if (!i || (i % 2)) continue; @@ -573,7 +575,11 @@ static unsigned long kfence_init_pool(void) if (WARN_ON(compound_head(&pages[i]) != &pages[i])) return addr; - __SetPageSlab(&pages[i]); + __folio_set_slab(slab_folio(slab)); +#ifdef CONFIG_MEMCG + slab->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg | + MEMCG_DATA_OBJCGS; +#endif } /* @@ -1033,6 +1039,9 @@ void __kfence_free(void *addr) { struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); +#ifdef CONFIG_MEMCG + KFENCE_WARN_ON(meta->objcg); +#endif /* * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing * the object, as the object page may be recycled for other-typed diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h index 2a2d5de9d379..9a6c4b1b12a8 100644 --- a/mm/kfence/kfence.h +++ b/mm/kfence/kfence.h @@ -89,6 +89,9 @@ struct kfence_metadata { struct kfence_track free_track; /* For updating alloc_covered on frees. 
*/ u32 alloc_stack_hash; +#ifdef CONFIG_MEMCG + struct obj_cgroup *objcg; +#endif }; extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS]; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 1cdf7c38b9e5..a4e5eaf3eb01 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -46,7 +46,6 @@ enum scan_result { SCAN_VMA_NULL, SCAN_VMA_CHECK, SCAN_ADDRESS_RANGE, - SCAN_SWAP_CACHE_PAGE, SCAN_DEL_PAGE_LRU, SCAN_ALLOC_HUGE_PAGE_FAIL, SCAN_CGROUP_CHARGE_FAIL, @@ -683,16 +682,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, result = SCAN_PAGE_COUNT; goto out; } - if (!pte_write(pteval) && PageSwapCache(page) && - !reuse_swap_page(page)) { - /* - * Page is in the swap cache and cannot be re-used. - * It cannot be collapsed into a THP. - */ - unlock_page(page); - result = SCAN_SWAP_CACHE_PAGE; - goto out; - } /* * Isolate the page to avoid collapsing an hugepage diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 7580baa76af1..acd7cbb82e16 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -796,6 +796,8 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) unsigned long flags; struct kmemleak_object *object; struct kmemleak_scan_area *area = NULL; + unsigned long untagged_ptr; + unsigned long untagged_objp; object = find_and_get_object(ptr, 1); if (!object) { @@ -804,6 +806,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) return; } + untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr); + untagged_objp = (unsigned long)kasan_reset_tag((void *)object->pointer); + if (scan_area_cache) area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); @@ -815,8 +820,8 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) goto out_unlock; } if (size == SIZE_MAX) { - size = object->pointer + object->size - ptr; - } else if (ptr + size > object->pointer + object->size) { + size = untagged_objp + object->size - untagged_ptr; + } else if (untagged_ptr + size > untagged_objp + 
object->size) { kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); dump_object_info(object); kmem_cache_free(scan_area_cache, area); diff --git a/mm/maccess.c b/mm/maccess.c index 3fed2b876539..5f4d240f67ec 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -12,8 +12,6 @@ bool __weak copy_from_kernel_nofault_allowed(const void *unsafe_src, return true; } -#ifdef HAVE_GET_KERNEL_NOFAULT - #define copy_from_kernel_nofault_loop(dst, src, len, type, err_label) \ while (len >= sizeof(type)) { \ __get_kernel_nofault(dst, src, type, err_label); \ @@ -102,112 +100,6 @@ Efault: dst[-1] = '\0'; return -EFAULT; } -#else /* HAVE_GET_KERNEL_NOFAULT */ -/** - * copy_from_kernel_nofault(): safely attempt to read from kernel-space - * @dst: pointer to the buffer that shall take the data - * @src: address to read from - * @size: size of the data chunk - * - * Safely read from kernel address @src to the buffer at @dst. If a kernel - * fault happens, handle that and return -EFAULT. If @src is not a valid kernel - * address, return -ERANGE. - * - * We ensure that the copy_from_user is executed in atomic context so that - * do_page_fault() doesn't attempt to take mmap_lock. This makes - * copy_from_kernel_nofault() suitable for use within regions where the caller - * already holds mmap_lock, or other locks which nest inside mmap_lock. 
- */ -long copy_from_kernel_nofault(void *dst, const void *src, size_t size) -{ - long ret; - mm_segment_t old_fs = get_fs(); - - if (!copy_from_kernel_nofault_allowed(src, size)) - return -ERANGE; - - set_fs(KERNEL_DS); - pagefault_disable(); - ret = __copy_from_user_inatomic(dst, (__force const void __user *)src, - size); - pagefault_enable(); - set_fs(old_fs); - - if (ret) - return -EFAULT; - return 0; -} -EXPORT_SYMBOL_GPL(copy_from_kernel_nofault); - -/** - * copy_to_kernel_nofault(): safely attempt to write to a location - * @dst: address to write to - * @src: pointer to the data that shall be written - * @size: size of the data chunk - * - * Safely write to address @dst from the buffer at @src. If a kernel fault - * happens, handle that and return -EFAULT. - */ -long copy_to_kernel_nofault(void *dst, const void *src, size_t size) -{ - long ret; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - pagefault_disable(); - ret = __copy_to_user_inatomic((__force void __user *)dst, src, size); - pagefault_enable(); - set_fs(old_fs); - - if (ret) - return -EFAULT; - return 0; -} - -/** - * strncpy_from_kernel_nofault: - Copy a NUL terminated string from unsafe - * address. - * @dst: Destination address, in kernel space. This buffer must be at - * least @count bytes long. - * @unsafe_addr: Unsafe address. - * @count: Maximum number of bytes to copy, including the trailing NUL. - * - * Copies a NUL-terminated string from unsafe address to kernel buffer. - * - * On success, returns the length of the string INCLUDING the trailing NUL. - * - * If access fails, returns -EFAULT (some data may have been copied and the - * trailing NUL added). If @unsafe_addr is not a valid kernel address, return - * -ERANGE. - * - * If @count is smaller than the length of the string, copies @count-1 bytes, - * sets the last byte of @dst buffer to NUL and returns @count. 
- */ -long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count) -{ - mm_segment_t old_fs = get_fs(); - const void *src = unsafe_addr; - long ret; - - if (unlikely(count <= 0)) - return 0; - if (!copy_from_kernel_nofault_allowed(unsafe_addr, count)) - return -ERANGE; - - set_fs(KERNEL_DS); - pagefault_disable(); - - do { - ret = __get_user(*dst++, (const char __user __force *)src++); - } while (dst[-1] && ret == 0 && src - unsafe_addr < count); - - dst[-1] = '\0'; - pagefault_enable(); - set_fs(old_fs); - - return ret ? -EFAULT : src - unsafe_addr; -} -#endif /* HAVE_GET_KERNEL_NOFAULT */ /** * copy_from_user_nofault(): safely attempt to read from a user-space location @@ -221,14 +113,11 @@ long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count) long copy_from_user_nofault(void *dst, const void __user *src, size_t size) { long ret = -EFAULT; - mm_segment_t old_fs = force_uaccess_begin(); - if (access_ok(src, size)) { pagefault_disable(); ret = __copy_from_user_inatomic(dst, src, size); pagefault_enable(); } - force_uaccess_end(old_fs); if (ret) return -EFAULT; @@ -248,14 +137,12 @@ EXPORT_SYMBOL_GPL(copy_from_user_nofault); long copy_to_user_nofault(void __user *dst, const void *src, size_t size) { long ret = -EFAULT; - mm_segment_t old_fs = force_uaccess_begin(); if (access_ok(dst, size)) { pagefault_disable(); ret = __copy_to_user_inatomic(dst, src, size); pagefault_enable(); } - force_uaccess_end(old_fs); if (ret) return -EFAULT; @@ -284,17 +171,14 @@ EXPORT_SYMBOL_GPL(copy_to_user_nofault); long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, long count) { - mm_segment_t old_fs; long ret; if (unlikely(count <= 0)) return 0; - old_fs = force_uaccess_begin(); pagefault_disable(); ret = strncpy_from_user(dst, unsafe_addr, count); pagefault_enable(); - force_uaccess_end(old_fs); if (ret >= count) { ret = count; @@ -324,14 +208,11 @@ long strncpy_from_user_nofault(char *dst, const void __user 
*unsafe_addr, */ long strnlen_user_nofault(const void __user *unsafe_addr, long count) { - mm_segment_t old_fs; int ret; - old_fs = force_uaccess_begin(); pagefault_disable(); ret = strnlen_user(unsafe_addr, count); pagefault_enable(); - force_uaccess_end(old_fs); return ret; } diff --git a/mm/madvise.c b/mm/madvise.c index 39b712fd8300..1873616a37d2 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -52,6 +52,7 @@ static int madvise_need_mmap_write(int behavior) case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: + case MADV_DONTNEED_LOCKED: case MADV_COLD: case MADV_PAGEOUT: case MADV_FREE: @@ -504,7 +505,7 @@ static void madvise_cold_page_range(struct mmu_gather *tlb, static inline bool can_madv_lru_vma(struct vm_area_struct *vma) { - return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)); + return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB)); } static long madvise_cold(struct vm_area_struct *vma, @@ -777,6 +778,29 @@ static long madvise_dontneed_single_vma(struct vm_area_struct *vma, return 0; } +static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, + unsigned long start, + unsigned long *end, + int behavior) +{ + if (!is_vm_hugetlb_page(vma)) { + unsigned int forbidden = VM_PFNMAP; + + if (behavior != MADV_DONTNEED_LOCKED) + forbidden |= VM_LOCKED; + + return !(vma->vm_flags & forbidden); + } + + if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED) + return false; + if (start & ~huge_page_mask(hstate_vma(vma))) + return false; + + *end = ALIGN(*end, huge_page_size(hstate_vma(vma))); + return true; +} + static long madvise_dontneed_free(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, @@ -785,7 +809,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; *prev = vma; - if (!can_madv_lru_vma(vma)) + if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior)) return -EINVAL; if (!userfaultfd_remove(vma, start, end)) { 
@@ -807,7 +831,12 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, */ return -ENOMEM; } - if (!can_madv_lru_vma(vma)) + /* + * Potential end adjustment for hugetlb vma is OK as + * the check below keeps end within vma. + */ + if (!madvise_dontneed_free_valid_vma(vma, start, &end, + behavior)) return -EINVAL; if (end > vma->vm_end) { /* @@ -827,7 +856,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, VM_WARN_ON(start >= end); } - if (behavior == MADV_DONTNEED) + if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED) return madvise_dontneed_single_vma(vma, start, end); else if (behavior == MADV_FREE) return madvise_free_single_vma(vma, start, end); @@ -966,6 +995,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, return madvise_pageout(vma, prev, start, end); case MADV_FREE: case MADV_DONTNEED: + case MADV_DONTNEED_LOCKED: return madvise_dontneed_free(vma, prev, start, end, behavior); case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: @@ -1096,6 +1126,7 @@ madvise_behavior_valid(int behavior) case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: + case MADV_DONTNEED_LOCKED: case MADV_FREE: case MADV_COLD: case MADV_PAGEOUT: @@ -1433,16 +1464,9 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, while (iov_iter_count(&iter)) { iovec = iov_iter_iovec(&iter); - /* - * do_madvise returns ENOMEM if unmapped holes are present - * in the passed VMA. process_madvise() is expected to skip - * unmapped holes passed to it in the 'struct iovec' list - * and not fail because of them. Thus treat -ENOMEM return - * from do_madvise as valid and continue processing. 
- */ ret = do_madvise(mm, (unsigned long)iovec.iov_base, iovec.iov_len, behavior); - if (ret < 0 && ret != -ENOMEM) + if (ret < 0) break; iov_iter_advance(&iter, iovec.iov_len); } diff --git a/mm/memblock.c b/mm/memblock.c index b12a364f2766..e4f03a6e8e56 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1284,11 +1284,10 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, { int zone_nid = zone_to_nid(zone); phys_addr_t spa, epa; - int nid; __next_mem_range(idx, zone_nid, MEMBLOCK_NONE, &memblock.memory, &memblock.reserved, - &spa, &epa, &nid); + &spa, &epa, NULL); while (*idx != U64_MAX) { unsigned long epfn = PFN_DOWN(epa); @@ -1315,7 +1314,7 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, __next_mem_range(idx, zone_nid, MEMBLOCK_NONE, &memblock.memory, &memblock.reserved, - &spa, &epa, &nid); + &spa, &epa, NULL); } /* signal end of iteration */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d495c2acb9f0..725f76723220 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -60,7 +60,7 @@ #include <linux/oom.h> #include <linux/lockdep.h> #include <linux/file.h> -#include <linux/tracehook.h> +#include <linux/resume_user_mode.h> #include <linux/psi.h> #include <linux/seq_buf.h> #include "internal.h" diff --git a/mm/memory.c b/mm/memory.c index 7c40850b7124..76e3af9639d9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3287,19 +3287,35 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) if (PageAnon(vmf->page)) { struct page *page = vmf->page; - /* PageKsm() doesn't necessarily raise the page refcount */ - if (PageKsm(page) || page_count(page) != 1) + /* + * We have to verify under page lock: these early checks are + * just an optimization to avoid locking the page and freeing + * the swapcache if there is little hope that we can reuse. + * + * PageKsm() doesn't necessarily raise the page refcount. 
+ */ + if (PageKsm(page) || page_count(page) > 3) + goto copy; + if (!PageLRU(page)) + /* + * Note: We cannot easily detect+handle references from + * remote LRU pagevecs or references to PageLRU() pages. + */ + lru_add_drain(); + if (page_count(page) > 1 + PageSwapCache(page)) goto copy; if (!trylock_page(page)) goto copy; - if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) { + if (PageSwapCache(page)) + try_to_free_swap(page); + if (PageKsm(page) || page_count(page) != 1) { unlock_page(page); goto copy; } /* - * Ok, we've got the only map reference, and the only - * page count reference, and the page is locked, - * it's dark out, and we're wearing sunglasses. Hit it. + * Ok, we've got the only page reference from our mapping + * and the page is locked, it's dark out, and we're wearing + * sunglasses. Hit it. */ unlock_page(page); wp_page_reuse(vmf); @@ -3372,11 +3388,11 @@ void unmap_mapping_folio(struct folio *folio) details.even_cows = false; details.single_folio = folio; - i_mmap_lock_write(mapping); + i_mmap_lock_read(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) unmap_mapping_range_tree(&mapping->i_mmap, first_index, last_index, &details); - i_mmap_unlock_write(mapping); + i_mmap_unlock_read(mapping); } /** @@ -3402,11 +3418,11 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, if (last_index < first_index) last_index = ULONG_MAX; - i_mmap_lock_write(mapping); + i_mmap_lock_read(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) unmap_mapping_range_tree(&mapping->i_mmap, first_index, last_index, &details); - i_mmap_unlock_write(mapping); + i_mmap_unlock_read(mapping); } EXPORT_SYMBOL_GPL(unmap_mapping_pages); @@ -3473,6 +3489,25 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) return 0; } +static inline bool should_try_to_free_swap(struct page *page, + struct vm_area_struct *vma, + unsigned int fault_flags) +{ + if (!PageSwapCache(page)) + return false; + 
if (mem_cgroup_swap_full(page) || (vma->vm_flags & VM_LOCKED) || + PageMlocked(page)) + return true; + /* + * If we want to map a page that's in the swapcache writable, we + * have to detect via the refcount if we're really the exclusive + * user. Try freeing the swapcache to get rid of the swapcache + * reference only in case it's likely that we'll be the exclusive user. + */ + return (fault_flags & FAULT_FLAG_WRITE) && !PageKsm(page) && + page_count(page) == 2; +} + /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. @@ -3591,21 +3626,39 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out_release; } - /* - * Make sure try_to_free_swap or reuse_swap_page or swapoff did not - * release the swapcache from under us. The page pin, and pte_same - * test below, are not enough to exclude that. Even if it is still - * swapcache, we need to check that the page's swap has not changed. - */ - if (unlikely((!PageSwapCache(page) || - page_private(page) != entry.val)) && swapcache) - goto out_page; - - page = ksm_might_need_to_copy(page, vma, vmf->address); - if (unlikely(!page)) { - ret = VM_FAULT_OOM; - page = swapcache; - goto out_page; + if (swapcache) { + /* + * Make sure try_to_free_swap or swapoff did not release the + * swapcache from under us. The page pin, and pte_same test + * below, are not enough to exclude that. Even if it is still + * swapcache, we need to check that the page's swap has not + * changed. + */ + if (unlikely(!PageSwapCache(page) || + page_private(page) != entry.val)) + goto out_page; + + /* + * KSM sometimes has to copy on read faults, for example, if + * page->index of !PageKSM() pages would be nonlinear inside the + * anon VMA -- PageKSM() is lost on actual swapout.
+ */ + page = ksm_might_need_to_copy(page, vma, vmf->address); + if (unlikely(!page)) { + ret = VM_FAULT_OOM; + page = swapcache; + goto out_page; + } + + /* + * If we want to map a page that's in the swapcache writable, we + * have to detect via the refcount if we're really the exclusive + * owner. Try removing the extra reference from the local LRU + * pagevecs if required. + */ + if ((vmf->flags & FAULT_FLAG_WRITE) && page == swapcache && + !PageKsm(page) && !PageLRU(page)) + lru_add_drain(); } cgroup_throttle_swaprate(page, GFP_KERNEL); @@ -3624,19 +3677,25 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) } /* - * The page isn't present yet, go ahead with the fault. - * - * Be careful about the sequence of operations here. - * To get its accounting right, reuse_swap_page() must be called - * while the page is counted on swap but not yet in mapcount i.e. - * before page_add_anon_rmap() and swap_free(); try_to_free_swap() - * must be called after the swap_free(), or it will never succeed. + * Remove the swap entry and conditionally try to free up the swapcache. + * We're already holding a reference on the page but haven't mapped it + * yet. */ + swap_free(entry); + if (should_try_to_free_swap(page, vma, vmf->flags)) + try_to_free_swap(page); inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); - if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { + + /* + * Same logic as in do_wp_page(); however, optimize for fresh pages + * that are certainly not shared because we just allocated them without + * exposing them to the swapcache. 
+ */ + if ((vmf->flags & FAULT_FLAG_WRITE) && !PageKsm(page) && + (page != swapcache || page_count(page) == 1)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); vmf->flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; @@ -3662,10 +3721,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); - swap_free(entry); - if (mem_cgroup_swap_full(page) || - (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) - try_to_free_swap(page); unlock_page(page); if (page != swapcache && swapcache) { /* @@ -3863,14 +3918,18 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) return ret; if (unlikely(PageHWPoison(vmf->page))) { + struct page *page = vmf->page; vm_fault_t poisonret = VM_FAULT_HWPOISON; if (ret & VM_FAULT_LOCKED) { + if (page_mapped(page)) + unmap_mapping_pages(page_mapping(page), + page->index, 1, false); /* Retry if a clean page was removed from the cache. */ - if (invalidate_inode_page(vmf->page)) - poisonret = 0; - unlock_page(vmf->page); + if (invalidate_inode_page(page)) + poisonret = VM_FAULT_NOPAGE; + unlock_page(page); } - put_page(vmf->page); + put_page(page); vmf->page = NULL; return poisonret; } @@ -5255,14 +5314,6 @@ void print_vma_addr(char *prefix, unsigned long ip) #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP) void __might_fault(const char *file, int line) { - /* - * Some code (nfs/sunrpc) uses socket ops on kernel memory while - * holding the mmap_lock, this is safe because kernel memory doesn't - * get paged out, therefore we'll never actually fault, and the - * below annotations will generate false positives. 
- */ - if (uaccess_kernel()) - return; if (pagefault_disabled()) return; __might_sleep(file, line); diff --git a/mm/memremap.c b/mm/memremap.c index c17eca4a48ca..af0223605e69 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -456,8 +456,6 @@ void free_zone_device_page(struct page *page) if (WARN_ON_ONCE(!page->pgmap->ops || !page->pgmap->ops->page_free)) return; - __ClearPageWaiters(page); - mem_cgroup_uncharge(page_folio(page)); /* diff --git a/mm/migrate.c b/mm/migrate.c index 4f30ed37856f..de175e2fdba5 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -53,7 +53,6 @@ #include <asm/tlbflush.h> -#define CREATE_TRACE_POINTS #include <trace/events/migrate.h> #include "internal.h" @@ -247,7 +246,10 @@ static bool remove_migration_pte(struct folio *folio, set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } if (vma->vm_flags & VM_LOCKED) - mlock_page_drain(smp_processor_id()); + mlock_page_drain_local(); + + trace_remove_migration_pte(pvmw.address, pte_val(pte), + compound_order(new)); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, pvmw.address, pvmw.pte); diff --git a/mm/mlock.c b/mm/mlock.c index efd2dd2943de..716caf851043 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -28,7 +28,14 @@ #include "internal.h" -static DEFINE_PER_CPU(struct pagevec, mlock_pvec); +struct mlock_pvec { + local_lock_t lock; + struct pagevec vec; +}; + +static DEFINE_PER_CPU(struct mlock_pvec, mlock_pvec) = { + .lock = INIT_LOCAL_LOCK(lock), +}; bool can_do_mlock(void) { @@ -203,18 +210,30 @@ static void mlock_pagevec(struct pagevec *pvec) pagevec_reinit(pvec); } -void mlock_page_drain(int cpu) +void mlock_page_drain_local(void) { struct pagevec *pvec; - pvec = &per_cpu(mlock_pvec, cpu); + local_lock(&mlock_pvec.lock); + pvec = this_cpu_ptr(&mlock_pvec.vec); + if (pagevec_count(pvec)) + mlock_pagevec(pvec); + local_unlock(&mlock_pvec.lock); +} + +void mlock_page_drain_remote(int cpu) +{ + struct pagevec *pvec; + + WARN_ON_ONCE(cpu_online(cpu)); + pvec = 
&per_cpu(mlock_pvec.vec, cpu); if (pagevec_count(pvec)) mlock_pagevec(pvec); } bool need_mlock_page_drain(int cpu) { - return pagevec_count(&per_cpu(mlock_pvec, cpu)); + return pagevec_count(&per_cpu(mlock_pvec.vec, cpu)); } /** @@ -223,7 +242,10 @@ bool need_mlock_page_drain(int cpu) */ void mlock_folio(struct folio *folio) { - struct pagevec *pvec = &get_cpu_var(mlock_pvec); + struct pagevec *pvec; + + local_lock(&mlock_pvec.lock); + pvec = this_cpu_ptr(&mlock_pvec.vec); if (!folio_test_set_mlocked(folio)) { int nr_pages = folio_nr_pages(folio); @@ -236,7 +258,7 @@ void mlock_folio(struct folio *folio) if (!pagevec_add(pvec, mlock_lru(&folio->page)) || folio_test_large(folio) || lru_cache_disabled()) mlock_pagevec(pvec); - put_cpu_var(mlock_pvec); + local_unlock(&mlock_pvec.lock); } /** @@ -245,9 +267,11 @@ void mlock_folio(struct folio *folio) */ void mlock_new_page(struct page *page) { - struct pagevec *pvec = &get_cpu_var(mlock_pvec); + struct pagevec *pvec; int nr_pages = thp_nr_pages(page); + local_lock(&mlock_pvec.lock); + pvec = this_cpu_ptr(&mlock_pvec.vec); SetPageMlocked(page); mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); @@ -256,7 +280,7 @@ void mlock_new_page(struct page *page) if (!pagevec_add(pvec, mlock_new(page)) || PageHead(page) || lru_cache_disabled()) mlock_pagevec(pvec); - put_cpu_var(mlock_pvec); + local_unlock(&mlock_pvec.lock); } /** @@ -265,8 +289,10 @@ void mlock_new_page(struct page *page) */ void munlock_page(struct page *page) { - struct pagevec *pvec = &get_cpu_var(mlock_pvec); + struct pagevec *pvec; + local_lock(&mlock_pvec.lock); + pvec = this_cpu_ptr(&mlock_pvec.vec); /* * TestClearPageMlocked(page) must be left to __munlock_page(), * which will check whether the page is multiply mlocked. 
@@ -276,7 +302,7 @@ void munlock_page(struct page *page) if (!pagevec_add(pvec, page) || PageHead(page) || lru_cache_disabled()) mlock_pagevec(pvec); - put_cpu_var(mlock_pvec); + local_unlock(&mlock_pvec.lock); } static int mlock_pte_range(pmd_t *pmd, unsigned long addr, @@ -721,13 +747,12 @@ int user_shm_lock(size_t size, struct ucounts *ucounts) locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; lock_limit = rlimit(RLIMIT_MEMLOCK); - if (lock_limit == RLIM_INFINITY) - allowed = 1; - lock_limit >>= PAGE_SHIFT; + if (lock_limit != RLIM_INFINITY) + lock_limit >>= PAGE_SHIFT; spin_lock(&shmlock_user_lock); memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); - if (!allowed && (memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) { + if ((memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) { dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); goto out; } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 435c02630593..7e2da284e427 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2465,16 +2465,14 @@ static void folio_account_dirtied(struct folio *folio, * * Caller must hold lock_page_memcg(). 
*/ -void folio_account_cleaned(struct folio *folio, struct address_space *mapping, - struct bdi_writeback *wb) +void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) { - if (mapping_can_writeback(mapping)) { - long nr = folio_nr_pages(folio); - lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr); - zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); - wb_stat_mod(wb, WB_RECLAIMABLE, -nr); - task_io_account_cancelled_write(nr * PAGE_SIZE); - } + long nr = folio_nr_pages(folio); + + lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr); + zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); + wb_stat_mod(wb, WB_RECLAIMABLE, -nr); + task_io_account_cancelled_write(nr * PAGE_SIZE); } /* @@ -2683,7 +2681,7 @@ void __folio_cancel_dirty(struct folio *folio) wb = unlocked_inode_to_wb_begin(inode, &cookie); if (folio_test_clear_dirty(folio)) - folio_account_cleaned(folio, mapping, wb); + folio_account_cleaned(folio, wb); unlocked_inode_to_wb_end(inode, &cookie); folio_memcg_unlock(folio); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6e0b4596cde9..2db95780e003 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -378,25 +378,9 @@ int page_group_by_mobility_disabled __read_mostly; */ static DEFINE_STATIC_KEY_TRUE(deferred_pages); -/* - * Calling kasan_poison_pages() only after deferred memory initialization - * has completed. Poisoning pages during deferred memory init will greatly - * lengthen the process and cause problem in large memory systems as the - * deferred pages initialization is done with interrupt disabled. - * - * Assuming that there will be no reference to those newly initialized - * pages before they are ever allocated, this should have no effect on - * KASAN memory tracking as the poison will be properly inserted at page - * allocation time. The only corner case is when pages are allocated by - * on-demand allocation and then freed again before the deferred pages - * initialization is done, but this is not likely to happen. 
- */ -static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) +static inline bool deferred_pages_enabled(void) { - return static_branch_unlikely(&deferred_pages) || - (!IS_ENABLED(CONFIG_KASAN_GENERIC) && - (fpi_flags & FPI_SKIP_KASAN_POISON)) || - PageSkipKASanPoison(page); + return static_branch_unlikely(&deferred_pages); } /* Returns true if the struct page for the pfn is uninitialised */ @@ -447,11 +431,9 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } #else -static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) +static inline bool deferred_pages_enabled(void) { - return (!IS_ENABLED(CONFIG_KASAN_GENERIC) && - (fpi_flags & FPI_SKIP_KASAN_POISON)) || - PageSkipKASanPoison(page); + return false; } static inline bool early_page_uninitialised(unsigned long pfn) @@ -1126,6 +1108,9 @@ continue_merging: buddy_pfn = __find_buddy_pfn(pfn, order); buddy = page + (buddy_pfn - pfn); + + if (!page_is_buddy(page, buddy, order)) + goto done_merging; buddy_mt = get_pageblock_migratetype(buddy); if (migratetype != buddy_mt @@ -1267,15 +1252,38 @@ out: return ret; } -static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags) +/* + * Skip KASAN memory poisoning when either: + * + * 1. Deferred memory initialization has not yet completed, + * see the explanation below. + * 2. Skipping poisoning is requested via FPI_SKIP_KASAN_POISON, + * see the comment next to it. + * 3. Skipping poisoning is requested via __GFP_SKIP_KASAN_POISON, + * see the comment next to it. + * + * Poisoning pages during deferred memory init will greatly lengthen the + * process and cause problem in large memory systems as the deferred pages + * initialization is done with interrupt disabled. 
+ * + * Assuming that there will be no reference to those newly initialized + * pages before they are ever allocated, this should have no effect on + * KASAN memory tracking as the poison will be properly inserted at page + * allocation time. The only corner case is when pages are allocated by + * on-demand allocation and then freed again before the deferred pages + * initialization is done, but this is not likely to happen. + */ +static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) { - int i; + return deferred_pages_enabled() || + (!IS_ENABLED(CONFIG_KASAN_GENERIC) && + (fpi_flags & FPI_SKIP_KASAN_POISON)) || + PageSkipKASanPoison(page); +} - if (zero_tags) { - for (i = 0; i < numpages; i++) - tag_clear_highpage(page + i); - return; - } +static void kernel_init_free_pages(struct page *page, int numpages) +{ + int i; /* s390's use of memset() could override KASAN redzones. */ kasan_disable_current(); @@ -1292,7 +1300,7 @@ static __always_inline bool free_pages_prepare(struct page *page, unsigned int order, bool check_free, fpi_t fpi_flags) { int bad = 0; - bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags); + bool init = want_init_on_free(); VM_BUG_ON_PAGE(PageTail(page), page); @@ -1359,23 +1367,21 @@ static __always_inline bool free_pages_prepare(struct page *page, /* * As memory initialization might be integrated into KASAN, - * kasan_free_pages and kernel_init_free_pages must be + * KASAN poisoning and memory initialization code must be * kept together to avoid discrepancies in behavior. * * With hardware tag-based KASAN, memory tags must be set before the * page becomes unavailable via debug_pagealloc or arch_free_page. 
*/ - if (kasan_has_integrated_init()) { - if (!skip_kasan_poison) - kasan_free_pages(page, order); - } else { - bool init = want_init_on_free(); + if (!should_skip_kasan_poison(page, fpi_flags)) { + kasan_poison_pages(page, order, init); - if (init) - kernel_init_free_pages(page, 1 << order, false); - if (!skip_kasan_poison) - kasan_poison_pages(page, order, init); + /* Memory is already initialized if KASAN did it internally. */ + if (kasan_has_integrated_init()) + init = false; } + if (init) + kernel_init_free_pages(page, 1 << order); /* * arch_free_page() can make the page's contents inaccessible. s390 @@ -2340,9 +2346,43 @@ static inline bool check_new_pcp(struct page *page, unsigned int order) } #endif /* CONFIG_DEBUG_VM */ +static inline bool should_skip_kasan_unpoison(gfp_t flags, bool init_tags) +{ + /* Don't skip if a software KASAN mode is enabled. */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC) || + IS_ENABLED(CONFIG_KASAN_SW_TAGS)) + return false; + + /* Skip, if hardware tag-based KASAN is not enabled. */ + if (!kasan_hw_tags_enabled()) + return true; + + /* + * With hardware tag-based KASAN enabled, skip if either: + * + * 1. Memory tags have already been cleared via tag_clear_highpage(). + * 2. Skipping has been requested via __GFP_SKIP_KASAN_UNPOISON. + */ + return init_tags || (flags & __GFP_SKIP_KASAN_UNPOISON); +} + +static inline bool should_skip_init(gfp_t flags) +{ + /* Don't skip, if hardware tag-based KASAN is not enabled. */ + if (!kasan_hw_tags_enabled()) + return false; + + /* For hardware tag-based KASAN, skip if requested. 
*/ + return (flags & __GFP_SKIP_ZERO); +} + inline void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags) { + bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) && + !should_skip_init(gfp_flags); + bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS); + set_page_private(page, 0); set_page_refcounted(page); @@ -2358,19 +2398,38 @@ inline void post_alloc_hook(struct page *page, unsigned int order, /* * As memory initialization might be integrated into KASAN, - * kasan_alloc_pages and kernel_init_free_pages must be + * KASAN unpoisoning and memory initialization code must be * kept together to avoid discrepancies in behavior. */ - if (kasan_has_integrated_init()) { - kasan_alloc_pages(page, order, gfp_flags); - } else { - bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags); + /* + * If memory tags should be zeroed (which happens only when memory + * should be initialized as well). + */ + if (init_tags) { + int i; + + /* Initialize both memory and tags. */ + for (i = 0; i != 1 << order; ++i) + tag_clear_highpage(page + i); + + /* Note that memory is already initialized by the loop above. */ + init = false; + } + if (!should_skip_kasan_unpoison(gfp_flags, init_tags)) { + /* Unpoison shadow memory or set memory tags. */ kasan_unpoison_pages(page, order, init); - if (init) - kernel_init_free_pages(page, 1 << order, - gfp_flags & __GFP_ZEROTAGS); + + /* Note that memory is already initialized by KASAN. */ + if (kasan_has_integrated_init()) + init = false; } + /* If memory is still not initialized, do it now. */ + if (init) + kernel_init_free_pages(page, 1 << order); + /* Propagate __GFP_SKIP_KASAN_POISON to page flags.
*/ + if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON)) + SetPageSkipKASanPoison(page); set_page_owner(page, order, gfp_flags); page_table_check_alloc(page, order); @@ -8308,6 +8367,7 @@ static int page_alloc_cpu_dead(unsigned int cpu) struct zone *zone; lru_add_drain_cpu(cpu); + mlock_page_drain_remote(cpu); drain_pages(cpu); /* diff --git a/mm/page_owner.c b/mm/page_owner.c index 99e360df9465..fb3a05fdebdb 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -10,6 +10,7 @@ #include <linux/migrate.h> #include <linux/stackdepot.h> #include <linux/seq_file.h> +#include <linux/memcontrol.h> #include <linux/sched/clock.h> #include "internal.h" @@ -28,7 +29,9 @@ struct page_owner { depot_stack_handle_t free_handle; u64 ts_nsec; u64 free_ts_nsec; + char comm[TASK_COMM_LEN]; pid_t pid; + pid_t tgid; }; static bool page_owner_enabled = false; @@ -163,7 +166,10 @@ static inline void __set_page_owner_handle(struct page_ext *page_ext, page_owner->gfp_mask = gfp_mask; page_owner->last_migrate_reason = -1; page_owner->pid = current->pid; + page_owner->tgid = current->tgid; page_owner->ts_nsec = local_clock(); + strlcpy(page_owner->comm, current->comm, + sizeof(page_owner->comm)); __set_bit(PAGE_EXT_OWNER, &page_ext->flags); __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); @@ -229,8 +235,10 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old) old_page_owner->last_migrate_reason; new_page_owner->handle = old_page_owner->handle; new_page_owner->pid = old_page_owner->pid; + new_page_owner->tgid = old_page_owner->tgid; new_page_owner->ts_nsec = old_page_owner->ts_nsec; new_page_owner->free_ts_nsec = old_page_owner->ts_nsec; + strcpy(new_page_owner->comm, old_page_owner->comm); /* * We don't clear the bit on the old folio as it's going to be freed @@ -325,6 +333,45 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, seq_putc(m, '\n'); } +/* + * Looking for memcg information and print it out + */ +static inline int 
print_page_owner_memcg(char *kbuf, size_t count, int ret, + struct page *page) +{ +#ifdef CONFIG_MEMCG + unsigned long memcg_data; + struct mem_cgroup *memcg; + bool online; + char name[80]; + + rcu_read_lock(); + memcg_data = READ_ONCE(page->memcg_data); + if (!memcg_data) + goto out_unlock; + + if (memcg_data & MEMCG_DATA_OBJCGS) + ret += scnprintf(kbuf + ret, count - ret, + "Slab cache page\n"); + + memcg = page_memcg_check(page); + if (!memcg) + goto out_unlock; + + online = (memcg->css.flags & CSS_ONLINE); + cgroup_name(memcg->css.cgroup, name, sizeof(name)); + ret += scnprintf(kbuf + ret, count - ret, + "Charged %sto %smemcg %s\n", + PageMemcgKmem(page) ? "(via objcg) " : "", + online ? "" : "offline ", + name); +out_unlock: + rcu_read_unlock(); +#endif /* CONFIG_MEMCG */ + + return ret; +} + static ssize_t print_page_owner(char __user *buf, size_t count, unsigned long pfn, struct page *page, struct page_owner *page_owner, @@ -338,19 +385,17 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, if (!kbuf) return -ENOMEM; - ret = snprintf(kbuf, count, - "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns, free_ts %llu ns\n", + ret = scnprintf(kbuf, count, + "Page allocated via order %u, mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu ns, free_ts %llu ns\n", page_owner->order, page_owner->gfp_mask, &page_owner->gfp_mask, page_owner->pid, + page_owner->tgid, page_owner->comm, page_owner->ts_nsec, page_owner->free_ts_nsec); - if (ret >= count) - goto err; - /* Print information relevant to grouping pages by mobility */ pageblock_mt = get_pageblock_migratetype(page); page_mt = gfp_migratetype(page_owner->gfp_mask); - ret += snprintf(kbuf + ret, count - ret, + ret += scnprintf(kbuf + ret, count - ret, "PFN %lu type %s Block %lu type %s Flags %pGp\n", pfn, migratetype_names[page_mt], @@ -358,21 +403,18 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, migratetype_names[pageblock_mt], &page->flags); - if (ret >= count) 
- goto err; - ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0); if (ret >= count) goto err; if (page_owner->last_migrate_reason != -1) { - ret += snprintf(kbuf + ret, count - ret, + ret += scnprintf(kbuf + ret, count - ret, "Page has been migrated, last migrate reason: %s\n", migrate_reason_names[page_owner->last_migrate_reason]); - if (ret >= count) - goto err; } + ret = print_page_owner_memcg(kbuf, count, ret, page); + ret += snprintf(kbuf + ret, count - ret, "\n"); if (ret >= count) goto err; @@ -415,9 +457,10 @@ void __dump_page_owner(const struct page *page) else pr_alert("page_owner tracks the page as freed\n"); - pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu, free_ts %llu\n", + pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu, free_ts %llu\n", page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask, - page_owner->pid, page_owner->ts_nsec, page_owner->free_ts_nsec); + page_owner->pid, page_owner->tgid, page_owner->comm, + page_owner->ts_nsec, page_owner->free_ts_nsec); handle = READ_ONCE(page_owner->handle); if (!handle) diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c index c6bd092ff7a3..dd3590dfc23d 100644 --- a/mm/percpu-stats.c +++ b/mm/percpu-stats.c @@ -144,7 +144,7 @@ alloc_buffer: spin_unlock_irq(&pcpu_lock); /* there can be at most this many free and allocated fragments */ - buffer = vmalloc(array_size(sizeof(int), (2 * max_nr_alloc + 1))); + buffer = vmalloc_array(2 * max_nr_alloc + 1, sizeof(int)); if (!buffer) return -ENOMEM; diff --git a/mm/readahead.c b/mm/readahead.c index d3a47546d17d..8e3775829513 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -13,29 +13,29 @@ * * Readahead is used to read content into the page cache before it is * explicitly requested by the application. Readahead only ever - * attempts to read pages that are not yet in the page cache. 
If a - * page is present but not up-to-date, readahead will not try to read + * attempts to read folios that are not yet in the page cache. If a + * folio is present but not up-to-date, readahead will not try to read * it. In that case a simple ->readpage() will be requested. * * Readahead is triggered when an application read request (whether a - * systemcall or a page fault) finds that the requested page is not in + * system call or a page fault) finds that the requested folio is not in * the page cache, or that it is in the page cache and has the - * %PG_readahead flag set. This flag indicates that the page was loaded - * as part of a previous read-ahead request and now that it has been - * accessed, it is time for the next read-ahead. + * readahead flag set. This flag indicates that the folio was read + * as part of a previous readahead request and now that it has been + * accessed, it is time for the next readahead. * * Each readahead request is partly synchronous read, and partly async - * read-ahead. This is reflected in the struct file_ra_state which - * contains ->size being to total number of pages, and ->async_size - * which is the number of pages in the async section. The first page in - * this async section will have %PG_readahead set as a trigger for a - * subsequent read ahead. Once a series of sequential reads has been + * readahead. This is reflected in the struct file_ra_state which + * contains ->size being the total number of pages, and ->async_size + * which is the number of pages in the async section. The readahead + * flag will be set on the first folio in this async section to trigger + * a subsequent readahead. Once a series of sequential reads has been * established, there should be no need for a synchronous component and - * all read ahead request will be fully asynchronous. + * all readahead requests will be fully asynchronous.
* - * When either of the triggers causes a readahead, three numbers need to - * be determined: the start of the region, the size of the region, and - * the size of the async tail. + * When either of the triggers causes a readahead, three numbers need + * to be determined: the start of the region to read, the size of the + * region, and the size of the async tail. * * The start of the region is simply the first page address at or after * the accessed address, which is not currently populated in the page @@ -45,14 +45,14 @@ * was explicitly requested from the determined request size, unless * this would be less than zero - then zero is used. NOTE THIS * CALCULATION IS WRONG WHEN THE START OF THE REGION IS NOT THE ACCESSED - * PAGE. + * PAGE. ALSO THIS CALCULATION IS NOT USED CONSISTENTLY. * * The size of the region is normally determined from the size of the * previous readahead which loaded the preceding pages. This may be * discovered from the struct file_ra_state for simple sequential reads, * or from examining the state of the page cache when multiple * sequential reads are interleaved. Specifically: where the readahead - * was triggered by the %PG_readahead flag, the size of the previous + * was triggered by the readahead flag, the size of the previous * readahead is assumed to be the number of pages from the triggering * page to the start of the new readahead. In these cases, the size of * the previous readahead is scaled, often doubled, for the new @@ -65,52 +65,52 @@ * larger than the current request, and it is not scaled up, unless it * is at the start of file. * - * In general read ahead is accelerated at the start of the file, as + * In general readahead is accelerated at the start of the file, as * reads from there are often sequential. There are other minor - * adjustments to the read ahead size in various special cases and these + * adjustments to the readahead size in various special cases and these * are best discovered by reading the code. 
* - * The above calculation determines the readahead, to which any requested - * read size may be added. + * The above calculation, based on the previous readahead size, + * determines the size of the readahead, to which any requested read + * size may be added. * * Readahead requests are sent to the filesystem using the ->readahead() * address space operation, for which mpage_readahead() is a canonical * implementation. ->readahead() should normally initiate reads on all - * pages, but may fail to read any or all pages without causing an IO + * folios, but may fail to read any or all folios without causing an I/O * error. The page cache reading code will issue a ->readpage() request - * for any page which ->readahead() does not provided, and only an error + * for any folio which ->readahead() did not read, and only an error * from this will be final. * - * ->readahead() will generally call readahead_page() repeatedly to get - * each page from those prepared for read ahead. It may fail to read a - * page by: + * ->readahead() will generally call readahead_folio() repeatedly to get + * each folio from those prepared for readahead. It may fail to read a + * folio by: * - * * not calling readahead_page() sufficiently many times, effectively - * ignoring some pages, as might be appropriate if the path to + * * not calling readahead_folio() sufficiently many times, effectively + * ignoring some folios, as might be appropriate if the path to * storage is congested. * - * * failing to actually submit a read request for a given page, + * * failing to actually submit a read request for a given folio, * possibly due to insufficient resources, or * * * getting an error during subsequent processing of a request. * - * In the last two cases, the page should be unlocked to indicate that - * the read attempt has failed. In the first case the page will be - * unlocked by the caller. 
+ * In the last two cases, the folio should be unlocked by the filesystem + * to indicate that the read attempt has failed. In the first case the + * folio will be unlocked by the VFS. * - * Those pages not in the final ``async_size`` of the request should be + * Those folios not in the final ``async_size`` of the request should be * considered to be important and ->readahead() should not fail them due * to congestion or temporary resource unavailability, but should wait * for necessary resources (e.g. memory or indexing information) to - * become available. Pages in the final ``async_size`` may be + * become available. Folios in the final ``async_size`` may be * considered less urgent and failure to read them is more acceptable. - * In this case it is best to use delete_from_page_cache() to remove the - * pages from the page cache as is automatically done for pages that - * were not fetched with readahead_page(). This will allow a - * subsequent synchronous read ahead request to try them again. If they + * In this case it is best to use filemap_remove_folio() to remove the + * folios from the page cache as is automatically done for folios that + * were not fetched with readahead_folio(). This will allow a + * subsequent synchronous readahead request to try them again. If they * are left in the page cache, then they will be read individually using - * ->readpage(). - * + * ->readpage() which may be less efficient. 
*/ #include <linux/kernel.h> @@ -142,91 +142,14 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) } EXPORT_SYMBOL_GPL(file_ra_state_init); -/* - * see if a page needs releasing upon read_cache_pages() failure - * - the caller of read_cache_pages() may have set PG_private or PG_fscache - * before calling, such as the NFS fs marking pages that are cached locally - * on disk, thus we need to give the fs a chance to clean up in the event of - * an error - */ -static void read_cache_pages_invalidate_page(struct address_space *mapping, - struct page *page) -{ - if (page_has_private(page)) { - if (!trylock_page(page)) - BUG(); - page->mapping = mapping; - folio_invalidate(page_folio(page), 0, PAGE_SIZE); - page->mapping = NULL; - unlock_page(page); - } - put_page(page); -} - -/* - * release a list of pages, invalidating them first if need be - */ -static void read_cache_pages_invalidate_pages(struct address_space *mapping, - struct list_head *pages) -{ - struct page *victim; - - while (!list_empty(pages)) { - victim = lru_to_page(pages); - list_del(&victim->lru); - read_cache_pages_invalidate_page(mapping, victim); - } -} - -/** - * read_cache_pages - populate an address space with some pages & start reads against them - * @mapping: the address_space - * @pages: The address of a list_head which contains the target pages. These - * pages have their ->index populated and are otherwise uninitialised. - * @filler: callback routine for filling a single page. - * @data: private data for the callback routine. - * - * Hides the details of the LRU cache etc from the filesystems. 
- * - * Returns: %0 on success, error return by @filler otherwise - */ -int read_cache_pages(struct address_space *mapping, struct list_head *pages, - int (*filler)(void *, struct page *), void *data) -{ - struct page *page; - int ret = 0; - - while (!list_empty(pages)) { - page = lru_to_page(pages); - list_del(&page->lru); - if (add_to_page_cache_lru(page, mapping, page->index, - readahead_gfp_mask(mapping))) { - read_cache_pages_invalidate_page(mapping, page); - continue; - } - put_page(page); - - ret = filler(data, page); - if (unlikely(ret)) { - read_cache_pages_invalidate_pages(mapping, pages); - break; - } - task_io_account_read(PAGE_SIZE); - } - return ret; -} - -EXPORT_SYMBOL(read_cache_pages); - -static void read_pages(struct readahead_control *rac, struct list_head *pages, - bool skip_page) +static void read_pages(struct readahead_control *rac) { const struct address_space_operations *aops = rac->mapping->a_ops; struct page *page; struct blk_plug plug; if (!readahead_count(rac)) - goto out; + return; blk_start_plug(&plug); @@ -234,7 +157,7 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages, aops->readahead(rac); /* * Clean up the remaining pages. The sizes in ->ra - * maybe be used to size next read-ahead, so make sure + * may be used to size the next readahead, so make sure * they accurately reflect what happened. 
*/ while ((page = readahead_page(rac))) { @@ -246,13 +169,6 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages, unlock_page(page); put_page(page); } - } else if (aops->readpages) { - aops->readpages(rac->file, rac->mapping, pages, - readahead_count(rac)); - /* Clean up the remaining pages */ - put_pages_list(pages); - rac->_index += rac->_nr_pages; - rac->_nr_pages = 0; } else { while ((page = readahead_page(rac))) { aops->readpage(rac->file, page); @@ -262,12 +178,7 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages, blk_finish_plug(&plug); - BUG_ON(pages && !list_empty(pages)); BUG_ON(readahead_count(rac)); - -out: - if (skip_page) - rac->_index++; } /** @@ -289,7 +200,6 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, { struct address_space *mapping = ractl->mapping; unsigned long index = readahead_index(ractl); - LIST_HEAD(page_pool); gfp_t gfp_mask = readahead_gfp_mask(mapping); unsigned long i; @@ -321,7 +231,8 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, * have a stable reference to this page, and it's * not worth getting one just for that. 
*/ - read_pages(ractl, &page_pool, true); + read_pages(ractl); + ractl->_index++; i = ractl->_index + ractl->_nr_pages - index - 1; continue; } @@ -329,13 +240,11 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, folio = filemap_alloc_folio(gfp_mask, 0); if (!folio) break; - if (mapping->a_ops->readpages) { - folio->index = index + i; - list_add(&folio->lru, &page_pool); - } else if (filemap_add_folio(mapping, folio, index + i, + if (filemap_add_folio(mapping, folio, index + i, gfp_mask) < 0) { folio_put(folio); - read_pages(ractl, &page_pool, true); + read_pages(ractl); + ractl->_index++; i = ractl->_index + ractl->_nr_pages - index - 1; continue; } @@ -349,7 +258,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, * uptodate then the caller will launch readpage again, and * will then handle the error. */ - read_pages(ractl, &page_pool, false); + read_pages(ractl); filemap_invalidate_unlock_shared(mapping); memalloc_nofs_restore(nofs); } @@ -394,8 +303,7 @@ void force_page_cache_ra(struct readahead_control *ractl, struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long max_pages, index; - if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages && - !mapping->a_ops->readahead)) + if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readahead)) return; /* @@ -512,7 +420,7 @@ static pgoff_t count_history_pages(struct address_space *mapping, } /* - * page cache context based read-ahead + * page cache context based readahead */ static int try_context_readahead(struct address_space *mapping, struct file_ra_state *ra, @@ -624,7 +532,7 @@ void page_cache_ra_order(struct readahead_control *ractl, ra->async_size += index - limit - 1; } - read_pages(ractl, NULL, false); + read_pages(ractl); /* * If there were already pages in the page cache, then we may have @@ -763,9 +671,9 @@ void page_cache_sync_ra(struct readahead_control *ractl, bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM); /* 
- * Even if read-ahead is disabled, issue this request as read-ahead + * Even if readahead is disabled, issue this request as readahead * as we'll need it to satisfy the requested range. The forced - * read-ahead will do the right thing and limit the read to just the + * readahead will do the right thing and limit the read to just the * requested range, which we'll set to 1 page for this case. */ if (!ractl->ra->ra_pages || blk_cgroup_congested()) { @@ -781,7 +689,6 @@ void page_cache_sync_ra(struct readahead_control *ractl, return; } - /* do read-ahead */ ondemand_readahead(ractl, NULL, req_count); } EXPORT_SYMBOL_GPL(page_cache_sync_ra); @@ -789,7 +696,7 @@ EXPORT_SYMBOL_GPL(page_cache_sync_ra); void page_cache_async_ra(struct readahead_control *ractl, struct folio *folio, unsigned long req_count) { - /* no read-ahead */ + /* no readahead */ if (!ractl->ra->ra_pages) return; @@ -804,7 +711,6 @@ void page_cache_async_ra(struct readahead_control *ractl, if (blk_cgroup_congested()) return; - /* do read-ahead */ ondemand_readahead(ractl, folio, req_count); } EXPORT_SYMBOL_GPL(page_cache_async_ra); diff --git a/mm/rmap.c b/mm/rmap.c index 615b5d323ee2..fedb82371efe 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -76,7 +76,9 @@ #include <asm/tlbflush.h> +#define CREATE_TRACE_POINTS #include <trace/events/tlb.h> +#include <trace/events/migrate.h> #include "internal.h" @@ -1236,14 +1238,14 @@ void page_add_new_anon_rmap(struct page *page, void page_add_file_rmap(struct page *page, struct vm_area_struct *vma, bool compound) { - int i, nr = 1; + int i, nr = 0; VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); lock_page_memcg(page); if (compound && PageTransHuge(page)) { int nr_pages = thp_nr_pages(page); - for (i = 0, nr = 0; i < nr_pages; i++) { + for (i = 0; i < nr_pages; i++) { if (atomic_inc_and_test(&page[i]._mapcount)) nr++; } @@ -1271,11 +1273,12 @@ void page_add_file_rmap(struct page *page, VM_WARN_ON_ONCE(!PageLocked(page)); SetPageDoubleMap(compound_head(page)); 
} - if (!atomic_inc_and_test(&page->_mapcount)) - goto out; + if (atomic_inc_and_test(&page->_mapcount)) + nr++; } - __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr); out: + if (nr) + __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr); unlock_page_memcg(page); mlock_vma_page(page, vma, compound); @@ -1283,7 +1286,7 @@ out: static void page_remove_file_rmap(struct page *page, bool compound) { - int i, nr = 1; + int i, nr = 0; VM_BUG_ON_PAGE(compound && !PageHead(page), page); @@ -1298,12 +1301,12 @@ static void page_remove_file_rmap(struct page *page, bool compound) if (compound && PageTransHuge(page)) { int nr_pages = thp_nr_pages(page); - for (i = 0, nr = 0; i < nr_pages; i++) { + for (i = 0; i < nr_pages; i++) { if (atomic_add_negative(-1, &page[i]._mapcount)) nr++; } if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) - return; + goto out; if (PageSwapBacked(page)) __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED, -nr_pages); @@ -1311,16 +1314,12 @@ static void page_remove_file_rmap(struct page *page, bool compound) __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED, -nr_pages); } else { - if (!atomic_add_negative(-1, &page->_mapcount)) - return; + if (atomic_add_negative(-1, &page->_mapcount)) + nr++; } - - /* - * We use the irq-unsafe __{inc|mod}_lruvec_page_state because - * these counters are not modified in interrupt context, and - * pte lock(a spinlock) is held, which implies preemption disabled. 
- */ - __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr); +out: + if (nr) + __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr); } static void page_remove_anon_compound_rmap(struct page *page) @@ -1589,7 +1588,30 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, /* MADV_FREE page check */ if (!folio_test_swapbacked(folio)) { - if (!folio_test_dirty(folio)) { + int ref_count, map_count; + + /* + * Synchronize with gup_pte_range(): + * - clear PTE; barrier; read refcount + * - inc refcount; barrier; read PTE + */ + smp_mb(); + + ref_count = folio_ref_count(folio); + map_count = folio_mapcount(folio); + + /* + * Order reads for page refcount and dirty flag + * (see comments in __remove_mapping()). + */ + smp_rmb(); + + /* + * The only page refs must be one from isolation + * plus the rmap(s) (dropped by discard:). + */ + if (ref_count == 1 + map_count && + !folio_test_dirty(folio)) { /* Invalidate as we cleared the pte */ mmu_notifier_invalidate_range(mm, address, address + PAGE_SIZE); @@ -1661,7 +1683,7 @@ discard: */ page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); if (vma->vm_flags & VM_LOCKED) - mlock_page_drain(smp_processor_id()); + mlock_page_drain_local(); folio_put(folio); } @@ -1852,6 +1874,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, if (pte_swp_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); + trace_set_migration_pte(pvmw.address, pte_val(swp_pte), + compound_order(&folio->page)); /* * No need to invalidate here it will synchronize on * against the special swap migration pte. 
@@ -1920,6 +1944,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, if (pte_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, address, pvmw.pte, swp_pte); + trace_set_migration_pte(address, pte_val(swp_pte), + compound_order(&folio->page)); /* * No need to invalidate here it will synchronize on * against the special swap migration pte. @@ -1935,7 +1961,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, */ page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); if (vma->vm_flags & VM_LOCKED) - mlock_page_drain(smp_processor_id()); + mlock_page_drain_local(); folio_put(folio); } diff --git a/mm/slab.c b/mm/slab.c index d9dec7a8fd79..b04e40078bdf 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3422,6 +3422,7 @@ static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp, if (is_kfence_address(objp)) { kmemleak_free_recursive(objp, cachep->flags); + memcg_slab_free_hook(cachep, &objp, 1); __kfence_free(objp); return; } diff --git a/mm/slab_common.c b/mm/slab_common.c index 23f2ab0713b7..6ee64d6208b3 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -807,7 +807,7 @@ void __init setup_kmalloc_cache_index_table(void) unsigned int i; BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || - (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); + !is_power_of_2(KMALLOC_MIN_SIZE)); for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) { unsigned int elem = size_index_elem(i); diff --git a/mm/slob.c b/mm/slob.c index 8a8795520361..dfa6808dff36 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -714,7 +714,7 @@ int __kmem_cache_shrink(struct kmem_cache *d) return 0; } -struct kmem_cache kmem_cache_boot = { +static struct kmem_cache kmem_cache_boot = { .name = "kmem_cache", .size = sizeof(struct kmem_cache), .flags = SLAB_PANIC, diff --git a/mm/slub.c b/mm/slub.c index 07cdd999c3fe..74d92aa4a3a2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1788,8 +1788,8 @@ static void *setup_object(struct kmem_cache *s, struct slab 
*slab, /* * Slab allocation and freeing */ -static inline struct slab *alloc_slab_page(struct kmem_cache *s, - gfp_t flags, int node, struct kmem_cache_order_objects oo) +static inline struct slab *alloc_slab_page(gfp_t flags, int node, + struct kmem_cache_order_objects oo) { struct folio *folio; struct slab *slab; @@ -1941,7 +1941,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL); - slab = alloc_slab_page(s, alloc_gfp, node, oo); + slab = alloc_slab_page(alloc_gfp, node, oo); if (unlikely(!slab)) { oo = s->min; alloc_gfp = flags; @@ -1949,7 +1949,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * Allocation may have failed due to fragmentation. * Try a lower order alloc if possible */ - slab = alloc_slab_page(s, alloc_gfp, node, oo); + slab = alloc_slab_page(alloc_gfp, node, oo); if (unlikely(!slab)) goto out; stat(s, ORDER_FALLBACK); @@ -2348,10 +2348,10 @@ static void init_kmem_cache_cpus(struct kmem_cache *s) static void deactivate_slab(struct kmem_cache *s, struct slab *slab, void *freelist) { - enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; + enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE, M_FULL_NOLIST }; struct kmem_cache_node *n = get_node(s, slab_nid(slab)); - int lock = 0, free_delta = 0; - enum slab_modes l = M_NONE, m = M_NONE; + int free_delta = 0; + enum slab_modes mode = M_NONE; void *nextfree, *freelist_iter, *freelist_tail; int tail = DEACTIVATE_TO_HEAD; unsigned long flags = 0; @@ -2393,14 +2393,10 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab, * Ensure that the slab is unfrozen while the list presence * reflects the actual number of objects during unfreeze. * - * We setup the list membership and then perform a cmpxchg - * with the count. 
If there is a mismatch then the slab - * is not unfrozen but the slab is on the wrong list. - * - * Then we restart the process which may have to remove - * the slab from the list that we just put it on again - * because the number of objects in the slab may have - * changed. + * We first perform cmpxchg holding lock and insert to list + * when it succeed. If there is mismatch then the slab is not + * unfrozen and number of objects in the slab may have changed. + * Then release lock and retry cmpxchg again. */ redo: @@ -2419,61 +2415,52 @@ redo: new.frozen = 0; - if (!new.inuse && n->nr_partial >= s->min_partial) - m = M_FREE; - else if (new.freelist) { - m = M_PARTIAL; - if (!lock) { - lock = 1; - /* - * Taking the spinlock removes the possibility that - * acquire_slab() will see a slab that is frozen - */ - spin_lock_irqsave(&n->list_lock, flags); - } + if (!new.inuse && n->nr_partial >= s->min_partial) { + mode = M_FREE; + } else if (new.freelist) { + mode = M_PARTIAL; + /* + * Taking the spinlock removes the possibility that + * acquire_slab() will see a slab that is frozen + */ + spin_lock_irqsave(&n->list_lock, flags); + } else if (kmem_cache_debug_flags(s, SLAB_STORE_USER)) { + mode = M_FULL; + /* + * This also ensures that the scanning of full + * slabs from diagnostic functions will not see + * any frozen slabs. + */ + spin_lock_irqsave(&n->list_lock, flags); } else { - m = M_FULL; - if (kmem_cache_debug_flags(s, SLAB_STORE_USER) && !lock) { - lock = 1; - /* - * This also ensures that the scanning of full - * slabs from diagnostic functions will not see - * any frozen slabs. 
- */ - spin_lock_irqsave(&n->list_lock, flags); - } + mode = M_FULL_NOLIST; } - if (l != m) { - if (l == M_PARTIAL) - remove_partial(n, slab); - else if (l == M_FULL) - remove_full(s, n, slab); - - if (m == M_PARTIAL) - add_partial(n, slab, tail); - else if (m == M_FULL) - add_full(s, n, slab); - } - l = m; if (!cmpxchg_double_slab(s, slab, old.freelist, old.counters, new.freelist, new.counters, - "unfreezing slab")) + "unfreezing slab")) { + if (mode == M_PARTIAL || mode == M_FULL) + spin_unlock_irqrestore(&n->list_lock, flags); goto redo; + } - if (lock) - spin_unlock_irqrestore(&n->list_lock, flags); - if (m == M_PARTIAL) + if (mode == M_PARTIAL) { + add_partial(n, slab, tail); + spin_unlock_irqrestore(&n->list_lock, flags); stat(s, tail); - else if (m == M_FULL) - stat(s, DEACTIVATE_FULL); - else if (m == M_FREE) { + } else if (mode == M_FREE) { stat(s, DEACTIVATE_EMPTY); discard_slab(s, slab); stat(s, FREE_SLAB); + } else if (mode == M_FULL) { + add_full(s, n, slab); + spin_unlock_irqrestore(&n->list_lock, flags); + stat(s, DEACTIVATE_FULL); + } else if (mode == M_FULL_NOLIST) { + stat(s, DEACTIVATE_FULL); } } @@ -4014,15 +4001,6 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) return 1; } -static void set_min_partial(struct kmem_cache *s, unsigned long min) -{ - if (min < MIN_PARTIAL) - min = MIN_PARTIAL; - else if (min > MAX_PARTIAL) - min = MAX_PARTIAL; - s->min_partial = min; -} - static void set_cpu_partial(struct kmem_cache *s) { #ifdef CONFIG_SLUB_CPU_PARTIAL @@ -4060,7 +4038,7 @@ static void set_cpu_partial(struct kmem_cache *s) * calculate_sizes() determines the order and the distribution of data within * a slab object. 
*/ -static int calculate_sizes(struct kmem_cache *s, int forced_order) +static int calculate_sizes(struct kmem_cache *s) { slab_flags_t flags = s->flags; unsigned int size = s->object_size; @@ -4164,10 +4142,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) size = ALIGN(size, s->align); s->size = size; s->reciprocal_size = reciprocal_value(size); - if (forced_order >= 0) - order = forced_order; - else - order = calculate_order(size); + order = calculate_order(size); if ((int)order < 0) return 0; @@ -4203,7 +4178,7 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) s->random = get_random_long(); #endif - if (!calculate_sizes(s, -1)) + if (!calculate_sizes(s)) goto error; if (disable_higher_order_debug) { /* @@ -4213,7 +4188,7 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) if (get_order(s->size) > get_order(s->object_size)) { s->flags &= ~DEBUG_METADATA_FLAGS; s->offset = 0; - if (!calculate_sizes(s, -1)) + if (!calculate_sizes(s)) goto error; } } @@ -4229,7 +4204,8 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) * The larger the object size is, the more slabs we want on the partial * list to avoid pounding the page allocator excessively. 
*/ - set_min_partial(s, ilog2(s->size) / 2); + s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2); + s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial); set_cpu_partial(s); @@ -5358,12 +5334,10 @@ struct slab_attribute { }; #define SLAB_ATTR_RO(_name) \ - static struct slab_attribute _name##_attr = \ - __ATTR(_name, 0400, _name##_show, NULL) + static struct slab_attribute _name##_attr = __ATTR_RO_MODE(_name, 0400) #define SLAB_ATTR(_name) \ - static struct slab_attribute _name##_attr = \ - __ATTR(_name, 0600, _name##_show, _name##_store) + static struct slab_attribute _name##_attr = __ATTR_RW_MODE(_name, 0600) static ssize_t slab_size_show(struct kmem_cache *s, char *buf) { @@ -5410,7 +5384,7 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, if (err) return err; - set_min_partial(s, min); + s->min_partial = min; return length; } SLAB_ATTR(min_partial); diff --git a/mm/swap.c b/mm/swap.c index 5b30045207e1..7e320ec08c6a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -97,7 +97,6 @@ static void __page_cache_release(struct page *page) mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages); } - __ClearPageWaiters(page); } static void __put_single_page(struct page *page) @@ -152,7 +151,6 @@ void put_pages_list(struct list_head *pages) continue; } /* Cannot be PageLRU because it's passed to us using the lru */ - __ClearPageWaiters(page); } free_unref_page_list(pages); @@ -626,7 +624,6 @@ void lru_add_drain_cpu(int cpu) pagevec_lru_move_fn(pvec, lru_lazyfree_fn); activate_page_drain(cpu); - mlock_page_drain(cpu); } /** @@ -708,6 +705,7 @@ void lru_add_drain(void) local_lock(&lru_pvecs.lock); lru_add_drain_cpu(smp_processor_id()); local_unlock(&lru_pvecs.lock); + mlock_page_drain_local(); } /* @@ -722,6 +720,7 @@ static void lru_add_and_bh_lrus_drain(void) lru_add_drain_cpu(smp_processor_id()); local_unlock(&lru_pvecs.lock); invalidate_bh_lrus_cpu(); + 
mlock_page_drain_local(); } void lru_add_drain_cpu_zone(struct zone *zone) @@ -730,6 +729,7 @@ void lru_add_drain_cpu_zone(struct zone *zone) lru_add_drain_cpu(smp_processor_id()); drain_local_pages(zone); local_unlock(&lru_pvecs.lock); + mlock_page_drain_local(); } #ifdef CONFIG_SMP @@ -971,8 +971,6 @@ void release_pages(struct page **pages, int nr) count_vm_event(UNEVICTABLE_PGCLEARED); } - __ClearPageWaiters(page); - list_add(&page->lru, &pages_to_free); } if (lruvec) diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index 7f34343c075a..5a9442979a18 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -167,14 +167,12 @@ unsigned short lookup_swap_cgroup_id(swp_entry_t ent) int swap_cgroup_swapon(int type, unsigned long max_pages) { void *array; - unsigned long array_size; unsigned long length; struct swap_cgroup_ctrl *ctrl; length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); - array_size = length * sizeof(void *); - array = vzalloc(array_size); + array = vcalloc(length, sizeof(void *)); if (!array) goto nomem; diff --git a/mm/swapfile.c b/mm/swapfile.c index 33c7abb16610..63c61f8b2611 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1167,16 +1167,6 @@ out: return NULL; } -static struct swap_info_struct *swap_info_get(swp_entry_t entry) -{ - struct swap_info_struct *p; - - p = _swap_info_get(entry); - if (p) - spin_lock(&p->lock); - return p; -} - static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, struct swap_info_struct *q) { @@ -1601,100 +1591,6 @@ static bool page_swapped(struct page *page) return false; } -static int page_trans_huge_map_swapcount(struct page *page, - int *total_swapcount) -{ - int i, map_swapcount, _total_swapcount; - unsigned long offset = 0; - struct swap_info_struct *si; - struct swap_cluster_info *ci = NULL; - unsigned char *map = NULL; - int swapcount = 0; - - /* hugetlbfs shouldn't call it */ - VM_BUG_ON_PAGE(PageHuge(page), page); - - if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) { - if 
(PageSwapCache(page)) - swapcount = page_swapcount(page); - if (total_swapcount) - *total_swapcount = swapcount; - return swapcount + page_trans_huge_mapcount(page); - } - - page = compound_head(page); - - _total_swapcount = map_swapcount = 0; - if (PageSwapCache(page)) { - swp_entry_t entry; - - entry.val = page_private(page); - si = _swap_info_get(entry); - if (si) { - map = si->swap_map; - offset = swp_offset(entry); - } - } - if (map) - ci = lock_cluster(si, offset); - for (i = 0; i < HPAGE_PMD_NR; i++) { - int mapcount = atomic_read(&page[i]._mapcount) + 1; - if (map) { - swapcount = swap_count(map[offset + i]); - _total_swapcount += swapcount; - } - map_swapcount = max(map_swapcount, mapcount + swapcount); - } - unlock_cluster(ci); - - if (PageDoubleMap(page)) - map_swapcount -= 1; - - if (total_swapcount) - *total_swapcount = _total_swapcount; - - return map_swapcount + compound_mapcount(page); -} - -/* - * We can write to an anon page without COW if there are no other references - * to it. And as a side-effect, free up its swap: because the old content - * on disk will never be read, and seeking back there to write new content - * later would only waste time away from clustering. 
- */ -bool reuse_swap_page(struct page *page) -{ - int count, total_swapcount; - - VM_BUG_ON_PAGE(!PageLocked(page), page); - if (unlikely(PageKsm(page))) - return false; - count = page_trans_huge_map_swapcount(page, &total_swapcount); - if (count == 1 && PageSwapCache(page) && - (likely(!PageTransCompound(page)) || - /* The remaining swap count will be freed soon */ - total_swapcount == page_swapcount(page))) { - if (!PageWriteback(page)) { - page = compound_head(page); - delete_from_swap_cache(page); - SetPageDirty(page); - } else { - swp_entry_t entry; - struct swap_info_struct *p; - - entry.val = page_private(page); - p = swap_info_get(entry); - if (p->flags & SWP_STABLE_WRITES) { - spin_unlock(&p->lock); - return false; - } - spin_unlock(&p->lock); - } - } - - return count <= 1; -} - /* * If swap is getting full, or if there are no more mappings of this page, * then try_to_free_swap is called to free its swap space. diff --git a/mm/util.c b/mm/util.c index 1e2728736398..54e5e761a9a9 100644 --- a/mm/util.c +++ b/mm/util.c @@ -649,6 +649,56 @@ void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) } EXPORT_SYMBOL(kvrealloc); +/** + * __vmalloc_array - allocate memory for a virtually contiguous array. + * @n: number of elements. + * @size: element size. + * @flags: the type of memory to allocate (see kmalloc). + */ +void *__vmalloc_array(size_t n, size_t size, gfp_t flags) +{ + size_t bytes; + + if (unlikely(check_mul_overflow(n, size, &bytes))) + return NULL; + return __vmalloc(bytes, flags); +} +EXPORT_SYMBOL(__vmalloc_array); + +/** + * vmalloc_array - allocate memory for a virtually contiguous array. + * @n: number of elements. + * @size: element size. + */ +void *vmalloc_array(size_t n, size_t size) +{ + return __vmalloc_array(n, size, GFP_KERNEL); +} +EXPORT_SYMBOL(vmalloc_array); + +/** + * __vcalloc - allocate and zero memory for a virtually contiguous array. + * @n: number of elements. + * @size: element size. 
+ * @flags: the type of memory to allocate (see kmalloc). + */ +void *__vcalloc(size_t n, size_t size, gfp_t flags) +{ + return __vmalloc_array(n, size, flags | __GFP_ZERO); +} +EXPORT_SYMBOL(__vcalloc); + +/** + * vcalloc - allocate and zero memory for a virtually contiguous array. + * @n: number of elements. + * @size: element size. + */ +void *vcalloc(size_t n, size_t size) +{ + return __vmalloc_array(n, size, GFP_KERNEL | __GFP_ZERO); +} +EXPORT_SYMBOL(vcalloc); + /* Neutral page->mapping pointer to address_space or anon_vma or other */ void *page_rmapping(struct page *page) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 99e0f3e8d1a5..e163372d3967 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -74,7 +74,7 @@ static const bool vmap_allow_huge = false; bool is_vmalloc_addr(const void *x) { - unsigned long addr = (unsigned long)x; + unsigned long addr = (unsigned long)kasan_reset_tag(x); return addr >= VMALLOC_START && addr < VMALLOC_END; } @@ -631,7 +631,7 @@ int is_vmalloc_or_module_addr(const void *x) * just put it in the vmalloc space. 
*/ #if defined(CONFIG_MODULES) && defined(MODULES_VADDR) - unsigned long addr = (unsigned long)x; + unsigned long addr = (unsigned long)kasan_reset_tag(x); if (addr >= MODULES_VADDR && addr < MODULES_END) return 1; #endif @@ -795,6 +795,8 @@ static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr) struct vmap_area *va = NULL; struct rb_node *n = vmap_area_root.rb_node; + addr = (unsigned long)kasan_reset_tag((void *)addr); + while (n) { struct vmap_area *tmp; @@ -816,6 +818,8 @@ static struct vmap_area *__find_vmap_area(unsigned long addr) { struct rb_node *n = vmap_area_root.rb_node; + addr = (unsigned long)kasan_reset_tag((void *)addr); + while (n) { struct vmap_area *va; @@ -2166,7 +2170,7 @@ EXPORT_SYMBOL_GPL(vm_unmap_aliases); void vm_unmap_ram(const void *mem, unsigned int count) { unsigned long size = (unsigned long)count << PAGE_SHIFT; - unsigned long addr = (unsigned long)mem; + unsigned long addr = (unsigned long)kasan_reset_tag(mem); struct vmap_area *va; might_sleep(); @@ -2227,14 +2231,19 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node) mem = (void *)addr; } - kasan_unpoison_vmalloc(mem, size); - if (vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, PAGE_SHIFT) < 0) { vm_unmap_ram(mem, count); return NULL; } + /* + * Mark the pages as accessible, now that they are mapped. + * With hardware tag-based KASAN, marking is skipped for + * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). + */ + mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL); + return mem; } EXPORT_SYMBOL(vm_map_ram); @@ -2460,10 +2469,20 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, return NULL; } - kasan_unpoison_vmalloc((void *)va->va_start, requested_size); - setup_vmalloc_vm(area, va, flags, caller); + /* + * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a + * best-effort approach, as they can be mapped outside of vmalloc code. 
+ * For VM_ALLOC mappings, the pages are marked as accessible after + * getting mapped in __vmalloc_node_range(). + * With hardware tag-based KASAN, marking is skipped for + * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). + */ + if (!(flags & VM_ALLOC)) + area->addr = kasan_unpoison_vmalloc(area->addr, requested_size, + KASAN_VMALLOC_PROT_NORMAL); + return area; } @@ -2547,7 +2566,7 @@ struct vm_struct *remove_vm_area(const void *addr) va->vm = NULL; spin_unlock(&vmap_area_lock); - kasan_free_shadow(vm); + kasan_free_module_shadow(vm); free_unmap_vmap_area(va); return vm; @@ -3071,7 +3090,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, const void *caller) { struct vm_struct *area; - void *addr; + void *ret; + kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE; unsigned long real_size = size; unsigned long real_align = align; unsigned int shift = PAGE_SHIFT; @@ -3124,11 +3144,51 @@ again: goto fail; } - addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node); - if (!addr) + /* + * Prepare arguments for __vmalloc_area_node() and + * kasan_unpoison_vmalloc(). + */ + if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) { + if (kasan_hw_tags_enabled()) { + /* + * Modify protection bits to allow tagging. + * This must be done before mapping. + */ + prot = arch_vmap_pgprot_tagged(prot); + + /* + * Skip page_alloc poisoning and zeroing for physical + * pages backing VM_ALLOC mapping. Memory is instead + * poisoned and zeroed by kasan_unpoison_vmalloc(). + */ + gfp_mask |= __GFP_SKIP_KASAN_UNPOISON | __GFP_SKIP_ZERO; + } + + /* Take note that the mapping is PAGE_KERNEL. */ + kasan_flags |= KASAN_VMALLOC_PROT_NORMAL; + } + + /* Allocate physical pages and map them into vmalloc space. */ + ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node); + if (!ret) goto fail; /* + * Mark the pages as accessible, now that they are mapped. 
+ * The init condition should match the one in post_alloc_hook() + * (except for the should_skip_init() check) to make sure that memory + * is initialized under the same conditions regardless of the enabled + * KASAN mode. + * Tag-based KASAN modes only assign tags to normal non-executable + * allocations, see __kasan_unpoison_vmalloc(). + */ + kasan_flags |= KASAN_VMALLOC_VM_ALLOC; + if (!want_init_on_free() && want_init_on_alloc(gfp_mask)) + kasan_flags |= KASAN_VMALLOC_INIT; + /* KASAN_VMALLOC_PROT_NORMAL already set if required. */ + area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags); + + /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED * flag. It means that vm_struct is not fully initialized. * Now, it is fully initialized, so remove this flag here. @@ -3139,7 +3199,7 @@ again: if (!(vm_flags & VM_DEFER_KMEMLEAK)) kmemleak_vmalloc(area, size, gfp_mask); - return addr; + return area->addr; fail: if (shift > PAGE_SHIFT) { @@ -3424,6 +3484,8 @@ long vread(char *buf, char *addr, unsigned long count) unsigned long buflen = count; unsigned long n; + addr = kasan_reset_tag(addr); + /* Don't allow overflow */ if ((unsigned long) addr + count < count) count = -(unsigned long) addr; @@ -3809,9 +3871,6 @@ retry: for (area = 0; area < nr_vms; area++) { if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area])) goto err_free_shadow; - - kasan_unpoison_vmalloc((void *)vas[area]->va_start, - sizes[area]); } /* insert all vm's */ @@ -3824,6 +3883,16 @@ retry: } spin_unlock(&vmap_area_lock); + /* + * Mark allocated areas as accessible. Do it now as a best-effort + * approach, as they can be mapped outside of vmalloc code. + * With hardware tag-based KASAN, marking is skipped for + * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). + */ + for (area = 0; area < nr_vms; area++) + vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr, + vms[area]->size, KASAN_VMALLOC_PROT_NORMAL); + kfree(vas); return vms; |