Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig                     3
-rw-r--r--  mm/balloon_compaction.c        6
-rw-r--r--  mm/damon/core.c                5
-rw-r--r--  mm/debug.c                     1
-rw-r--r--  mm/filemap.c                  77
-rw-r--r--  mm/gup.c                      10
-rw-r--r--  mm/huge_memory.c             109
-rw-r--r--  mm/internal.h                  6
-rw-r--r--  mm/kasan/Makefile              2
-rw-r--r--  mm/kasan/common.c              4
-rw-r--r--  mm/kasan/hw_tags.c           211
-rw-r--r--  mm/kasan/kasan.h              56
-rw-r--r--  mm/kasan/report.c            336
-rw-r--r--  mm/kasan/report_generic.c     34
-rw-r--r--  mm/kasan/report_hw_tags.c      1
-rw-r--r--  mm/kasan/report_sw_tags.c     16
-rw-r--r--  mm/kasan/report_tags.c         2
-rw-r--r--  mm/kasan/shadow.c             64
-rw-r--r--  mm/kfence/core.c              11
-rw-r--r--  mm/kfence/kfence.h             3
-rw-r--r--  mm/khugepaged.c               11
-rw-r--r--  mm/kmemleak.c                  9
-rw-r--r--  mm/maccess.c                 119
-rw-r--r--  mm/madvise.c                  48
-rw-r--r--  mm/memblock.c                  5
-rw-r--r--  mm/memcontrol.c                2
-rw-r--r--  mm/memory.c                  149
-rw-r--r--  mm/memremap.c                  2
-rw-r--r--  mm/migrate.c                   6
-rw-r--r--  mm/mlock.c                    53
-rw-r--r--  mm/page-writeback.c           18
-rw-r--r--  mm/page_alloc.c              156
-rw-r--r--  mm/page_owner.c               71
-rw-r--r--  mm/percpu-stats.c              2
-rw-r--r--  mm/readahead.c               204
-rw-r--r--  mm/rmap.c                     66
-rw-r--r--  mm/slab.c                      1
-rw-r--r--  mm/slab_common.c               2
-rw-r--r--  mm/slob.c                      2
-rw-r--r--  mm/slub.c                    130
-rw-r--r--  mm/swap.c                      8
-rw-r--r--  mm/swap_cgroup.c               4
-rw-r--r--  mm/swapfile.c                104
-rw-r--r--  mm/util.c                     50
-rw-r--r--  mm/vmalloc.c                  99
45 files changed, 1230 insertions, 1048 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 761f5021ba51..034d87953600 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -762,6 +762,9 @@ config ARCH_HAS_CURRENT_STACK_POINTER
register alias named "current_stack_pointer", this config can be
selected.
+config ARCH_HAS_FILTER_PGPROT
+ bool
+
config ARCH_HAS_PTE_DEVMAP
bool
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 907fefde2572..4b8eab4b3f45 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -203,7 +203,7 @@ EXPORT_SYMBOL_GPL(balloon_page_dequeue);
#ifdef CONFIG_BALLOON_COMPACTION
-bool balloon_page_isolate(struct page *page, isolate_mode_t mode)
+static bool balloon_page_isolate(struct page *page, isolate_mode_t mode)
{
struct balloon_dev_info *b_dev_info = balloon_page_device(page);
@@ -217,7 +217,7 @@ bool balloon_page_isolate(struct page *page, isolate_mode_t mode)
return true;
}
-void balloon_page_putback(struct page *page)
+static void balloon_page_putback(struct page *page)
{
struct balloon_dev_info *b_dev_info = balloon_page_device(page);
unsigned long flags;
@@ -230,7 +230,7 @@ void balloon_page_putback(struct page *page)
/* move_to_new_page() counterpart for a ballooned page */
-int balloon_page_migrate(struct address_space *mapping,
+static int balloon_page_migrate(struct address_space *mapping,
struct page *newpage, struct page *page,
enum migrate_mode mode)
{
diff --git a/mm/damon/core.c b/mm/damon/core.c
index c1e0fed4e877..5ce8d7c867f0 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -1019,12 +1019,15 @@ static int kdamond_wait_activation(struct damon_ctx *ctx)
struct damos *s;
unsigned long wait_time;
unsigned long min_wait_time = 0;
+ bool init_wait_time = false;
while (!kdamond_need_stop(ctx)) {
damon_for_each_scheme(s, ctx) {
wait_time = damos_wmark_wait_us(s);
- if (!min_wait_time || wait_time < min_wait_time)
+ if (!init_wait_time || wait_time < min_wait_time) {
+ init_wait_time = true;
min_wait_time = wait_time;
+ }
}
if (!min_wait_time)
return 0;
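
The hunk above works because 0 is a valid wait value for a scheme (its watermarks are already met), so 0 cannot double as an "uninitialized" marker: with the old sentinel test, a scheme returning 0 followed by one returning a nonzero value would overwrite the legitimate 0 and make kdamond sleep needlessly. Below is a minimal standalone C sketch of the corrected minimum-with-flag pattern; min_wait() and the sample values are illustrative, not kernel code.

#include <stdbool.h>
#include <stdio.h>

/* Minimum of a list in which 0 is a legitimate value, so 0 cannot be
 * used as an "uninitialized" sentinel. Mirrors the init_wait_time flag
 * added to kdamond_wait_activation(). */
static unsigned long min_wait(const unsigned long *wait, int n)
{
	unsigned long min = 0;
	bool init = false;
	int i;

	for (i = 0; i < n; i++) {
		if (!init || wait[i] < min) {
			init = true;
			min = wait[i];
		}
	}
	return min;
}

int main(void)
{
	unsigned long wait[] = { 0, 10 };

	/* The old "!min_wait_time || wait_time < min_wait_time" test would
	 * end up with 10 here; the flagged version correctly yields 0. */
	printf("min wait: %lu\n", min_wait(wait, 2));
	return 0;
}
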
diff --git a/mm/debug.c b/mm/debug.c
index eeb7ea3ca292..bef329bf28f0 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -261,5 +261,4 @@ void page_init_poison(struct page *page, size_t size)
if (page_init_poisoning)
memset(page, PAGE_POISON_PATTERN, size);
}
-EXPORT_SYMBOL_GPL(page_init_poison);
#endif /* CONFIG_DEBUG_VM */
diff --git a/mm/filemap.c b/mm/filemap.c
index d2e6a79fe69d..3a5ffb5587cd 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -152,25 +152,25 @@ static void filemap_unaccount_folio(struct address_space *mapping,
VM_BUG_ON_FOLIO(folio_mapped(folio), folio);
if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) {
- int mapcount;
-
pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n",
current->comm, folio_pfn(folio));
dump_page(&folio->page, "still mapped when deleted");
dump_stack();
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
- mapcount = page_mapcount(&folio->page);
- if (mapping_exiting(mapping) &&
- folio_ref_count(folio) >= mapcount + 2) {
- /*
- * All vmas have already been torn down, so it's
- * a good bet that actually the folio is unmapped,
- * and we'd prefer not to leak it: if we're wrong,
- * some other bad page check should catch it later.
- */
- page_mapcount_reset(&folio->page);
- folio_ref_sub(folio, mapcount);
+ if (mapping_exiting(mapping) && !folio_test_large(folio)) {
+ int mapcount = page_mapcount(&folio->page);
+
+ if (folio_ref_count(folio) >= mapcount + 2) {
+ /*
+ * All vmas have already been torn down, so it's
+ * a good bet that actually the page is unmapped
+ * and we'd rather not leak it: if we're wrong,
+ * another bad page check should catch it later.
+ */
+ page_mapcount_reset(&folio->page);
+ folio_ref_sub(folio, mapcount);
+ }
}
}
@@ -193,16 +193,20 @@ static void filemap_unaccount_folio(struct address_space *mapping,
/*
* At this point folio must be either written or cleaned by
* truncate. Dirty folio here signals a bug and loss of
- * unwritten data.
+ * unwritten data - on ordinary filesystems.
+ *
+ * But it's harmless on in-memory filesystems like tmpfs; and can
+ * occur when a driver which did get_user_pages() sets page dirty
+ * before putting it, while the inode is being finally evicted.
*
- * This fixes dirty accounting after removing the folio entirely
+ * Below fixes dirty accounting after removing the folio entirely
* but leaves the dirty flag set: it has no effect for truncated
* folio and anyway will be cleared before returning folio to
* buddy allocator.
*/
- if (WARN_ON_ONCE(folio_test_dirty(folio)))
- folio_account_cleaned(folio, mapping,
- inode_to_wb(mapping->host));
+ if (WARN_ON_ONCE(folio_test_dirty(folio) &&
+ mapping_can_writeback(mapping)))
+ folio_account_cleaned(folio, inode_to_wb(mapping->host));
}
/*
@@ -1185,24 +1189,17 @@ static void folio_wake_bit(struct folio *folio, int bit_nr)
}
/*
- * It is possible for other pages to have collided on the waitqueue
- * hash, so in that case check for a page match. That prevents a long-
- * term waiter
+ * It's possible to miss clearing waiters here, when we woke our page
+ * waiters, but the hashed waitqueue has waiters for other pages on it.
+ * That's okay, it's a rare case. The next waker will clear it.
*
- * It is still possible to miss a case here, when we woke page waiters
- * and removed them from the waitqueue, but there are still other
- * page waiters.
+ * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE,
+ * other), the flag may be cleared in the course of freeing the page;
+ * but that is not required for correctness.
*/
- if (!waitqueue_active(q) || !key.page_match) {
+ if (!waitqueue_active(q) || !key.page_match)
folio_clear_waiters(folio);
- /*
- * It's possible to miss clearing Waiters here, when we woke
- * our page waiters, but the hashed waitqueue has waiters for
- * other pages on it.
- *
- * That's okay, it's a rare case. The next waker will clear it.
- */
- }
+
spin_unlock_irqrestore(&q->lock, flags);
}
@@ -2541,7 +2538,7 @@ static int filemap_create_folio(struct file *file,
* the page cache as the locked folio would then be enough to
* synchronize with hole punching. But there are code paths
* such as filemap_update_page() filling in partially uptodate
- * pages or ->readpages() that need to hold invalidate_lock
+ * pages or ->readahead() that need to hold invalidate_lock
* while mapping blocks for IO so let's hold the lock here as
* well to keep locking rules simple.
*/
@@ -3755,9 +3752,10 @@ out:
}
EXPORT_SYMBOL(generic_file_direct_write);
-ssize_t generic_perform_write(struct file *file,
- struct iov_iter *i, loff_t pos)
+ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
{
+ struct file *file = iocb->ki_filp;
+ loff_t pos = iocb->ki_pos;
struct address_space *mapping = file->f_mapping;
const struct address_space_operations *a_ops = mapping->a_ops;
long status = 0;
@@ -3782,7 +3780,7 @@ again:
* same page as we're writing to, without it being marked
* up-to-date.
*/
- if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
+ if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
status = -EFAULT;
break;
}
@@ -3887,7 +3885,8 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
goto out;
- status = generic_perform_write(file, from, pos = iocb->ki_pos);
+ pos = iocb->ki_pos;
+ status = generic_perform_write(iocb, from);
/*
* If generic_perform_write() returned a synchronous error
* then we want to return the number of bytes which were
@@ -3919,7 +3918,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
*/
}
} else {
- written = generic_perform_write(file, from, iocb->ki_pos);
+ written = generic_perform_write(iocb, from);
if (likely(written > 0))
iocb->ki_pos += written;
}
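
fault_in_iov_iter_readable() returns the number of bytes it could not fault in, so the check above now fails only when the return equals bytes, i.e. when no forward progress at all is possible; a partial fault-in is allowed through and the copy loop works with the shorter chunk. A small standalone sketch of that "fail only on zero progress" shape follows; fake_fault_in() and copy_all() are made-up stand-ins, not the kernel helpers.

#include <stddef.h>
#include <stdio.h>

/* Stand-in for fault_in_iov_iter_readable(): returns how many of the
 * requested bytes could NOT be made accessible. */
static size_t fake_fault_in(size_t requested, size_t accessible)
{
	return requested > accessible ? requested - accessible : 0;
}

/* Copy loop in the shape of generic_perform_write(): bail out with
 * -EFAULT only when nothing could be faulted in, otherwise shrink the
 * chunk and keep going. */
static int copy_all(size_t total, size_t accessible_per_round)
{
	while (total) {
		size_t bytes = total < 4096 ? total : 4096;
		size_t not_faulted = fake_fault_in(bytes, accessible_per_round);

		if (not_faulted == bytes)
			return -14;		/* -EFAULT: zero progress */

		bytes -= not_faulted;		/* use the partial chunk */
		total -= bytes;
	}
	return 0;
}

int main(void)
{
	printf("partial progress: %d\n", copy_all(10000, 512));	/* 0 */
	printf("no progress:      %d\n", copy_all(10000, 0));	/* -14 */
	return 0;
}
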
diff --git a/mm/gup.c b/mm/gup.c
index 271fbe8195d7..f598a037eb04 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1404,6 +1404,7 @@ long populate_vma_page_range(struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
unsigned long nr_pages = (end - start) / PAGE_SIZE;
int gup_flags;
+ long ret;
VM_BUG_ON(!PAGE_ALIGNED(start));
VM_BUG_ON(!PAGE_ALIGNED(end));
@@ -1438,8 +1439,10 @@ long populate_vma_page_range(struct vm_area_struct *vma,
* We made sure addr is within a VMA, so the following will
* not result in a stack expansion that recurses back here.
*/
- return __get_user_pages(mm, start, nr_pages, gup_flags,
+ ret = __get_user_pages(mm, start, nr_pages, gup_flags,
NULL, NULL, locked);
+ lru_add_drain();
+ return ret;
}
/*
@@ -1471,6 +1474,7 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
struct mm_struct *mm = vma->vm_mm;
unsigned long nr_pages = (end - start) / PAGE_SIZE;
int gup_flags;
+ long ret;
VM_BUG_ON(!PAGE_ALIGNED(start));
VM_BUG_ON(!PAGE_ALIGNED(end));
@@ -1498,8 +1502,10 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
if (check_vma_flags(vma, gup_flags))
return -EINVAL;
- return __get_user_pages(mm, start, nr_pages, gup_flags,
+ ret = __get_user_pages(mm, start, nr_pages, gup_flags,
NULL, NULL, locked);
+ lru_add_drain();
+ return ret;
}
/*
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 005fab2f3b73..2fe38212e07c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -40,6 +40,9 @@
#include <asm/pgalloc.h>
#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/thp.h>
+
/*
* By default, transparent hugepage support is disabled in order to avoid
* risking an increased memory footprint for applications that are not
@@ -530,7 +533,7 @@ void prep_transhuge_page(struct page *page)
set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}
-bool is_transparent_hugepage(struct page *page)
+static inline bool is_transparent_hugepage(struct page *page)
{
if (!PageCompound(page))
return false;
@@ -539,7 +542,6 @@ bool is_transparent_hugepage(struct page *page)
return is_huge_zero_page(page) ||
page[1].compound_dtor == TRANSHUGE_PAGE_DTOR;
}
-EXPORT_SYMBOL_GPL(is_transparent_hugepage);
static unsigned long __thp_get_unmapped_area(struct file *filp,
unsigned long addr, unsigned long len,
@@ -1301,7 +1303,6 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
page = pmd_page(orig_pmd);
VM_BUG_ON_PAGE(!PageHead(page), page);
- /* Lock page for reuse_swap_page() */
if (!trylock_page(page)) {
get_page(page);
spin_unlock(vmf->ptl);
@@ -1317,10 +1318,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
}
/*
- * We can only reuse the page if nobody else maps the huge page or it's
- * part.
+ * See do_wp_page(): we can only map the page writable if there are
+ * no additional references. Note that we always drain the LRU
+ * pagevecs immediately after adding a THP.
*/
- if (reuse_swap_page(page)) {
+ if (page_count(page) > 1 + PageSwapCache(page) * thp_nr_pages(page))
+ goto unlock_fallback;
+ if (PageSwapCache(page))
+ try_to_free_swap(page);
+ if (page_count(page) == 1) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -1331,6 +1337,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
return VM_FAULT_WRITE;
}
+unlock_fallback:
unlock_page(page);
spin_unlock(vmf->ptl);
fallback:
@@ -2126,8 +2133,6 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
{
spinlock_t *ptl;
struct mmu_notifier_range range;
- bool do_unlock_folio = false;
- pmd_t _pmd;
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
address & HPAGE_PMD_MASK,
@@ -2146,42 +2151,12 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
goto out;
}
-repeat:
- if (pmd_trans_huge(*pmd)) {
- if (!folio) {
- folio = page_folio(pmd_page(*pmd));
- /*
- * An anonymous page must be locked, to ensure that a
- * concurrent reuse_swap_page() sees stable mapcount;
- * but reuse_swap_page() is not used on shmem or file,
- * and page lock must not be taken when zap_pmd_range()
- * calls __split_huge_pmd() while i_mmap_lock is held.
- */
- if (folio_test_anon(folio)) {
- if (unlikely(!folio_trylock(folio))) {
- folio_get(folio);
- _pmd = *pmd;
- spin_unlock(ptl);
- folio_lock(folio);
- spin_lock(ptl);
- if (unlikely(!pmd_same(*pmd, _pmd))) {
- folio_unlock(folio);
- folio_put(folio);
- folio = NULL;
- goto repeat;
- }
- folio_put(folio);
- }
- do_unlock_folio = true;
- }
- }
- } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
- goto out;
- __split_huge_pmd_locked(vma, pmd, range.start, freeze);
+ if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
+ is_pmd_migration_entry(*pmd))
+ __split_huge_pmd_locked(vma, pmd, range.start, freeze);
+
out:
spin_unlock(ptl);
- if (do_unlock_folio)
- folio_unlock(folio);
/*
* No need to double call mmu_notifier->invalidate_range() callback.
* They are 3 cases to consider inside __split_huge_pmd_locked():
@@ -2476,54 +2451,6 @@ static void __split_huge_page(struct page *page, struct list_head *list,
}
}
-/*
- * This calculates accurately how many mappings a transparent hugepage
- * has (unlike page_mapcount() which isn't fully accurate). This full
- * accuracy is primarily needed to know if copy-on-write faults can
- * reuse the page and change the mapping to read-write instead of
- * copying them. At the same time this returns the total_mapcount too.
- *
- * The function returns the highest mapcount any one of the subpages
- * has. If the return value is one, even if different processes are
- * mapping different subpages of the transparent hugepage, they can
- * all reuse it, because each process is reusing a different subpage.
- *
- * The total_mapcount is instead counting all virtual mappings of the
- * subpages. If the total_mapcount is equal to "one", it tells the
- * caller all mappings belong to the same "mm" and in turn the
- * anon_vma of the transparent hugepage can become the vma->anon_vma
- * local one as no other process may be mapping any of the subpages.
- *
- * It would be more accurate to replace page_mapcount() with
- * page_trans_huge_mapcount(), however we only use
- * page_trans_huge_mapcount() in the copy-on-write faults where we
- * need full accuracy to avoid breaking page pinning, because
- * page_trans_huge_mapcount() is slower than page_mapcount().
- */
-int page_trans_huge_mapcount(struct page *page)
-{
- int i, ret;
-
- /* hugetlbfs shouldn't call it */
- VM_BUG_ON_PAGE(PageHuge(page), page);
-
- if (likely(!PageTransCompound(page)))
- return atomic_read(&page->_mapcount) + 1;
-
- page = compound_head(page);
-
- ret = 0;
- for (i = 0; i < thp_nr_pages(page); i++) {
- int mapcount = atomic_read(&page[i]._mapcount) + 1;
- ret = max(ret, mapcount);
- }
-
- if (PageDoubleMap(page))
- ret -= 1;
-
- return ret + compound_mapcount(page);
-}
-
/* Racy check whether the huge page can be split */
bool can_split_folio(struct folio *folio, int *pextra_pins)
{
@@ -3131,6 +3058,7 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
set_pmd_at(mm, address, pvmw->pmd, pmdswp);
page_remove_rmap(page, vma, true);
put_page(page);
+ trace_set_migration_pmd(address, pmd_val(pmdswp));
}
void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
@@ -3163,5 +3091,6 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
/* No need to invalidate - it was non-present before */
update_mmu_cache_pmd(vma, address, pvmw->pmd);
+ trace_remove_migration_pmd(address, pmd_val(pmde));
}
#endif
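
The do_huge_pmd_wp_page() change above replaces reuse_swap_page() with a plain reference-count test: the THP may be remapped writable only if the only references left are our mapping plus, transiently, the swap cache (one per subpage, as the check implies), and after try_to_free_swap() the count must be exactly one. A toy standalone sketch of that decision follows; struct toy_page, can_reuse() and the field names are invented for illustration.

#include <stdbool.h>
#include <stdio.h>

/* Toy model of the reuse decision: map the THP writable only if nobody
 * but us (and, transiently, the swap cache) holds a reference. */
struct toy_page {
	int refcount;	/* page_count() */
	bool swapcache;	/* PageSwapCache() */
	int nr_pages;	/* thp_nr_pages() */
};

static bool can_reuse(struct toy_page *p)
{
	/* Extra references beyond mapping + swap cache: fall back to copy. */
	if (p->refcount > 1 + (p->swapcache ? p->nr_pages : 0))
		return false;

	/* Drop the swap-cache references, as try_to_free_swap() would. */
	if (p->swapcache) {
		p->refcount -= p->nr_pages;
		p->swapcache = false;
	}

	/* Reuse only when provably the single owner. */
	return p->refcount == 1;
}

int main(void)
{
	struct toy_page exclusive = { .refcount = 1, .swapcache = false, .nr_pages = 512 };
	struct toy_page pinned = { .refcount = 2, .swapcache = false, .nr_pages = 512 };

	printf("exclusive THP: %s\n", can_reuse(&exclusive) ? "reuse" : "copy");
	printf("pinned THP:    %s\n", can_reuse(&pinned) ? "reuse" : "copy");
	return 0;
}
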
diff --git a/mm/internal.h b/mm/internal.h
index 58dc6adc19c5..cf16280ce132 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -456,7 +456,8 @@ static inline void munlock_vma_page(struct page *page,
}
void mlock_new_page(struct page *page);
bool need_mlock_page_drain(int cpu);
-void mlock_page_drain(int cpu);
+void mlock_page_drain_local(void);
+void mlock_page_drain_remote(int cpu);
extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
@@ -539,7 +540,8 @@ static inline void munlock_vma_page(struct page *page,
struct vm_area_struct *vma, bool compound) { }
static inline void mlock_new_page(struct page *page) { }
static inline bool need_mlock_page_drain(int cpu) { return false; }
-static inline void mlock_page_drain(int cpu) { }
+static inline void mlock_page_drain_local(void) { }
+static inline void mlock_page_drain_remote(int cpu) { }
static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
{
}
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index adcd9acaef61..1f84df9c302e 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -35,7 +35,7 @@ CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
-obj-$(CONFIG_KASAN) := common.o report.o
+obj-y := common.o report.o
obj-$(CONFIG_KASAN_GENERIC) += init.o generic.o report_generic.o shadow.o quarantine.o
obj-$(CONFIG_KASAN_HW_TAGS) += hw_tags.o report_hw_tags.o tags.o report_tags.o
obj-$(CONFIG_KASAN_SW_TAGS) += init.o report_sw_tags.o shadow.o sw_tags.o tags.o report_tags.o
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 92196562687b..d9079ec11f31 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -387,7 +387,7 @@ static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip)
}
/*
- * The object will be poisoned by kasan_free_pages() or
+ * The object will be poisoned by kasan_poison_pages() or
* kasan_slab_free_mempool().
*/
@@ -538,7 +538,7 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size,
return NULL;
/*
- * The object has already been unpoisoned by kasan_alloc_pages() for
+ * The object has already been unpoisoned by kasan_unpoison_pages() for
* alloc_pages() or by kasan_krealloc() for krealloc().
*/
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index 7355cb534e4f..07a76c46daa5 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -32,6 +32,12 @@ enum kasan_arg_mode {
KASAN_ARG_MODE_ASYMM,
};
+enum kasan_arg_vmalloc {
+ KASAN_ARG_VMALLOC_DEFAULT,
+ KASAN_ARG_VMALLOC_OFF,
+ KASAN_ARG_VMALLOC_ON,
+};
+
enum kasan_arg_stacktrace {
KASAN_ARG_STACKTRACE_DEFAULT,
KASAN_ARG_STACKTRACE_OFF,
@@ -40,18 +46,28 @@ enum kasan_arg_stacktrace {
static enum kasan_arg kasan_arg __ro_after_init;
static enum kasan_arg_mode kasan_arg_mode __ro_after_init;
-static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init;
+static enum kasan_arg_vmalloc kasan_arg_vmalloc __initdata;
+static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata;
-/* Whether KASAN is enabled at all. */
+/*
+ * Whether KASAN is enabled at all.
+ * The value remains false until KASAN is initialized by kasan_init_hw_tags().
+ */
DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled);
EXPORT_SYMBOL(kasan_flag_enabled);
-/* Whether the selected mode is synchronous/asynchronous/asymmetric.*/
+/*
+ * Whether the selected mode is synchronous, asynchronous, or asymmetric.
+ * Defaults to KASAN_MODE_SYNC.
+ */
enum kasan_mode kasan_mode __ro_after_init;
EXPORT_SYMBOL_GPL(kasan_mode);
+/* Whether to enable vmalloc tagging. */
+DEFINE_STATIC_KEY_TRUE(kasan_flag_vmalloc);
+
/* Whether to collect alloc/free stack traces. */
-DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
+DEFINE_STATIC_KEY_TRUE(kasan_flag_stacktrace);
/* kasan=off/on */
static int __init early_kasan_flag(char *arg)
@@ -89,6 +105,23 @@ static int __init early_kasan_mode(char *arg)
}
early_param("kasan.mode", early_kasan_mode);
+/* kasan.vmalloc=off/on */
+static int __init early_kasan_flag_vmalloc(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (!strcmp(arg, "off"))
+ kasan_arg_vmalloc = KASAN_ARG_VMALLOC_OFF;
+ else if (!strcmp(arg, "on"))
+ kasan_arg_vmalloc = KASAN_ARG_VMALLOC_ON;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+early_param("kasan.vmalloc", early_kasan_flag_vmalloc);
+
/* kasan.stacktrace=off/on */
static int __init early_kasan_flag_stacktrace(char *arg)
{
@@ -116,7 +149,10 @@ static inline const char *kasan_mode_info(void)
return "sync";
}
-/* kasan_init_hw_tags_cpu() is called for each CPU. */
+/*
+ * kasan_init_hw_tags_cpu() is called for each CPU.
+ * Not marked as __init as a CPU can be hot-plugged after boot.
+ */
void kasan_init_hw_tags_cpu(void)
{
/*
@@ -124,7 +160,11 @@ void kasan_init_hw_tags_cpu(void)
* as this function is only called for MTE-capable hardware.
*/
- /* If KASAN is disabled via command line, don't initialize it. */
+ /*
+ * If KASAN is disabled via command line, don't initialize it.
+ * When this function is called, kasan_flag_enabled is not yet
+ * set by kasan_init_hw_tags(). Thus, check kasan_arg instead.
+ */
if (kasan_arg == KASAN_ARG_OFF)
return;
@@ -132,12 +172,7 @@ void kasan_init_hw_tags_cpu(void)
* Enable async or asymm modes only when explicitly requested
* through the command line.
*/
- if (kasan_arg_mode == KASAN_ARG_MODE_ASYNC)
- hw_enable_tagging_async();
- else if (kasan_arg_mode == KASAN_ARG_MODE_ASYMM)
- hw_enable_tagging_asymm();
- else
- hw_enable_tagging_sync();
+ kasan_enable_tagging();
}
/* kasan_init_hw_tags() is called once on boot CPU. */
@@ -151,86 +186,168 @@ void __init kasan_init_hw_tags(void)
if (kasan_arg == KASAN_ARG_OFF)
return;
- /* Enable KASAN. */
- static_branch_enable(&kasan_flag_enabled);
-
switch (kasan_arg_mode) {
case KASAN_ARG_MODE_DEFAULT:
- /*
- * Default to sync mode.
- */
- fallthrough;
+ /* Default is specified by kasan_mode definition. */
+ break;
case KASAN_ARG_MODE_SYNC:
- /* Sync mode enabled. */
kasan_mode = KASAN_MODE_SYNC;
break;
case KASAN_ARG_MODE_ASYNC:
- /* Async mode enabled. */
kasan_mode = KASAN_MODE_ASYNC;
break;
case KASAN_ARG_MODE_ASYMM:
- /* Asymm mode enabled. */
kasan_mode = KASAN_MODE_ASYMM;
break;
}
+ switch (kasan_arg_vmalloc) {
+ case KASAN_ARG_VMALLOC_DEFAULT:
+ /* Default is specified by kasan_flag_vmalloc definition. */
+ break;
+ case KASAN_ARG_VMALLOC_OFF:
+ static_branch_disable(&kasan_flag_vmalloc);
+ break;
+ case KASAN_ARG_VMALLOC_ON:
+ static_branch_enable(&kasan_flag_vmalloc);
+ break;
+ }
+
switch (kasan_arg_stacktrace) {
case KASAN_ARG_STACKTRACE_DEFAULT:
- /* Default to enabling stack trace collection. */
- static_branch_enable(&kasan_flag_stacktrace);
+ /* Default is specified by kasan_flag_stacktrace definition. */
break;
case KASAN_ARG_STACKTRACE_OFF:
- /* Do nothing, kasan_flag_stacktrace keeps its default value. */
+ static_branch_disable(&kasan_flag_stacktrace);
break;
case KASAN_ARG_STACKTRACE_ON:
static_branch_enable(&kasan_flag_stacktrace);
break;
}
- pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, stacktrace=%s)\n",
+ /* KASAN is now initialized, enable it. */
+ static_branch_enable(&kasan_flag_enabled);
+
+ pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, vmalloc=%s, stacktrace=%s)\n",
kasan_mode_info(),
+ kasan_vmalloc_enabled() ? "on" : "off",
kasan_stack_collection_enabled() ? "on" : "off");
}
-void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags)
+#ifdef CONFIG_KASAN_VMALLOC
+
+static void unpoison_vmalloc_pages(const void *addr, u8 tag)
{
+ struct vm_struct *area;
+ int i;
+
/*
- * This condition should match the one in post_alloc_hook() in
- * page_alloc.c.
+ * As hardware tag-based KASAN only tags VM_ALLOC vmalloc allocations
+ * (see the comment in __kasan_unpoison_vmalloc), all of the pages
+ * should belong to a single area.
*/
- bool init = !want_init_on_free() && want_init_on_alloc(flags);
-
- if (flags & __GFP_SKIP_KASAN_POISON)
- SetPageSkipKASanPoison(page);
+ area = find_vm_area((void *)addr);
+ if (WARN_ON(!area))
+ return;
- if (flags & __GFP_ZEROTAGS) {
- int i;
+ for (i = 0; i < area->nr_pages; i++) {
+ struct page *page = area->pages[i];
- for (i = 0; i != 1 << order; ++i)
- tag_clear_highpage(page + i);
- } else {
- kasan_unpoison_pages(page, order, init);
+ page_kasan_tag_set(page, tag);
}
}
-void kasan_free_pages(struct page *page, unsigned int order)
+void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
+ kasan_vmalloc_flags_t flags)
{
+ u8 tag;
+ unsigned long redzone_start, redzone_size;
+
+ if (!kasan_vmalloc_enabled())
+ return (void *)start;
+
+ if (!is_vmalloc_or_module_addr(start))
+ return (void *)start;
+
+ /*
+ * Skip unpoisoning and assigning a pointer tag for non-VM_ALLOC
+ * mappings as:
+ *
+ * 1. Unlike the software KASAN modes, hardware tag-based KASAN only
+ * supports tagging physical memory. Therefore, it can only tag a
+ * single mapping of normal physical pages.
+ * 2. Hardware tag-based KASAN can only tag memory mapped with special
+ * mapping protection bits, see arch_vmalloc_pgprot_modify().
+ * As non-VM_ALLOC mappings can be mapped outside of vmalloc code,
+ * providing these bits would require tracking all non-VM_ALLOC
+ * mappers.
+ *
+ * Thus, for VM_ALLOC mappings, hardware tag-based KASAN only tags
+ * the first virtual mapping, which is created by vmalloc().
+ * Tagging the page_alloc memory backing that vmalloc() allocation is
+ * skipped, see ___GFP_SKIP_KASAN_UNPOISON.
+ *
+ * For non-VM_ALLOC allocations, page_alloc memory is tagged as usual.
+ */
+ if (!(flags & KASAN_VMALLOC_VM_ALLOC))
+ return (void *)start;
+
+ /*
+ * Don't tag executable memory.
+ * The kernel doesn't tolerate having the PC register tagged.
+ */
+ if (!(flags & KASAN_VMALLOC_PROT_NORMAL))
+ return (void *)start;
+
+ tag = kasan_random_tag();
+ start = set_tag(start, tag);
+
+ /* Unpoison and initialize memory up to size. */
+ kasan_unpoison(start, size, flags & KASAN_VMALLOC_INIT);
+
+ /*
+ * Explicitly poison and initialize the in-page vmalloc() redzone.
+ * Unlike software KASAN modes, hardware tag-based KASAN doesn't
+ * unpoison memory when populating shadow for vmalloc() space.
+ */
+ redzone_start = round_up((unsigned long)start + size,
+ KASAN_GRANULE_SIZE);
+ redzone_size = round_up(redzone_start, PAGE_SIZE) - redzone_start;
+ kasan_poison((void *)redzone_start, redzone_size, KASAN_TAG_INVALID,
+ flags & KASAN_VMALLOC_INIT);
+
/*
- * This condition should match the one in free_pages_prepare() in
- * page_alloc.c.
+ * Set per-page tag flags to allow accessing physical memory for the
+ * vmalloc() mapping through page_address(vmalloc_to_page()).
*/
- bool init = want_init_on_free();
+ unpoison_vmalloc_pages(start, tag);
- kasan_poison_pages(page, order, init);
+ return (void *)start;
+}
+
+void __kasan_poison_vmalloc(const void *start, unsigned long size)
+{
+ /*
+ * No tagging here.
+ * The physical pages backing the vmalloc() allocation are poisoned
+ * through the usual page_alloc paths.
+ */
}
+#endif
+
#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
-void kasan_enable_tagging_sync(void)
+void kasan_enable_tagging(void)
{
- hw_enable_tagging_sync();
+ if (kasan_arg_mode == KASAN_ARG_MODE_ASYNC)
+ hw_enable_tagging_async();
+ else if (kasan_arg_mode == KASAN_ARG_MODE_ASYMM)
+ hw_enable_tagging_asymm();
+ else
+ hw_enable_tagging_sync();
}
-EXPORT_SYMBOL_GPL(kasan_enable_tagging_sync);
+EXPORT_SYMBOL_GPL(kasan_enable_tagging);
void kasan_force_async_fault(void)
{
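
One detail of __kasan_unpoison_vmalloc() above that is easy to misread is the in-page redzone arithmetic: the redzone runs from the granule-rounded end of the allocation to the end of its last page. The standalone sketch below just checks that arithmetic; a KASAN_GRANULE_SIZE of 16 and a 4 KiB PAGE_SIZE are assumed typical values, and round_up_to() is a local stand-in for the kernel's round_up().

#include <stdio.h>

#define KASAN_GRANULE_SIZE	16UL	/* typical value, assumed here */
#define PAGE_SIZE		4096UL	/* assumed here */

/* Round x up to the next multiple of a power-of-two alignment. */
static unsigned long round_up_to(unsigned long x, unsigned long align)
{
	return (x + align - 1) & ~(align - 1);
}

int main(void)
{
	unsigned long start = 0xffff800010000000UL;	/* made-up vmalloc address */
	unsigned long size = 1000;			/* requested size */
	unsigned long redzone_start, redzone_size;

	redzone_start = round_up_to(start + size, KASAN_GRANULE_SIZE);
	redzone_size = round_up_to(redzone_start, PAGE_SIZE) - redzone_start;

	/* 1000 bytes round up to 1008; the remaining 3088 bytes of the
	 * 4096-byte page become the poisoned in-page redzone. */
	printf("redzone at +%lu, %lu bytes\n", redzone_start - start, redzone_size);
	return 0;
}
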
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index c17fa8d26ffe..d79b83d673b1 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -12,7 +12,8 @@
#include <linux/static_key.h>
#include "../slab.h"
-DECLARE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
+DECLARE_STATIC_KEY_TRUE(kasan_flag_vmalloc);
+DECLARE_STATIC_KEY_TRUE(kasan_flag_stacktrace);
enum kasan_mode {
KASAN_MODE_SYNC,
@@ -22,6 +23,11 @@ enum kasan_mode {
extern enum kasan_mode kasan_mode __ro_after_init;
+static inline bool kasan_vmalloc_enabled(void)
+{
+ return static_branch_likely(&kasan_flag_vmalloc);
+}
+
static inline bool kasan_stack_collection_enabled(void)
{
return static_branch_unlikely(&kasan_flag_stacktrace);
@@ -71,17 +77,19 @@ static inline bool kasan_sync_fault_possible(void)
#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */
#define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */
#define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */
-#define KASAN_KMALLOC_FREETRACK 0xFA /* object was freed and has free track set */
+#define KASAN_VMALLOC_INVALID 0xF8 /* unallocated space in vmapped page */
#else
#define KASAN_FREE_PAGE KASAN_TAG_INVALID
#define KASAN_PAGE_REDZONE KASAN_TAG_INVALID
#define KASAN_KMALLOC_REDZONE KASAN_TAG_INVALID
#define KASAN_KMALLOC_FREE KASAN_TAG_INVALID
-#define KASAN_KMALLOC_FREETRACK KASAN_TAG_INVALID
+#define KASAN_VMALLOC_INVALID KASAN_TAG_INVALID /* only for SW_TAGS */
#endif
+#ifdef CONFIG_KASAN_GENERIC
+
+#define KASAN_KMALLOC_FREETRACK 0xFA /* object was freed and has free track set */
#define KASAN_GLOBAL_REDZONE 0xF9 /* redzone for global variable */
-#define KASAN_VMALLOC_INVALID 0xF8 /* unallocated space in vmapped page */
/*
* Stack redzone shadow values
@@ -110,6 +118,8 @@ static inline bool kasan_sync_fault_possible(void)
#define KASAN_ABI_VERSION 1
#endif
+#endif /* CONFIG_KASAN_GENERIC */
+
/* Metadata layout customization. */
#define META_BYTES_PER_BLOCK 1
#define META_BLOCKS_PER_ROW 16
@@ -117,9 +127,15 @@ static inline bool kasan_sync_fault_possible(void)
#define META_MEM_BYTES_PER_ROW (META_BYTES_PER_ROW * KASAN_GRANULE_SIZE)
#define META_ROWS_AROUND_ADDR 2
-struct kasan_access_info {
- const void *access_addr;
- const void *first_bad_addr;
+enum kasan_report_type {
+ KASAN_REPORT_ACCESS,
+ KASAN_REPORT_INVALID_FREE,
+};
+
+struct kasan_report_info {
+ enum kasan_report_type type;
+ void *access_addr;
+ void *first_bad_addr;
size_t access_size;
bool is_write;
unsigned long ip;
@@ -204,6 +220,14 @@ struct kasan_free_meta {
#endif
};
+#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
+/* Used in KUnit-compatible KASAN tests. */
+struct kunit_kasan_status {
+ bool report_found;
+ bool sync_fault;
+};
+#endif
+
struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache,
const void *object);
#ifdef CONFIG_KASAN_GENERIC
@@ -221,7 +245,8 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
static inline bool addr_has_metadata(const void *addr)
{
- return (addr >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START));
+ return (kasan_reset_tag(addr) >=
+ kasan_shadow_to_mem((void *)KASAN_SHADOW_START));
}
/**
@@ -251,10 +276,10 @@ static inline void kasan_print_tags(u8 addr_tag, const void *addr) { }
#endif
void *kasan_find_first_bad_addr(void *addr, size_t size);
-const char *kasan_get_bug_type(struct kasan_access_info *info);
+const char *kasan_get_bug_type(struct kasan_report_info *info);
void kasan_metadata_fetch_row(char *buffer, void *row);
-#if defined(CONFIG_KASAN_GENERIC) && defined(CONFIG_KASAN_STACK)
+#if defined(CONFIG_KASAN_STACK)
void kasan_print_address_stack_frame(const void *addr);
#else
static inline void kasan_print_address_stack_frame(const void *addr) { }
@@ -340,12 +365,12 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
#if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
-void kasan_enable_tagging_sync(void);
+void kasan_enable_tagging(void);
void kasan_force_async_fault(void);
#else /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */
-static inline void kasan_enable_tagging_sync(void) { }
+static inline void kasan_enable_tagging(void) { }
static inline void kasan_force_async_fault(void) { }
#endif /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */
@@ -467,6 +492,13 @@ static inline bool kasan_arch_is_ready(void) { return true; }
#error kasan_arch_is_ready only works in KASAN generic outline mode!
#endif
+#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) || IS_ENABLED(CONFIG_KASAN_MODULE_TEST)
+
+bool kasan_save_enable_multi_shot(void);
+void kasan_restore_multi_shot(bool enabled);
+
+#endif
+
/*
* Exported functions for interfaces called from assembly or from generated
* code. Declarations here to avoid warning about missing declarations.
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 3ad9624dcc56..199d77cce21a 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -13,6 +13,7 @@
#include <linux/ftrace.h>
#include <linux/init.h>
#include <linux/kernel.h>
+#include <linux/lockdep.h>
#include <linux/mm.h>
#include <linux/printk.h>
#include <linux/sched.h>
@@ -64,6 +65,40 @@ static int __init early_kasan_fault(char *arg)
}
early_param("kasan.fault", early_kasan_fault);
+static int __init kasan_set_multi_shot(char *str)
+{
+ set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
+ return 1;
+}
+__setup("kasan_multi_shot", kasan_set_multi_shot);
+
+/*
+ * Used to suppress reports within kasan_disable/enable_current() critical
+ * sections, which are used for marking accesses to slab metadata.
+ */
+static bool report_suppressed(void)
+{
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
+ if (current->kasan_depth)
+ return true;
+#endif
+ return false;
+}
+
+/*
+ * Used to avoid reporting more than one KASAN bug unless kasan_multi_shot
+ * is enabled. Note that KASAN tests effectively enable kasan_multi_shot
+ * for their duration.
+ */
+static bool report_enabled(void)
+{
+ if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
+ return true;
+ return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags);
+}
+
+#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) || IS_ENABLED(CONFIG_KASAN_MODULE_TEST)
+
bool kasan_save_enable_multi_shot(void)
{
return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
@@ -77,61 +112,87 @@ void kasan_restore_multi_shot(bool enabled)
}
EXPORT_SYMBOL_GPL(kasan_restore_multi_shot);
-static int __init kasan_set_multi_shot(char *str)
-{
- set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
- return 1;
-}
-__setup("kasan_multi_shot", kasan_set_multi_shot);
+#endif
-static void print_error_description(struct kasan_access_info *info)
+#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
+static void update_kunit_status(bool sync)
{
- pr_err("BUG: KASAN: %s in %pS\n",
- kasan_get_bug_type(info), (void *)info->ip);
- if (info->access_size)
- pr_err("%s of size %zu at addr %px by task %s/%d\n",
- info->is_write ? "Write" : "Read", info->access_size,
- info->access_addr, current->comm, task_pid_nr(current));
- else
- pr_err("%s at addr %px by task %s/%d\n",
- info->is_write ? "Write" : "Read",
- info->access_addr, current->comm, task_pid_nr(current));
+ struct kunit *test;
+ struct kunit_resource *resource;
+ struct kunit_kasan_status *status;
+
+ test = current->kunit_test;
+ if (!test)
+ return;
+
+ resource = kunit_find_named_resource(test, "kasan_status");
+ if (!resource) {
+ kunit_set_failure(test);
+ return;
+ }
+
+ status = (struct kunit_kasan_status *)resource->data;
+ WRITE_ONCE(status->report_found, true);
+ WRITE_ONCE(status->sync_fault, sync);
+
+ kunit_put_resource(resource);
}
+#else
+static void update_kunit_status(bool sync) { }
+#endif
static DEFINE_SPINLOCK(report_lock);
-static void start_report(unsigned long *flags)
+static void start_report(unsigned long *flags, bool sync)
{
- /*
- * Make sure we don't end up in loop.
- */
+ /* Respect the /proc/sys/kernel/traceoff_on_warning interface. */
+ disable_trace_on_warning();
+ /* Update status of the currently running KASAN test. */
+ update_kunit_status(sync);
+ /* Do not allow LOCKDEP mangling KASAN reports. */
+ lockdep_off();
+ /* Make sure we don't end up in loop. */
kasan_disable_current();
spin_lock_irqsave(&report_lock, *flags);
pr_err("==================================================================\n");
}
-static void end_report(unsigned long *flags, unsigned long addr)
+static void end_report(unsigned long *flags, void *addr)
{
- if (!kasan_async_fault_possible())
- trace_error_report_end(ERROR_DETECTOR_KASAN, addr);
+ if (addr)
+ trace_error_report_end(ERROR_DETECTOR_KASAN,
+ (unsigned long)addr);
pr_err("==================================================================\n");
- add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
spin_unlock_irqrestore(&report_lock, *flags);
- if (panic_on_warn && !test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) {
- /*
- * This thread may hit another WARN() in the panic path.
- * Resetting this prevents additional WARN() from panicking the
- * system on this thread. Other threads are blocked by the
- * panic_mutex in panic().
- */
- panic_on_warn = 0;
+ if (panic_on_warn && !test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
panic("panic_on_warn set ...\n");
- }
if (kasan_arg_fault == KASAN_ARG_FAULT_PANIC)
panic("kasan.fault=panic set ...\n");
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+ lockdep_on();
kasan_enable_current();
}
+static void print_error_description(struct kasan_report_info *info)
+{
+ if (info->type == KASAN_REPORT_INVALID_FREE) {
+ pr_err("BUG: KASAN: double-free or invalid-free in %pS\n",
+ (void *)info->ip);
+ return;
+ }
+
+ pr_err("BUG: KASAN: %s in %pS\n",
+ kasan_get_bug_type(info), (void *)info->ip);
+ if (info->access_size)
+ pr_err("%s of size %zu at addr %px by task %s/%d\n",
+ info->is_write ? "Write" : "Read", info->access_size,
+ info->access_addr, current->comm, task_pid_nr(current));
+ else
+ pr_err("%s at addr %px by task %s/%d\n",
+ info->is_write ? "Write" : "Read",
+ info->access_addr, current->comm, task_pid_nr(current));
+}
+
static void print_track(struct kasan_track *track, const char *prefix)
{
pr_err("%s by task %u:\n", prefix, track->pid);
@@ -170,9 +231,6 @@ static void describe_object_addr(struct kmem_cache *cache, void *object,
" which belongs to the cache %s of size %d\n",
object, cache->name, cache->object_size);
- if (!addr)
- return;
-
if (access_addr < object_addr) {
rel_type = "to the left";
rel_bytes = object_addr - access_addr;
@@ -261,19 +319,43 @@ static void print_address_description(void *addr, u8 tag)
void *object = nearest_obj(cache, slab, addr);
describe_object(cache, object, addr, tag);
+ pr_err("\n");
}
if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) {
pr_err("The buggy address belongs to the variable:\n");
pr_err(" %pS\n", addr);
+ pr_err("\n");
+ }
+
+ if (object_is_on_stack(addr)) {
+ /*
+ * Currently, KASAN supports printing frame information only
+ * for accesses to the task's own stack.
+ */
+ kasan_print_address_stack_frame(addr);
+ pr_err("\n");
+ }
+
+ if (is_vmalloc_addr(addr)) {
+ struct vm_struct *va = find_vm_area(addr);
+
+ if (va) {
+ pr_err("The buggy address belongs to the virtual mapping at\n"
+ " [%px, %px) created by:\n"
+ " %pS\n",
+ va->addr, va->addr + va->size, va->caller);
+ pr_err("\n");
+
+			page = vmalloc_to_page(addr);
+ }
}
if (page) {
- pr_err("The buggy address belongs to the page:\n");
+ pr_err("The buggy address belongs to the physical page:\n");
dump_page(page, "kasan: bad access detected");
+ pr_err("\n");
}
-
- kasan_print_address_stack_frame(addr);
}
static bool meta_row_is_guilty(const void *row, const void *addr)
@@ -332,138 +414,110 @@ static void print_memory_metadata(const void *addr)
}
}
-static bool report_enabled(void)
+static void print_report(struct kasan_report_info *info)
{
-#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
- if (current->kasan_depth)
- return false;
-#endif
- if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
- return true;
- return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags);
-}
+ void *tagged_addr = info->access_addr;
+ void *untagged_addr = kasan_reset_tag(tagged_addr);
+ u8 tag = get_tag(tagged_addr);
-#if IS_ENABLED(CONFIG_KUNIT)
-static void kasan_update_kunit_status(struct kunit *cur_test)
-{
- struct kunit_resource *resource;
- struct kunit_kasan_expectation *kasan_data;
-
- resource = kunit_find_named_resource(cur_test, "kasan_data");
+ print_error_description(info);
+ if (addr_has_metadata(untagged_addr))
+ kasan_print_tags(tag, info->first_bad_addr);
+ pr_err("\n");
- if (!resource) {
- kunit_set_failure(cur_test);
- return;
+ if (addr_has_metadata(untagged_addr)) {
+ print_address_description(untagged_addr, tag);
+ print_memory_metadata(info->first_bad_addr);
+ } else {
+ dump_stack_lvl(KERN_ERR);
}
-
- kasan_data = (struct kunit_kasan_expectation *)resource->data;
- WRITE_ONCE(kasan_data->report_found, true);
- kunit_put_resource(resource);
}
-#endif /* IS_ENABLED(CONFIG_KUNIT) */
-void kasan_report_invalid_free(void *object, unsigned long ip)
+void kasan_report_invalid_free(void *ptr, unsigned long ip)
{
unsigned long flags;
- u8 tag = get_tag(object);
-
- object = kasan_reset_tag(object);
+ struct kasan_report_info info;
-#if IS_ENABLED(CONFIG_KUNIT)
- if (current->kunit_test)
- kasan_update_kunit_status(current->kunit_test);
-#endif /* IS_ENABLED(CONFIG_KUNIT) */
+ /*
+ * Do not check report_suppressed(), as an invalid-free cannot be
+ * caused by accessing slab metadata and thus should not be
+ * suppressed by kasan_disable/enable_current() critical sections.
+ */
+ if (unlikely(!report_enabled()))
+ return;
- start_report(&flags);
- pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip);
- kasan_print_tags(tag, object);
- pr_err("\n");
- print_address_description(object, tag);
- pr_err("\n");
- print_memory_metadata(object);
- end_report(&flags, (unsigned long)object);
-}
+ start_report(&flags, true);
-#ifdef CONFIG_KASAN_HW_TAGS
-void kasan_report_async(void)
-{
- unsigned long flags;
+ info.type = KASAN_REPORT_INVALID_FREE;
+ info.access_addr = ptr;
+ info.first_bad_addr = kasan_reset_tag(ptr);
+ info.access_size = 0;
+ info.is_write = false;
+ info.ip = ip;
-#if IS_ENABLED(CONFIG_KUNIT)
- if (current->kunit_test)
- kasan_update_kunit_status(current->kunit_test);
-#endif /* IS_ENABLED(CONFIG_KUNIT) */
+ print_report(&info);
- start_report(&flags);
- pr_err("BUG: KASAN: invalid-access\n");
- pr_err("Asynchronous mode enabled: no access details available\n");
- pr_err("\n");
- dump_stack_lvl(KERN_ERR);
- end_report(&flags, 0);
+ end_report(&flags, ptr);
}
-#endif /* CONFIG_KASAN_HW_TAGS */
-static void __kasan_report(unsigned long addr, size_t size, bool is_write,
- unsigned long ip)
+/*
+ * kasan_report() is the only reporting function that uses
+ * user_access_save/restore(): kasan_report_invalid_free() cannot be called
+ * from a UACCESS region, and kasan_report_async() is not used on x86.
+ */
+bool kasan_report(unsigned long addr, size_t size, bool is_write,
+ unsigned long ip)
{
- struct kasan_access_info info;
- void *tagged_addr;
- void *untagged_addr;
- unsigned long flags;
-
-#if IS_ENABLED(CONFIG_KUNIT)
- if (current->kunit_test)
- kasan_update_kunit_status(current->kunit_test);
-#endif /* IS_ENABLED(CONFIG_KUNIT) */
-
- disable_trace_on_warning();
+ bool ret = true;
+ void *ptr = (void *)addr;
+ unsigned long ua_flags = user_access_save();
+ unsigned long irq_flags;
+ struct kasan_report_info info;
+
+ if (unlikely(report_suppressed()) || unlikely(!report_enabled())) {
+ ret = false;
+ goto out;
+ }
- tagged_addr = (void *)addr;
- untagged_addr = kasan_reset_tag(tagged_addr);
+ start_report(&irq_flags, true);
- info.access_addr = tagged_addr;
- if (addr_has_metadata(untagged_addr))
- info.first_bad_addr =
- kasan_find_first_bad_addr(tagged_addr, size);
- else
- info.first_bad_addr = untagged_addr;
+ info.type = KASAN_REPORT_ACCESS;
+ info.access_addr = ptr;
+ info.first_bad_addr = kasan_find_first_bad_addr(ptr, size);
info.access_size = size;
info.is_write = is_write;
info.ip = ip;
- start_report(&flags);
+ print_report(&info);
- print_error_description(&info);
- if (addr_has_metadata(untagged_addr))
- kasan_print_tags(get_tag(tagged_addr), info.first_bad_addr);
- pr_err("\n");
+ end_report(&irq_flags, ptr);
- if (addr_has_metadata(untagged_addr)) {
- print_address_description(untagged_addr, get_tag(tagged_addr));
- pr_err("\n");
- print_memory_metadata(info.first_bad_addr);
- } else {
- dump_stack_lvl(KERN_ERR);
- }
+out:
+ user_access_restore(ua_flags);
- end_report(&flags, addr);
+ return ret;
}
-bool kasan_report(unsigned long addr, size_t size, bool is_write,
- unsigned long ip)
+#ifdef CONFIG_KASAN_HW_TAGS
+void kasan_report_async(void)
{
- unsigned long flags = user_access_save();
- bool ret = false;
-
- if (likely(report_enabled())) {
- __kasan_report(addr, size, is_write, ip);
- ret = true;
- }
+ unsigned long flags;
- user_access_restore(flags);
+ /*
+ * Do not check report_suppressed(), as kasan_disable/enable_current()
+ * critical sections do not affect Hardware Tag-Based KASAN.
+ */
+ if (unlikely(!report_enabled()))
+ return;
- return ret;
+ start_report(&flags, false);
+ pr_err("BUG: KASAN: invalid-access\n");
+ pr_err("Asynchronous fault: no details available\n");
+ pr_err("\n");
+ dump_stack_lvl(KERN_ERR);
+ end_report(&flags, NULL);
}
+#endif /* CONFIG_KASAN_HW_TAGS */
#ifdef CONFIG_KASAN_INLINE
/*
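
The rework above splits the old gating logic in two: report_suppressed() honours kasan_disable_current() sections, while report_enabled() remains the one-shot gate that the kasan_multi_shot flag overrides, implemented as an atomic test-and-set so that only the first racing CPU reports. A minimal userspace sketch of that gate using C11 atomics follows; it is a shape illustration only, not the kernel's flag layout.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Report every bug when multi_shot is set; otherwise report only the
 * first one, however many threads hit bugs concurrently. */
static atomic_bool multi_shot;
static atomic_flag reported = ATOMIC_FLAG_INIT;

static bool report_enabled(void)
{
	if (atomic_load(&multi_shot))
		return true;
	/* test-and-set: only the first caller sees the flag clear. */
	return !atomic_flag_test_and_set(&reported);
}

int main(void)
{
	printf("first bug reported?  %d\n", report_enabled());	/* 1 */
	printf("second bug reported? %d\n", report_enabled());	/* 0 */

	atomic_store(&multi_shot, true);
	printf("with multi_shot:     %d\n", report_enabled());	/* 1 */
	return 0;
}
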
diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c
index 139615ef326b..efc5e79a103f 100644
--- a/mm/kasan/report_generic.c
+++ b/mm/kasan/report_generic.c
@@ -34,12 +34,16 @@ void *kasan_find_first_bad_addr(void *addr, size_t size)
{
void *p = addr;
+ if (!addr_has_metadata(p))
+ return p;
+
while (p < addr + size && !(*(u8 *)kasan_mem_to_shadow(p)))
p += KASAN_GRANULE_SIZE;
+
return p;
}
-static const char *get_shadow_bug_type(struct kasan_access_info *info)
+static const char *get_shadow_bug_type(struct kasan_report_info *info)
{
const char *bug_type = "unknown-crash";
u8 *shadow_addr;
@@ -91,7 +95,7 @@ static const char *get_shadow_bug_type(struct kasan_access_info *info)
return bug_type;
}
-static const char *get_wild_bug_type(struct kasan_access_info *info)
+static const char *get_wild_bug_type(struct kasan_report_info *info)
{
const char *bug_type = "unknown-crash";
@@ -105,7 +109,7 @@ static const char *get_wild_bug_type(struct kasan_access_info *info)
return bug_type;
}
-const char *kasan_get_bug_type(struct kasan_access_info *info)
+const char *kasan_get_bug_type(struct kasan_report_info *info)
{
/*
* If access_size is a negative number, then it has reason to be
@@ -180,7 +184,7 @@ static void print_decoded_frame_descr(const char *frame_descr)
return;
pr_err("\n");
- pr_err("this frame has %lu %s:\n", num_objects,
+ pr_err("This frame has %lu %s:\n", num_objects,
num_objects == 1 ? "object" : "objects");
while (num_objects--) {
@@ -211,6 +215,7 @@ static void print_decoded_frame_descr(const char *frame_descr)
}
}
+/* Returns true only if the address is on the current task's stack. */
static bool __must_check get_address_stack_frame_info(const void *addr,
unsigned long *offset,
const char **frame_descr,
@@ -224,13 +229,6 @@ static bool __must_check get_address_stack_frame_info(const void *addr,
BUILD_BUG_ON(IS_ENABLED(CONFIG_STACK_GROWSUP));
- /*
- * NOTE: We currently only support printing frame information for
- * accesses to the task's own stack.
- */
- if (!object_is_on_stack(addr))
- return false;
-
aligned_addr = round_down((unsigned long)addr, sizeof(long));
mem_ptr = round_down(aligned_addr, KASAN_GRANULE_SIZE);
shadow_ptr = kasan_mem_to_shadow((void *)aligned_addr);
@@ -269,17 +267,17 @@ void kasan_print_address_stack_frame(const void *addr)
const char *frame_descr;
const void *frame_pc;
+ if (WARN_ON(!object_is_on_stack(addr)))
+ return;
+
+ pr_err("The buggy address belongs to stack of task %s/%d\n",
+ current->comm, task_pid_nr(current));
+
if (!get_address_stack_frame_info(addr, &offset, &frame_descr,
&frame_pc))
return;
- /*
- * get_address_stack_frame_info only returns true if the given addr is
- * on the current task's stack.
- */
- pr_err("\n");
- pr_err("addr %px is located in stack of task %s/%d at offset %lu in frame:\n",
- addr, current->comm, task_pid_nr(current), offset);
+ pr_err(" and is located at offset %lu in frame:\n", offset);
pr_err(" %pS\n", frame_pc);
if (!frame_descr)
diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c
index 5dbbbb930e7a..f3d3be614e4b 100644
--- a/mm/kasan/report_hw_tags.c
+++ b/mm/kasan/report_hw_tags.c
@@ -17,6 +17,7 @@
void *kasan_find_first_bad_addr(void *addr, size_t size)
{
+ /* Return the same value regardless of whether addr_has_metadata(). */
return kasan_reset_tag(addr);
}
diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c
index d2298c357834..7a26397297ed 100644
--- a/mm/kasan/report_sw_tags.c
+++ b/mm/kasan/report_sw_tags.c
@@ -16,6 +16,7 @@
#include <linux/mm.h>
#include <linux/printk.h>
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
#include <linux/slab.h>
#include <linux/stackdepot.h>
#include <linux/stacktrace.h>
@@ -35,8 +36,12 @@ void *kasan_find_first_bad_addr(void *addr, size_t size)
void *p = kasan_reset_tag(addr);
void *end = p + size;
+ if (!addr_has_metadata(p))
+ return p;
+
while (p < end && tag == *(u8 *)kasan_mem_to_shadow(p))
p += KASAN_GRANULE_SIZE;
+
return p;
}
@@ -51,3 +56,14 @@ void kasan_print_tags(u8 addr_tag, const void *addr)
pr_err("Pointer tag: [%02x], memory tag: [%02x]\n", addr_tag, *shadow);
}
+
+#ifdef CONFIG_KASAN_STACK
+void kasan_print_address_stack_frame(const void *addr)
+{
+ if (WARN_ON(!object_is_on_stack(addr)))
+ return;
+
+ pr_err("The buggy address belongs to stack of task %s/%d\n",
+ current->comm, task_pid_nr(current));
+}
+#endif
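
For the software tag-based mode, kasan_find_first_bad_addr() above steps through the shadow one granule at a time until the stored memory tag stops matching the pointer tag; the hunk only adds the addr_has_metadata() guard in front of that walk. The standalone sketch below shows the walk itself over a fake shadow array; first_bad_offset(), GRANULE and the sample tags are illustrative.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define GRANULE 16	/* bytes of memory covered by one shadow byte */

/* Walk the shadow from the start of the access until the stored memory
 * tag no longer matches the pointer tag; that granule is where the bad
 * access begins. */
static size_t first_bad_offset(const uint8_t *shadow, size_t access_size,
			       uint8_t ptr_tag)
{
	size_t off = 0;

	while (off < access_size && shadow[off / GRANULE] == ptr_tag)
		off += GRANULE;

	return off;
}

int main(void)
{
	/* Three granules tagged 0xAB, then a granule tagged 0xFE (freed). */
	uint8_t shadow[] = { 0xAB, 0xAB, 0xAB, 0xFE };

	printf("first bad byte at offset %zu\n",
	       first_bad_offset(shadow, sizeof(shadow) * GRANULE, 0xAB));
	return 0;
}
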
diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c
index 1b41de88c53e..e25d2166e813 100644
--- a/mm/kasan/report_tags.c
+++ b/mm/kasan/report_tags.c
@@ -7,7 +7,7 @@
#include "kasan.h"
#include "../slab.h"
-const char *kasan_get_bug_type(struct kasan_access_info *info)
+const char *kasan_get_bug_type(struct kasan_report_info *info)
{
#ifdef CONFIG_KASAN_TAGS_IDENTIFY
struct kasan_alloc_meta *alloc_meta;
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 94136f84b449..a4f07de21771 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -345,27 +345,6 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
return 0;
}
-/*
- * Poison the shadow for a vmalloc region. Called as part of the
- * freeing process at the time the region is freed.
- */
-void kasan_poison_vmalloc(const void *start, unsigned long size)
-{
- if (!is_vmalloc_or_module_addr(start))
- return;
-
- size = round_up(size, KASAN_GRANULE_SIZE);
- kasan_poison(start, size, KASAN_VMALLOC_INVALID, false);
-}
-
-void kasan_unpoison_vmalloc(const void *start, unsigned long size)
-{
- if (!is_vmalloc_or_module_addr(start))
- return;
-
- kasan_unpoison(start, size, false);
-}
-
static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
void *unused)
{
@@ -496,9 +475,48 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
}
}
+void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
+ kasan_vmalloc_flags_t flags)
+{
+ /*
+ * Software KASAN modes unpoison both VM_ALLOC and non-VM_ALLOC
+ * mappings, so the KASAN_VMALLOC_VM_ALLOC flag is ignored.
+ * Software KASAN modes can't optimize zeroing memory by combining it
+ * with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored.
+ */
+
+ if (!is_vmalloc_or_module_addr(start))
+ return (void *)start;
+
+ /*
+ * Don't tag executable memory with the tag-based mode.
+ * The kernel doesn't tolerate having the PC register tagged.
+ */
+ if (IS_ENABLED(CONFIG_KASAN_SW_TAGS) &&
+ !(flags & KASAN_VMALLOC_PROT_NORMAL))
+ return (void *)start;
+
+ start = set_tag(start, kasan_random_tag());
+ kasan_unpoison(start, size, false);
+ return (void *)start;
+}
+
+/*
+ * Poison the shadow for a vmalloc region. Called as part of the
+ * freeing process at the time the region is freed.
+ */
+void __kasan_poison_vmalloc(const void *start, unsigned long size)
+{
+ if (!is_vmalloc_or_module_addr(start))
+ return;
+
+ size = round_up(size, KASAN_GRANULE_SIZE);
+ kasan_poison(start, size, KASAN_VMALLOC_INVALID, false);
+}
+
#else /* CONFIG_KASAN_VMALLOC */
-int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask)
+int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask)
{
void *ret;
size_t scaled_size;
@@ -534,7 +552,7 @@ int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask)
return -ENOMEM;
}
-void kasan_free_shadow(const struct vm_struct *vm)
+void kasan_free_module_shadow(const struct vm_struct *vm)
{
if (vm->flags & VM_KASAN)
vfree(kasan_mem_to_shadow(vm->addr));
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 2f9fdfde1941..a203747ad2c0 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -566,6 +566,8 @@ static unsigned long kfence_init_pool(void)
* enters __slab_free() slow-path.
*/
for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
+ struct slab *slab = page_slab(&pages[i]);
+
if (!i || (i % 2))
continue;
@@ -573,7 +575,11 @@ static unsigned long kfence_init_pool(void)
if (WARN_ON(compound_head(&pages[i]) != &pages[i]))
return addr;
- __SetPageSlab(&pages[i]);
+ __folio_set_slab(slab_folio(slab));
+#ifdef CONFIG_MEMCG
+ slab->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg |
+ MEMCG_DATA_OBJCGS;
+#endif
}
/*
@@ -1033,6 +1039,9 @@ void __kfence_free(void *addr)
{
struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
+#ifdef CONFIG_MEMCG
+ KFENCE_WARN_ON(meta->objcg);
+#endif
/*
* If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing
* the object, as the object page may be recycled for other-typed
diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
index 2a2d5de9d379..9a6c4b1b12a8 100644
--- a/mm/kfence/kfence.h
+++ b/mm/kfence/kfence.h
@@ -89,6 +89,9 @@ struct kfence_metadata {
struct kfence_track free_track;
/* For updating alloc_covered on frees. */
u32 alloc_stack_hash;
+#ifdef CONFIG_MEMCG
+ struct obj_cgroup *objcg;
+#endif
};
extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 1cdf7c38b9e5..a4e5eaf3eb01 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -46,7 +46,6 @@ enum scan_result {
SCAN_VMA_NULL,
SCAN_VMA_CHECK,
SCAN_ADDRESS_RANGE,
- SCAN_SWAP_CACHE_PAGE,
SCAN_DEL_PAGE_LRU,
SCAN_ALLOC_HUGE_PAGE_FAIL,
SCAN_CGROUP_CHARGE_FAIL,
@@ -683,16 +682,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
result = SCAN_PAGE_COUNT;
goto out;
}
- if (!pte_write(pteval) && PageSwapCache(page) &&
- !reuse_swap_page(page)) {
- /*
- * Page is in the swap cache and cannot be re-used.
- * It cannot be collapsed into a THP.
- */
- unlock_page(page);
- result = SCAN_SWAP_CACHE_PAGE;
- goto out;
- }
/*
* Isolate the page to avoid collapsing an hugepage
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 7580baa76af1..acd7cbb82e16 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -796,6 +796,8 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
unsigned long flags;
struct kmemleak_object *object;
struct kmemleak_scan_area *area = NULL;
+ unsigned long untagged_ptr;
+ unsigned long untagged_objp;
object = find_and_get_object(ptr, 1);
if (!object) {
@@ -804,6 +806,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
return;
}
+ untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
+ untagged_objp = (unsigned long)kasan_reset_tag((void *)object->pointer);
+
if (scan_area_cache)
area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
@@ -815,8 +820,8 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
goto out_unlock;
}
if (size == SIZE_MAX) {
- size = object->pointer + object->size - ptr;
- } else if (ptr + size > object->pointer + object->size) {
+ size = untagged_objp + object->size - untagged_ptr;
+ } else if (untagged_ptr + size > untagged_objp + object->size) {
kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
dump_object_info(object);
kmem_cache_free(scan_area_cache, area);
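
The kmemleak hunk above strips the KASAN tags before the boundary checks because the scan-area pointer and the object pointer can carry different top-byte tags, and arithmetic on the tagged values then compares the tags rather than the addresses. A simplified standalone illustration follows; the TAG_SHIFT/TAG_MASK model is an assumption for the sketch, not the exact arm64 layout, and set_tag()/reset_tag() are local stand-ins.

#include <stdint.h>
#include <stdio.h>

#define TAG_SHIFT 56	/* simplified: the tag lives in the top byte */
#define TAG_MASK (0xffULL << TAG_SHIFT)

static uint64_t set_tag(uint64_t addr, uint8_t tag)
{
	return (addr & ~TAG_MASK) | ((uint64_t)tag << TAG_SHIFT);
}

static uint64_t reset_tag(uint64_t addr)	/* kasan_reset_tag() analogue */
{
	return addr & ~TAG_MASK;
}

int main(void)
{
	uint64_t objp = set_tag(0x1000, 0x2a);	/* object pointer, tag 0x2a */
	uint64_t ptr = set_tag(0x1040, 0x7c);	/* scan area inside it, tag 0x7c */
	uint64_t object_size = 0x100;

	/* Tagged compare: the differing top bytes dominate, so an
	 * in-bounds scan area looks like it overflows the object. */
	printf("tagged check says overflow:   %d\n",
	       ptr + 0x20 > objp + object_size);

	/* Untagged compare, as in the fixed add_scan_area(). */
	printf("untagged check says overflow: %d\n",
	       reset_tag(ptr) + 0x20 > reset_tag(objp) + object_size);
	return 0;
}
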
diff --git a/mm/maccess.c b/mm/maccess.c
index 3fed2b876539..5f4d240f67ec 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -12,8 +12,6 @@ bool __weak copy_from_kernel_nofault_allowed(const void *unsafe_src,
return true;
}
-#ifdef HAVE_GET_KERNEL_NOFAULT
-
#define copy_from_kernel_nofault_loop(dst, src, len, type, err_label) \
while (len >= sizeof(type)) { \
__get_kernel_nofault(dst, src, type, err_label); \
@@ -102,112 +100,6 @@ Efault:
dst[-1] = '\0';
return -EFAULT;
}
-#else /* HAVE_GET_KERNEL_NOFAULT */
-/**
- * copy_from_kernel_nofault(): safely attempt to read from kernel-space
- * @dst: pointer to the buffer that shall take the data
- * @src: address to read from
- * @size: size of the data chunk
- *
- * Safely read from kernel address @src to the buffer at @dst. If a kernel
- * fault happens, handle that and return -EFAULT. If @src is not a valid kernel
- * address, return -ERANGE.
- *
- * We ensure that the copy_from_user is executed in atomic context so that
- * do_page_fault() doesn't attempt to take mmap_lock. This makes
- * copy_from_kernel_nofault() suitable for use within regions where the caller
- * already holds mmap_lock, or other locks which nest inside mmap_lock.
- */
-long copy_from_kernel_nofault(void *dst, const void *src, size_t size)
-{
- long ret;
- mm_segment_t old_fs = get_fs();
-
- if (!copy_from_kernel_nofault_allowed(src, size))
- return -ERANGE;
-
- set_fs(KERNEL_DS);
- pagefault_disable();
- ret = __copy_from_user_inatomic(dst, (__force const void __user *)src,
- size);
- pagefault_enable();
- set_fs(old_fs);
-
- if (ret)
- return -EFAULT;
- return 0;
-}
-EXPORT_SYMBOL_GPL(copy_from_kernel_nofault);
-
-/**
- * copy_to_kernel_nofault(): safely attempt to write to a location
- * @dst: address to write to
- * @src: pointer to the data that shall be written
- * @size: size of the data chunk
- *
- * Safely write to address @dst from the buffer at @src. If a kernel fault
- * happens, handle that and return -EFAULT.
- */
-long copy_to_kernel_nofault(void *dst, const void *src, size_t size)
-{
- long ret;
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- pagefault_disable();
- ret = __copy_to_user_inatomic((__force void __user *)dst, src, size);
- pagefault_enable();
- set_fs(old_fs);
-
- if (ret)
- return -EFAULT;
- return 0;
-}
-
-/**
- * strncpy_from_kernel_nofault: - Copy a NUL terminated string from unsafe
- * address.
- * @dst: Destination address, in kernel space. This buffer must be at
- * least @count bytes long.
- * @unsafe_addr: Unsafe address.
- * @count: Maximum number of bytes to copy, including the trailing NUL.
- *
- * Copies a NUL-terminated string from unsafe address to kernel buffer.
- *
- * On success, returns the length of the string INCLUDING the trailing NUL.
- *
- * If access fails, returns -EFAULT (some data may have been copied and the
- * trailing NUL added). If @unsafe_addr is not a valid kernel address, return
- * -ERANGE.
- *
- * If @count is smaller than the length of the string, copies @count-1 bytes,
- * sets the last byte of @dst buffer to NUL and returns @count.
- */
-long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count)
-{
- mm_segment_t old_fs = get_fs();
- const void *src = unsafe_addr;
- long ret;
-
- if (unlikely(count <= 0))
- return 0;
- if (!copy_from_kernel_nofault_allowed(unsafe_addr, count))
- return -ERANGE;
-
- set_fs(KERNEL_DS);
- pagefault_disable();
-
- do {
- ret = __get_user(*dst++, (const char __user __force *)src++);
- } while (dst[-1] && ret == 0 && src - unsafe_addr < count);
-
- dst[-1] = '\0';
- pagefault_enable();
- set_fs(old_fs);
-
- return ret ? -EFAULT : src - unsafe_addr;
-}
-#endif /* HAVE_GET_KERNEL_NOFAULT */
/**
* copy_from_user_nofault(): safely attempt to read from a user-space location
@@ -221,14 +113,11 @@ long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count)
long copy_from_user_nofault(void *dst, const void __user *src, size_t size)
{
long ret = -EFAULT;
- mm_segment_t old_fs = force_uaccess_begin();
-
if (access_ok(src, size)) {
pagefault_disable();
ret = __copy_from_user_inatomic(dst, src, size);
pagefault_enable();
}
- force_uaccess_end(old_fs);
if (ret)
return -EFAULT;
@@ -248,14 +137,12 @@ EXPORT_SYMBOL_GPL(copy_from_user_nofault);
long copy_to_user_nofault(void __user *dst, const void *src, size_t size)
{
long ret = -EFAULT;
- mm_segment_t old_fs = force_uaccess_begin();
if (access_ok(dst, size)) {
pagefault_disable();
ret = __copy_to_user_inatomic(dst, src, size);
pagefault_enable();
}
- force_uaccess_end(old_fs);
if (ret)
return -EFAULT;
@@ -284,17 +171,14 @@ EXPORT_SYMBOL_GPL(copy_to_user_nofault);
long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
long count)
{
- mm_segment_t old_fs;
long ret;
if (unlikely(count <= 0))
return 0;
- old_fs = force_uaccess_begin();
pagefault_disable();
ret = strncpy_from_user(dst, unsafe_addr, count);
pagefault_enable();
- force_uaccess_end(old_fs);
if (ret >= count) {
ret = count;
@@ -324,14 +208,11 @@ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
*/
long strnlen_user_nofault(const void __user *unsafe_addr, long count)
{
- mm_segment_t old_fs;
int ret;
- old_fs = force_uaccess_begin();
pagefault_disable();
ret = strnlen_user(unsafe_addr, count);
pagefault_enable();
- force_uaccess_end(old_fs);
return ret;
}
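With the set_fs()-based fallback removed, copy_from_kernel_nofault() is the only path for speculative kernel reads and relies purely on __get_kernel_nofault(). A hedged sketch of a caller; peek_task_comm() is illustrative and not part of this patch:

#include <linux/sched.h>
#include <linux/uaccess.h>

/* Illustrative only: sample a task's comm without risking a fault. */
static int peek_task_comm(struct task_struct *task, char *buf, size_t len)
{
	long ret;

	/* Returns -ERANGE for an invalid kernel address, -EFAULT on a fault. */
	ret = copy_from_kernel_nofault(buf, task->comm,
				       min(len, sizeof(task->comm)));
	return ret ? (int)ret : 0;
}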
diff --git a/mm/madvise.c b/mm/madvise.c
index 39b712fd8300..1873616a37d2 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -52,6 +52,7 @@ static int madvise_need_mmap_write(int behavior)
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
+ case MADV_DONTNEED_LOCKED:
case MADV_COLD:
case MADV_PAGEOUT:
case MADV_FREE:
@@ -504,7 +505,7 @@ static void madvise_cold_page_range(struct mmu_gather *tlb,
static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
{
- return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
+ return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
}
static long madvise_cold(struct vm_area_struct *vma,
@@ -777,6 +778,29 @@ static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
return 0;
}
+static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long *end,
+ int behavior)
+{
+ if (!is_vm_hugetlb_page(vma)) {
+ unsigned int forbidden = VM_PFNMAP;
+
+ if (behavior != MADV_DONTNEED_LOCKED)
+ forbidden |= VM_LOCKED;
+
+ return !(vma->vm_flags & forbidden);
+ }
+
+ if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED)
+ return false;
+ if (start & ~huge_page_mask(hstate_vma(vma)))
+ return false;
+
+ *end = ALIGN(*end, huge_page_size(hstate_vma(vma)));
+ return true;
+}
+
static long madvise_dontneed_free(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end,
@@ -785,7 +809,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
*prev = vma;
- if (!can_madv_lru_vma(vma))
+ if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
return -EINVAL;
if (!userfaultfd_remove(vma, start, end)) {
@@ -807,7 +831,12 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
*/
return -ENOMEM;
}
- if (!can_madv_lru_vma(vma))
+ /*
+ * Potential end adjustment for hugetlb vma is OK as
+ * the check below keeps end within vma.
+ */
+ if (!madvise_dontneed_free_valid_vma(vma, start, &end,
+ behavior))
return -EINVAL;
if (end > vma->vm_end) {
/*
@@ -827,7 +856,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
VM_WARN_ON(start >= end);
}
- if (behavior == MADV_DONTNEED)
+ if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
return madvise_dontneed_single_vma(vma, start, end);
else if (behavior == MADV_FREE)
return madvise_free_single_vma(vma, start, end);
@@ -966,6 +995,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
return madvise_pageout(vma, prev, start, end);
case MADV_FREE:
case MADV_DONTNEED:
+ case MADV_DONTNEED_LOCKED:
return madvise_dontneed_free(vma, prev, start, end, behavior);
case MADV_POPULATE_READ:
case MADV_POPULATE_WRITE:
@@ -1096,6 +1126,7 @@ madvise_behavior_valid(int behavior)
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
+ case MADV_DONTNEED_LOCKED:
case MADV_FREE:
case MADV_COLD:
case MADV_PAGEOUT:
@@ -1433,16 +1464,9 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
while (iov_iter_count(&iter)) {
iovec = iov_iter_iovec(&iter);
- /*
- * do_madvise returns ENOMEM if unmapped holes are present
- * in the passed VMA. process_madvise() is expected to skip
- * unmapped holes passed to it in the 'struct iovec' list
- * and not fail because of them. Thus treat -ENOMEM return
- * from do_madvise as valid and continue processing.
- */
ret = do_madvise(mm, (unsigned long)iovec.iov_base,
iovec.iov_len, behavior);
- if (ret < 0 && ret != -ENOMEM)
+ if (ret < 0)
break;
iov_iter_advance(&iter, iovec.iov_len);
}
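MADV_DONTNEED_LOCKED, added above, behaves like MADV_DONTNEED but is also accepted on VM_LOCKED mappings (and on hugetlb VMAs, subject to the alignment check in madvise_dontneed_free_valid_vma()). A hedged userspace sketch; the fallback #define assumes the uapi value and is only needed while libc headers lack the constant:

#include <stdio.h>
#include <sys/mman.h>

#ifndef MADV_DONTNEED_LOCKED
#define MADV_DONTNEED_LOCKED 24	/* assumed uapi value */
#endif

int main(void)
{
	size_t len = 2 * 1024 * 1024;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	/* Plain MADV_DONTNEED would fail with EINVAL on a locked mapping. */
	if (madvise(p, len, MADV_DONTNEED_LOCKED))
		perror("madvise");

	munmap(p, len);
	return 0;
}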
diff --git a/mm/memblock.c b/mm/memblock.c
index b12a364f2766..e4f03a6e8e56 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1284,11 +1284,10 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
{
int zone_nid = zone_to_nid(zone);
phys_addr_t spa, epa;
- int nid;
__next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
&memblock.memory, &memblock.reserved,
- &spa, &epa, &nid);
+ &spa, &epa, NULL);
while (*idx != U64_MAX) {
unsigned long epfn = PFN_DOWN(epa);
@@ -1315,7 +1314,7 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
__next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
&memblock.memory, &memblock.reserved,
- &spa, &epa, &nid);
+ &spa, &epa, NULL);
}
/* signal end of iteration */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d495c2acb9f0..725f76723220 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -60,7 +60,7 @@
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
#include <linux/psi.h>
#include <linux/seq_buf.h>
#include "internal.h"
diff --git a/mm/memory.c b/mm/memory.c
index 7c40850b7124..76e3af9639d9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3287,19 +3287,35 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
if (PageAnon(vmf->page)) {
struct page *page = vmf->page;
- /* PageKsm() doesn't necessarily raise the page refcount */
- if (PageKsm(page) || page_count(page) != 1)
+ /*
+ * We have to verify under page lock: these early checks are
+ * just an optimization to avoid locking the page and freeing
+ * the swapcache if there is little hope that we can reuse.
+ *
+ * PageKsm() doesn't necessarily raise the page refcount.
+ */
+ if (PageKsm(page) || page_count(page) > 3)
+ goto copy;
+ if (!PageLRU(page))
+ /*
+ * Note: We cannot easily detect+handle references from
+ * remote LRU pagevecs or references to PageLRU() pages.
+ */
+ lru_add_drain();
+ if (page_count(page) > 1 + PageSwapCache(page))
goto copy;
if (!trylock_page(page))
goto copy;
- if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) {
+ if (PageSwapCache(page))
+ try_to_free_swap(page);
+ if (PageKsm(page) || page_count(page) != 1) {
unlock_page(page);
goto copy;
}
/*
- * Ok, we've got the only map reference, and the only
- * page count reference, and the page is locked,
- * it's dark out, and we're wearing sunglasses. Hit it.
+ * Ok, we've got the only page reference from our mapping
+ * and the page is locked, it's dark out, and we're wearing
+ * sunglasses. Hit it.
*/
unlock_page(page);
wp_page_reuse(vmf);
@@ -3372,11 +3388,11 @@ void unmap_mapping_folio(struct folio *folio)
details.even_cows = false;
details.single_folio = folio;
- i_mmap_lock_write(mapping);
+ i_mmap_lock_read(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
unmap_mapping_range_tree(&mapping->i_mmap, first_index,
last_index, &details);
- i_mmap_unlock_write(mapping);
+ i_mmap_unlock_read(mapping);
}
/**
@@ -3402,11 +3418,11 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
if (last_index < first_index)
last_index = ULONG_MAX;
- i_mmap_lock_write(mapping);
+ i_mmap_lock_read(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
unmap_mapping_range_tree(&mapping->i_mmap, first_index,
last_index, &details);
- i_mmap_unlock_write(mapping);
+ i_mmap_unlock_read(mapping);
}
EXPORT_SYMBOL_GPL(unmap_mapping_pages);
@@ -3473,6 +3489,25 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
return 0;
}
+static inline bool should_try_to_free_swap(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned int fault_flags)
+{
+ if (!PageSwapCache(page))
+ return false;
+ if (mem_cgroup_swap_full(page) || (vma->vm_flags & VM_LOCKED) ||
+ PageMlocked(page))
+ return true;
+ /*
+ * If we want to map a page that's in the swapcache writable, we
+ * have to detect via the refcount if we're really the exclusive
+ * user. Try freeing the swapcache to get rid of the swapcache
+ * reference only in case it's likely that we'll be the exlusive user.
+	 * reference only in case it's likely that we'll be the exclusive user.
+ return (fault_flags & FAULT_FLAG_WRITE) && !PageKsm(page) &&
+ page_count(page) == 2;
+}
+
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
@@ -3591,21 +3626,39 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
goto out_release;
}
- /*
- * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
- * release the swapcache from under us. The page pin, and pte_same
- * test below, are not enough to exclude that. Even if it is still
- * swapcache, we need to check that the page's swap has not changed.
- */
- if (unlikely((!PageSwapCache(page) ||
- page_private(page) != entry.val)) && swapcache)
- goto out_page;
-
- page = ksm_might_need_to_copy(page, vma, vmf->address);
- if (unlikely(!page)) {
- ret = VM_FAULT_OOM;
- page = swapcache;
- goto out_page;
+ if (swapcache) {
+ /*
+ * Make sure try_to_free_swap or swapoff did not release the
+ * swapcache from under us. The page pin, and pte_same test
+ * below, are not enough to exclude that. Even if it is still
+ * swapcache, we need to check that the page's swap has not
+ * changed.
+ */
+ if (unlikely(!PageSwapCache(page) ||
+ page_private(page) != entry.val))
+ goto out_page;
+
+ /*
+ * KSM sometimes has to copy on read faults, for example, if
+ * page->index of !PageKSM() pages would be nonlinear inside the
+ * anon VMA -- PageKSM() is lost on actual swapout.
+ */
+ page = ksm_might_need_to_copy(page, vma, vmf->address);
+ if (unlikely(!page)) {
+ ret = VM_FAULT_OOM;
+ page = swapcache;
+ goto out_page;
+ }
+
+ /*
+ * If we want to map a page that's in the swapcache writable, we
+ * have to detect via the refcount if we're really the exclusive
+ * owner. Try removing the extra reference from the local LRU
+ * pagevecs if required.
+ */
+ if ((vmf->flags & FAULT_FLAG_WRITE) && page == swapcache &&
+ !PageKsm(page) && !PageLRU(page))
+ lru_add_drain();
}
cgroup_throttle_swaprate(page, GFP_KERNEL);
@@ -3624,19 +3677,25 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
}
/*
- * The page isn't present yet, go ahead with the fault.
- *
- * Be careful about the sequence of operations here.
- * To get its accounting right, reuse_swap_page() must be called
- * while the page is counted on swap but not yet in mapcount i.e.
- * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
- * must be called after the swap_free(), or it will never succeed.
+ * Remove the swap entry and conditionally try to free up the swapcache.
+ * We're already holding a reference on the page but haven't mapped it
+ * yet.
*/
+ swap_free(entry);
+ if (should_try_to_free_swap(page, vma, vmf->flags))
+ try_to_free_swap(page);
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);
- if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
+
+ /*
+ * Same logic as in do_wp_page(); however, optimize for fresh pages
+ * that are certainly not shared because we just allocated them without
+ * exposing them to the swapcache.
+ */
+ if ((vmf->flags & FAULT_FLAG_WRITE) && !PageKsm(page) &&
+ (page != swapcache || page_count(page) == 1)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
vmf->flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
@@ -3662,10 +3721,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
- swap_free(entry);
- if (mem_cgroup_swap_full(page) ||
- (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
- try_to_free_swap(page);
unlock_page(page);
if (page != swapcache && swapcache) {
/*
@@ -3863,14 +3918,18 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
return ret;
if (unlikely(PageHWPoison(vmf->page))) {
+ struct page *page = vmf->page;
vm_fault_t poisonret = VM_FAULT_HWPOISON;
if (ret & VM_FAULT_LOCKED) {
+ if (page_mapped(page))
+ unmap_mapping_pages(page_mapping(page),
+ page->index, 1, false);
/* Retry if a clean page was removed from the cache. */
- if (invalidate_inode_page(vmf->page))
- poisonret = 0;
- unlock_page(vmf->page);
+ if (invalidate_inode_page(page))
+ poisonret = VM_FAULT_NOPAGE;
+ unlock_page(page);
}
- put_page(vmf->page);
+ put_page(page);
vmf->page = NULL;
return poisonret;
}
@@ -5255,14 +5314,6 @@ void print_vma_addr(char *prefix, unsigned long ip)
#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
void __might_fault(const char *file, int line)
{
- /*
- * Some code (nfs/sunrpc) uses socket ops on kernel memory while
- * holding the mmap_lock, this is safe because kernel memory doesn't
- * get paged out, therefore we'll never actually fault, and the
- * below annotations will generate false positives.
- */
- if (uaccess_kernel())
- return;
if (pagefault_disabled())
return;
__might_sleep(file, line);
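The do_wp_page() and do_swap_page() changes above drop reuse_swap_page() in favour of a plain reference-count test for exclusive ownership. A condensed sketch of the reuse condition as applied at the end of do_swap_page(); the helper name is illustrative, not from the patch:

/*
 * Illustrative only: after swap_free() and an optional try_to_free_swap(),
 * the page may be mapped writable when no other reference can exist: either
 * it is not (or no longer) the swapcache page, or our reference is the only
 * one left.
 */
static bool can_reuse_swapped_page(struct page *page, struct page *swapcache)
{
	return !PageKsm(page) &&
	       (page != swapcache || page_count(page) == 1);
}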
diff --git a/mm/memremap.c b/mm/memremap.c
index c17eca4a48ca..af0223605e69 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -456,8 +456,6 @@ void free_zone_device_page(struct page *page)
if (WARN_ON_ONCE(!page->pgmap->ops || !page->pgmap->ops->page_free))
return;
- __ClearPageWaiters(page);
-
mem_cgroup_uncharge(page_folio(page));
/*
diff --git a/mm/migrate.c b/mm/migrate.c
index 4f30ed37856f..de175e2fdba5 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -53,7 +53,6 @@
#include <asm/tlbflush.h>
-#define CREATE_TRACE_POINTS
#include <trace/events/migrate.h>
#include "internal.h"
@@ -247,7 +246,10 @@ static bool remove_migration_pte(struct folio *folio,
set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
}
if (vma->vm_flags & VM_LOCKED)
- mlock_page_drain(smp_processor_id());
+ mlock_page_drain_local();
+
+ trace_remove_migration_pte(pvmw.address, pte_val(pte),
+ compound_order(new));
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, pvmw.address, pvmw.pte);
diff --git a/mm/mlock.c b/mm/mlock.c
index efd2dd2943de..716caf851043 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -28,7 +28,14 @@
#include "internal.h"
-static DEFINE_PER_CPU(struct pagevec, mlock_pvec);
+struct mlock_pvec {
+ local_lock_t lock;
+ struct pagevec vec;
+};
+
+static DEFINE_PER_CPU(struct mlock_pvec, mlock_pvec) = {
+ .lock = INIT_LOCAL_LOCK(lock),
+};
bool can_do_mlock(void)
{
@@ -203,18 +210,30 @@ static void mlock_pagevec(struct pagevec *pvec)
pagevec_reinit(pvec);
}
-void mlock_page_drain(int cpu)
+void mlock_page_drain_local(void)
{
struct pagevec *pvec;
- pvec = &per_cpu(mlock_pvec, cpu);
+ local_lock(&mlock_pvec.lock);
+ pvec = this_cpu_ptr(&mlock_pvec.vec);
+ if (pagevec_count(pvec))
+ mlock_pagevec(pvec);
+ local_unlock(&mlock_pvec.lock);
+}
+
+void mlock_page_drain_remote(int cpu)
+{
+ struct pagevec *pvec;
+
+ WARN_ON_ONCE(cpu_online(cpu));
+ pvec = &per_cpu(mlock_pvec.vec, cpu);
if (pagevec_count(pvec))
mlock_pagevec(pvec);
}
bool need_mlock_page_drain(int cpu)
{
- return pagevec_count(&per_cpu(mlock_pvec, cpu));
+ return pagevec_count(&per_cpu(mlock_pvec.vec, cpu));
}
/**
@@ -223,7 +242,10 @@ bool need_mlock_page_drain(int cpu)
*/
void mlock_folio(struct folio *folio)
{
- struct pagevec *pvec = &get_cpu_var(mlock_pvec);
+ struct pagevec *pvec;
+
+ local_lock(&mlock_pvec.lock);
+ pvec = this_cpu_ptr(&mlock_pvec.vec);
if (!folio_test_set_mlocked(folio)) {
int nr_pages = folio_nr_pages(folio);
@@ -236,7 +258,7 @@ void mlock_folio(struct folio *folio)
if (!pagevec_add(pvec, mlock_lru(&folio->page)) ||
folio_test_large(folio) || lru_cache_disabled())
mlock_pagevec(pvec);
- put_cpu_var(mlock_pvec);
+ local_unlock(&mlock_pvec.lock);
}
/**
@@ -245,9 +267,11 @@ void mlock_folio(struct folio *folio)
*/
void mlock_new_page(struct page *page)
{
- struct pagevec *pvec = &get_cpu_var(mlock_pvec);
+ struct pagevec *pvec;
int nr_pages = thp_nr_pages(page);
+ local_lock(&mlock_pvec.lock);
+ pvec = this_cpu_ptr(&mlock_pvec.vec);
SetPageMlocked(page);
mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
__count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
@@ -256,7 +280,7 @@ void mlock_new_page(struct page *page)
if (!pagevec_add(pvec, mlock_new(page)) ||
PageHead(page) || lru_cache_disabled())
mlock_pagevec(pvec);
- put_cpu_var(mlock_pvec);
+ local_unlock(&mlock_pvec.lock);
}
/**
@@ -265,8 +289,10 @@ void mlock_new_page(struct page *page)
*/
void munlock_page(struct page *page)
{
- struct pagevec *pvec = &get_cpu_var(mlock_pvec);
+ struct pagevec *pvec;
+ local_lock(&mlock_pvec.lock);
+ pvec = this_cpu_ptr(&mlock_pvec.vec);
/*
* TestClearPageMlocked(page) must be left to __munlock_page(),
* which will check whether the page is multiply mlocked.
@@ -276,7 +302,7 @@ void munlock_page(struct page *page)
if (!pagevec_add(pvec, page) ||
PageHead(page) || lru_cache_disabled())
mlock_pagevec(pvec);
- put_cpu_var(mlock_pvec);
+ local_unlock(&mlock_pvec.lock);
}
static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
@@ -721,13 +747,12 @@ int user_shm_lock(size_t size, struct ucounts *ucounts)
locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
lock_limit = rlimit(RLIMIT_MEMLOCK);
- if (lock_limit == RLIM_INFINITY)
- allowed = 1;
- lock_limit >>= PAGE_SHIFT;
+ if (lock_limit != RLIM_INFINITY)
+ lock_limit >>= PAGE_SHIFT;
spin_lock(&shmlock_user_lock);
memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
- if (!allowed && (memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) {
+ if ((memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) {
dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
goto out;
}
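The mlock.c change above wraps the per-CPU pagevec in a local_lock so that PREEMPT_RT can turn the protection into a per-CPU spinlock, while non-RT kernels keep the cheap preemption-disable semantics of the old get_cpu_var()/put_cpu_var() pair. A minimal sketch of the same pattern with hypothetical names (my_batch, add_item and flush_batch are not from this patch):

#include <linux/kernel.h>
#include <linux/local_lock.h>
#include <linux/percpu.h>

/* Hypothetical batch type; mlock.c uses a struct pagevec instead. */
struct my_batch {
	local_lock_t lock;
	unsigned int nr;
	void *items[15];
};

static DEFINE_PER_CPU(struct my_batch, my_batch) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

static void flush_batch(struct my_batch *b)
{
	/* Consume b->items[0..nr) here. */
	b->nr = 0;
}

static void add_item(void *item)
{
	struct my_batch *b;

	/*
	 * On PREEMPT_RT this takes a per-CPU spinlock; otherwise it only
	 * disables preemption, much like get_cpu_var() used to.
	 */
	local_lock(&my_batch.lock);
	b = this_cpu_ptr(&my_batch);
	b->items[b->nr++] = item;
	if (b->nr == ARRAY_SIZE(b->items))
		flush_batch(b);
	local_unlock(&my_batch.lock);
}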
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 435c02630593..7e2da284e427 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2465,16 +2465,14 @@ static void folio_account_dirtied(struct folio *folio,
*
* Caller must hold lock_page_memcg().
*/
-void folio_account_cleaned(struct folio *folio, struct address_space *mapping,
- struct bdi_writeback *wb)
+void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb)
{
- if (mapping_can_writeback(mapping)) {
- long nr = folio_nr_pages(folio);
- lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
- zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
- wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
- task_io_account_cancelled_write(nr * PAGE_SIZE);
- }
+ long nr = folio_nr_pages(folio);
+
+ lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
+ zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
+ wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
+ task_io_account_cancelled_write(nr * PAGE_SIZE);
}
/*
@@ -2683,7 +2681,7 @@ void __folio_cancel_dirty(struct folio *folio)
wb = unlocked_inode_to_wb_begin(inode, &cookie);
if (folio_test_clear_dirty(folio))
- folio_account_cleaned(folio, mapping, wb);
+ folio_account_cleaned(folio, wb);
unlocked_inode_to_wb_end(inode, &cookie);
folio_memcg_unlock(folio);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6e0b4596cde9..2db95780e003 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -378,25 +378,9 @@ int page_group_by_mobility_disabled __read_mostly;
*/
static DEFINE_STATIC_KEY_TRUE(deferred_pages);
-/*
- * Calling kasan_poison_pages() only after deferred memory initialization
- * has completed. Poisoning pages during deferred memory init will greatly
- * lengthen the process and cause problem in large memory systems as the
- * deferred pages initialization is done with interrupt disabled.
- *
- * Assuming that there will be no reference to those newly initialized
- * pages before they are ever allocated, this should have no effect on
- * KASAN memory tracking as the poison will be properly inserted at page
- * allocation time. The only corner case is when pages are allocated by
- * on-demand allocation and then freed again before the deferred pages
- * initialization is done, but this is not likely to happen.
- */
-static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
+static inline bool deferred_pages_enabled(void)
{
- return static_branch_unlikely(&deferred_pages) ||
- (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
- (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
- PageSkipKASanPoison(page);
+ return static_branch_unlikely(&deferred_pages);
}
/* Returns true if the struct page for the pfn is uninitialised */
@@ -447,11 +431,9 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
#else
-static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
+static inline bool deferred_pages_enabled(void)
{
- return (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
- (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
- PageSkipKASanPoison(page);
+ return false;
}
static inline bool early_page_uninitialised(unsigned long pfn)
@@ -1126,6 +1108,9 @@ continue_merging:
buddy_pfn = __find_buddy_pfn(pfn, order);
buddy = page + (buddy_pfn - pfn);
+
+ if (!page_is_buddy(page, buddy, order))
+ goto done_merging;
buddy_mt = get_pageblock_migratetype(buddy);
if (migratetype != buddy_mt
@@ -1267,15 +1252,38 @@ out:
return ret;
}
-static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags)
+/*
+ * Skip KASAN memory poisoning when either:
+ *
+ * 1. Deferred memory initialization has not yet completed,
+ * see the explanation below.
+ * 2. Skipping poisoning is requested via FPI_SKIP_KASAN_POISON,
+ * see the comment next to it.
+ * 3. Skipping poisoning is requested via __GFP_SKIP_KASAN_POISON,
+ * see the comment next to it.
+ *
+ * Poisoning pages during deferred memory init will greatly lengthen the
+ * process and cause problems in large memory systems as the deferred pages
+ * initialization is done with interrupt disabled.
+ *
+ * Assuming that there will be no reference to those newly initialized
+ * pages before they are ever allocated, this should have no effect on
+ * KASAN memory tracking as the poison will be properly inserted at page
+ * allocation time. The only corner case is when pages are allocated by
+ * on-demand allocation and then freed again before the deferred pages
+ * initialization is done, but this is not likely to happen.
+ */
+static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
{
- int i;
+ return deferred_pages_enabled() ||
+ (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
+ (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
+ PageSkipKASanPoison(page);
+}
- if (zero_tags) {
- for (i = 0; i < numpages; i++)
- tag_clear_highpage(page + i);
- return;
- }
+static void kernel_init_free_pages(struct page *page, int numpages)
+{
+ int i;
/* s390's use of memset() could override KASAN redzones. */
kasan_disable_current();
@@ -1292,7 +1300,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
unsigned int order, bool check_free, fpi_t fpi_flags)
{
int bad = 0;
- bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
+ bool init = want_init_on_free();
VM_BUG_ON_PAGE(PageTail(page), page);
@@ -1359,23 +1367,21 @@ static __always_inline bool free_pages_prepare(struct page *page,
/*
* As memory initialization might be integrated into KASAN,
- * kasan_free_pages and kernel_init_free_pages must be
+ * KASAN poisoning and memory initialization code must be
* kept together to avoid discrepancies in behavior.
*
* With hardware tag-based KASAN, memory tags must be set before the
* page becomes unavailable via debug_pagealloc or arch_free_page.
*/
- if (kasan_has_integrated_init()) {
- if (!skip_kasan_poison)
- kasan_free_pages(page, order);
- } else {
- bool init = want_init_on_free();
+ if (!should_skip_kasan_poison(page, fpi_flags)) {
+ kasan_poison_pages(page, order, init);
- if (init)
- kernel_init_free_pages(page, 1 << order, false);
- if (!skip_kasan_poison)
- kasan_poison_pages(page, order, init);
+ /* Memory is already initialized if KASAN did it internally. */
+ if (kasan_has_integrated_init())
+ init = false;
}
+ if (init)
+ kernel_init_free_pages(page, 1 << order);
/*
* arch_free_page() can make the page's contents inaccessible. s390
@@ -2340,9 +2346,43 @@ static inline bool check_new_pcp(struct page *page, unsigned int order)
}
#endif /* CONFIG_DEBUG_VM */
+static inline bool should_skip_kasan_unpoison(gfp_t flags, bool init_tags)
+{
+ /* Don't skip if a software KASAN mode is enabled. */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
+ IS_ENABLED(CONFIG_KASAN_SW_TAGS))
+ return false;
+
+ /* Skip, if hardware tag-based KASAN is not enabled. */
+ if (!kasan_hw_tags_enabled())
+ return true;
+
+ /*
+ * With hardware tag-based KASAN enabled, skip if either:
+ *
+ * 1. Memory tags have already been cleared via tag_clear_highpage().
+ * 2. Skipping has been requested via __GFP_SKIP_KASAN_UNPOISON.
+ */
+ return init_tags || (flags & __GFP_SKIP_KASAN_UNPOISON);
+}
+
+static inline bool should_skip_init(gfp_t flags)
+{
+ /* Don't skip, if hardware tag-based KASAN is not enabled. */
+ if (!kasan_hw_tags_enabled())
+ return false;
+
+ /* For hardware tag-based KASAN, skip if requested. */
+ return (flags & __GFP_SKIP_ZERO);
+}
+
inline void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags)
{
+ bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
+ !should_skip_init(gfp_flags);
+ bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS);
+
set_page_private(page, 0);
set_page_refcounted(page);
@@ -2358,19 +2398,38 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
/*
* As memory initialization might be integrated into KASAN,
- * kasan_alloc_pages and kernel_init_free_pages must be
+	 * KASAN unpoisoning and memory initialization code must be
* kept together to avoid discrepancies in behavior.
*/
- if (kasan_has_integrated_init()) {
- kasan_alloc_pages(page, order, gfp_flags);
- } else {
- bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags);
+ /*
+	 * Zero the memory tags if requested; this happens only when memory
+	 * should be initialized as well.
+ */
+ if (init_tags) {
+ int i;
+
+ /* Initialize both memory and tags. */
+ for (i = 0; i != 1 << order; ++i)
+ tag_clear_highpage(page + i);
+
+ /* Note that memory is already initialized by the loop above. */
+ init = false;
+ }
+ if (!should_skip_kasan_unpoison(gfp_flags, init_tags)) {
+ /* Unpoison shadow memory or set memory tags. */
kasan_unpoison_pages(page, order, init);
- if (init)
- kernel_init_free_pages(page, 1 << order,
- gfp_flags & __GFP_ZEROTAGS);
+
+ /* Note that memory is already initialized by KASAN. */
+ if (kasan_has_integrated_init())
+ init = false;
}
+ /* If memory is still not initialized, do it now. */
+ if (init)
+ kernel_init_free_pages(page, 1 << order);
+ /* Propagate __GFP_SKIP_KASAN_POISON to page flags. */
+ if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON))
+ SetPageSkipKASanPoison(page);
set_page_owner(page, order, gfp_flags);
page_table_check_alloc(page, order);
@@ -8308,6 +8367,7 @@ static int page_alloc_cpu_dead(unsigned int cpu)
struct zone *zone;
lru_add_drain_cpu(cpu);
+ mlock_page_drain_remote(cpu);
drain_pages(cpu);
/*
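The reworked free and allocation hooks above separate three decisions: clearing memory tags, KASAN (un)poisoning, and zeroing the memory itself. A condensed, illustrative restatement of when post_alloc_hook() still has to call kernel_init_free_pages() explicitly; the helper below does not exist in the patch and reuses the static should_skip_init()/should_skip_kasan_unpoison() helpers added above:

/* Illustrative only: condensed view of the init decisions in post_alloc_hook(). */
static bool page_needs_manual_init(gfp_t gfp, bool *init_tags)
{
	bool init = !want_init_on_free() && want_init_on_alloc(gfp) &&
		    !should_skip_init(gfp);

	*init_tags = init && (gfp & __GFP_ZEROTAGS);
	if (*init_tags)
		return false;	/* tag_clear_highpage() zeroes the memory too */
	if (!should_skip_kasan_unpoison(gfp, *init_tags) &&
	    kasan_has_integrated_init())
		return false;	/* integrated KASAN init zeroes on unpoison */
	return init;
}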
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 99e360df9465..fb3a05fdebdb 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -10,6 +10,7 @@
#include <linux/migrate.h>
#include <linux/stackdepot.h>
#include <linux/seq_file.h>
+#include <linux/memcontrol.h>
#include <linux/sched/clock.h>
#include "internal.h"
@@ -28,7 +29,9 @@ struct page_owner {
depot_stack_handle_t free_handle;
u64 ts_nsec;
u64 free_ts_nsec;
+ char comm[TASK_COMM_LEN];
pid_t pid;
+ pid_t tgid;
};
static bool page_owner_enabled = false;
@@ -163,7 +166,10 @@ static inline void __set_page_owner_handle(struct page_ext *page_ext,
page_owner->gfp_mask = gfp_mask;
page_owner->last_migrate_reason = -1;
page_owner->pid = current->pid;
+ page_owner->tgid = current->tgid;
page_owner->ts_nsec = local_clock();
+ strlcpy(page_owner->comm, current->comm,
+ sizeof(page_owner->comm));
__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
__set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
@@ -229,8 +235,10 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old)
old_page_owner->last_migrate_reason;
new_page_owner->handle = old_page_owner->handle;
new_page_owner->pid = old_page_owner->pid;
+ new_page_owner->tgid = old_page_owner->tgid;
new_page_owner->ts_nsec = old_page_owner->ts_nsec;
new_page_owner->free_ts_nsec = old_page_owner->ts_nsec;
+ strcpy(new_page_owner->comm, old_page_owner->comm);
/*
* We don't clear the bit on the old folio as it's going to be freed
@@ -325,6 +333,45 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
seq_putc(m, '\n');
}
+/*
+ * Look up memcg information and print it out.
+ */
+static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret,
+ struct page *page)
+{
+#ifdef CONFIG_MEMCG
+ unsigned long memcg_data;
+ struct mem_cgroup *memcg;
+ bool online;
+ char name[80];
+
+ rcu_read_lock();
+ memcg_data = READ_ONCE(page->memcg_data);
+ if (!memcg_data)
+ goto out_unlock;
+
+ if (memcg_data & MEMCG_DATA_OBJCGS)
+ ret += scnprintf(kbuf + ret, count - ret,
+ "Slab cache page\n");
+
+ memcg = page_memcg_check(page);
+ if (!memcg)
+ goto out_unlock;
+
+ online = (memcg->css.flags & CSS_ONLINE);
+ cgroup_name(memcg->css.cgroup, name, sizeof(name));
+ ret += scnprintf(kbuf + ret, count - ret,
+ "Charged %sto %smemcg %s\n",
+ PageMemcgKmem(page) ? "(via objcg) " : "",
+ online ? "" : "offline ",
+ name);
+out_unlock:
+ rcu_read_unlock();
+#endif /* CONFIG_MEMCG */
+
+ return ret;
+}
+
static ssize_t
print_page_owner(char __user *buf, size_t count, unsigned long pfn,
struct page *page, struct page_owner *page_owner,
@@ -338,19 +385,17 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
if (!kbuf)
return -ENOMEM;
- ret = snprintf(kbuf, count,
- "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns, free_ts %llu ns\n",
+ ret = scnprintf(kbuf, count,
+ "Page allocated via order %u, mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu ns, free_ts %llu ns\n",
page_owner->order, page_owner->gfp_mask,
&page_owner->gfp_mask, page_owner->pid,
+ page_owner->tgid, page_owner->comm,
page_owner->ts_nsec, page_owner->free_ts_nsec);
- if (ret >= count)
- goto err;
-
/* Print information relevant to grouping pages by mobility */
pageblock_mt = get_pageblock_migratetype(page);
page_mt = gfp_migratetype(page_owner->gfp_mask);
- ret += snprintf(kbuf + ret, count - ret,
+ ret += scnprintf(kbuf + ret, count - ret,
"PFN %lu type %s Block %lu type %s Flags %pGp\n",
pfn,
migratetype_names[page_mt],
@@ -358,21 +403,18 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
migratetype_names[pageblock_mt],
&page->flags);
- if (ret >= count)
- goto err;
-
ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0);
if (ret >= count)
goto err;
if (page_owner->last_migrate_reason != -1) {
- ret += snprintf(kbuf + ret, count - ret,
+ ret += scnprintf(kbuf + ret, count - ret,
"Page has been migrated, last migrate reason: %s\n",
migrate_reason_names[page_owner->last_migrate_reason]);
- if (ret >= count)
- goto err;
}
+ ret = print_page_owner_memcg(kbuf, count, ret, page);
+
ret += snprintf(kbuf + ret, count - ret, "\n");
if (ret >= count)
goto err;
@@ -415,9 +457,10 @@ void __dump_page_owner(const struct page *page)
else
pr_alert("page_owner tracks the page as freed\n");
- pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu, free_ts %llu\n",
+ pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu, free_ts %llu\n",
page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask,
- page_owner->pid, page_owner->ts_nsec, page_owner->free_ts_nsec);
+ page_owner->pid, page_owner->tgid, page_owner->comm,
+ page_owner->ts_nsec, page_owner->free_ts_nsec);
handle = READ_ONCE(page_owner->handle);
if (!handle)
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
index c6bd092ff7a3..dd3590dfc23d 100644
--- a/mm/percpu-stats.c
+++ b/mm/percpu-stats.c
@@ -144,7 +144,7 @@ alloc_buffer:
spin_unlock_irq(&pcpu_lock);
/* there can be at most this many free and allocated fragments */
- buffer = vmalloc(array_size(sizeof(int), (2 * max_nr_alloc + 1)));
+ buffer = vmalloc_array(2 * max_nr_alloc + 1, sizeof(int));
if (!buffer)
return -ENOMEM;
diff --git a/mm/readahead.c b/mm/readahead.c
index d3a47546d17d..8e3775829513 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -13,29 +13,29 @@
*
* Readahead is used to read content into the page cache before it is
* explicitly requested by the application. Readahead only ever
- * attempts to read pages that are not yet in the page cache. If a
- * page is present but not up-to-date, readahead will not try to read
+ * attempts to read folios that are not yet in the page cache. If a
+ * folio is present but not up-to-date, readahead will not try to read
* it. In that case a simple ->readpage() will be requested.
*
* Readahead is triggered when an application read request (whether a
- * systemcall or a page fault) finds that the requested page is not in
+ * system call or a page fault) finds that the requested folio is not in
* the page cache, or that it is in the page cache and has the
- * %PG_readahead flag set. This flag indicates that the page was loaded
- * as part of a previous read-ahead request and now that it has been
- * accessed, it is time for the next read-ahead.
+ * readahead flag set. This flag indicates that the folio was read
+ * as part of a previous readahead request and now that it has been
+ * accessed, it is time for the next readahead.
*
* Each readahead request is partly synchronous read, and partly async
- * read-ahead. This is reflected in the struct file_ra_state which
- * contains ->size being to total number of pages, and ->async_size
- * which is the number of pages in the async section. The first page in
- * this async section will have %PG_readahead set as a trigger for a
- * subsequent read ahead. Once a series of sequential reads has been
+ * readahead. This is reflected in the struct file_ra_state which
+ * contains ->size being the total number of pages, and ->async_size
+ * which is the number of pages in the async section. The readahead
+ * flag will be set on the first folio in this async section to trigger
+ * a subsequent readahead. Once a series of sequential reads has been
* established, there should be no need for a synchronous component and
- * all read ahead request will be fully asynchronous.
+ * all readahead requests will be fully asynchronous.
*
- * When either of the triggers causes a readahead, three numbers need to
- * be determined: the start of the region, the size of the region, and
- * the size of the async tail.
+ * When either of the triggers causes a readahead, three numbers need
+ * to be determined: the start of the region to read, the size of the
+ * region, and the size of the async tail.
*
* The start of the region is simply the first page address at or after
* the accessed address, which is not currently populated in the page
@@ -45,14 +45,14 @@
* was explicitly requested from the determined request size, unless
* this would be less than zero - then zero is used. NOTE THIS
* CALCULATION IS WRONG WHEN THE START OF THE REGION IS NOT THE ACCESSED
- * PAGE.
+ * PAGE. ALSO THIS CALCULATION IS NOT USED CONSISTENTLY.
*
* The size of the region is normally determined from the size of the
* previous readahead which loaded the preceding pages. This may be
* discovered from the struct file_ra_state for simple sequential reads,
* or from examining the state of the page cache when multiple
* sequential reads are interleaved. Specifically: where the readahead
- * was triggered by the %PG_readahead flag, the size of the previous
+ * was triggered by the readahead flag, the size of the previous
* readahead is assumed to be the number of pages from the triggering
* page to the start of the new readahead. In these cases, the size of
* the previous readahead is scaled, often doubled, for the new
@@ -65,52 +65,52 @@
* larger than the current request, and it is not scaled up, unless it
* is at the start of file.
*
- * In general read ahead is accelerated at the start of the file, as
+ * In general readahead is accelerated at the start of the file, as
* reads from there are often sequential. There are other minor
- * adjustments to the read ahead size in various special cases and these
+ * adjustments to the readahead size in various special cases and these
* are best discovered by reading the code.
*
- * The above calculation determines the readahead, to which any requested
- * read size may be added.
+ * The above calculation, based on the previous readahead size,
+ * determines the size of the readahead, to which any requested read
+ * size may be added.
*
* Readahead requests are sent to the filesystem using the ->readahead()
* address space operation, for which mpage_readahead() is a canonical
* implementation. ->readahead() should normally initiate reads on all
- * pages, but may fail to read any or all pages without causing an IO
+ * folios, but may fail to read any or all folios without causing an I/O
* error. The page cache reading code will issue a ->readpage() request
- * for any page which ->readahead() does not provided, and only an error
+ * for any folio which ->readahead() did not read, and only an error
* from this will be final.
*
- * ->readahead() will generally call readahead_page() repeatedly to get
- * each page from those prepared for read ahead. It may fail to read a
- * page by:
+ * ->readahead() will generally call readahead_folio() repeatedly to get
+ * each folio from those prepared for readahead. It may fail to read a
+ * folio by:
*
- * * not calling readahead_page() sufficiently many times, effectively
- * ignoring some pages, as might be appropriate if the path to
+ * * not calling readahead_folio() sufficiently many times, effectively
+ * ignoring some folios, as might be appropriate if the path to
* storage is congested.
*
- * * failing to actually submit a read request for a given page,
+ * * failing to actually submit a read request for a given folio,
* possibly due to insufficient resources, or
*
* * getting an error during subsequent processing of a request.
*
- * In the last two cases, the page should be unlocked to indicate that
- * the read attempt has failed. In the first case the page will be
- * unlocked by the caller.
+ * In the last two cases, the folio should be unlocked by the filesystem
+ * to indicate that the read attempt has failed. In the first case the
+ * folio will be unlocked by the VFS.
*
- * Those pages not in the final ``async_size`` of the request should be
+ * Those folios not in the final ``async_size`` of the request should be
* considered to be important and ->readahead() should not fail them due
* to congestion or temporary resource unavailability, but should wait
* for necessary resources (e.g. memory or indexing information) to
- * become available. Pages in the final ``async_size`` may be
+ * become available. Folios in the final ``async_size`` may be
* considered less urgent and failure to read them is more acceptable.
- * In this case it is best to use delete_from_page_cache() to remove the
- * pages from the page cache as is automatically done for pages that
- * were not fetched with readahead_page(). This will allow a
- * subsequent synchronous read ahead request to try them again. If they
+ * In this case it is best to use filemap_remove_folio() to remove the
+ * folios from the page cache as is automatically done for folios that
+ * were not fetched with readahead_folio(). This will allow a
+ * subsequent synchronous readahead request to try them again. If they
* are left in the page cache, then they will be read individually using
- * ->readpage().
- *
+ * ->readpage() which may be less efficient.
*/
#include <linux/kernel.h>
@@ -142,91 +142,14 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
}
EXPORT_SYMBOL_GPL(file_ra_state_init);
-/*
- * see if a page needs releasing upon read_cache_pages() failure
- * - the caller of read_cache_pages() may have set PG_private or PG_fscache
- * before calling, such as the NFS fs marking pages that are cached locally
- * on disk, thus we need to give the fs a chance to clean up in the event of
- * an error
- */
-static void read_cache_pages_invalidate_page(struct address_space *mapping,
- struct page *page)
-{
- if (page_has_private(page)) {
- if (!trylock_page(page))
- BUG();
- page->mapping = mapping;
- folio_invalidate(page_folio(page), 0, PAGE_SIZE);
- page->mapping = NULL;
- unlock_page(page);
- }
- put_page(page);
-}
-
-/*
- * release a list of pages, invalidating them first if need be
- */
-static void read_cache_pages_invalidate_pages(struct address_space *mapping,
- struct list_head *pages)
-{
- struct page *victim;
-
- while (!list_empty(pages)) {
- victim = lru_to_page(pages);
- list_del(&victim->lru);
- read_cache_pages_invalidate_page(mapping, victim);
- }
-}
-
-/**
- * read_cache_pages - populate an address space with some pages & start reads against them
- * @mapping: the address_space
- * @pages: The address of a list_head which contains the target pages. These
- * pages have their ->index populated and are otherwise uninitialised.
- * @filler: callback routine for filling a single page.
- * @data: private data for the callback routine.
- *
- * Hides the details of the LRU cache etc from the filesystems.
- *
- * Returns: %0 on success, error return by @filler otherwise
- */
-int read_cache_pages(struct address_space *mapping, struct list_head *pages,
- int (*filler)(void *, struct page *), void *data)
-{
- struct page *page;
- int ret = 0;
-
- while (!list_empty(pages)) {
- page = lru_to_page(pages);
- list_del(&page->lru);
- if (add_to_page_cache_lru(page, mapping, page->index,
- readahead_gfp_mask(mapping))) {
- read_cache_pages_invalidate_page(mapping, page);
- continue;
- }
- put_page(page);
-
- ret = filler(data, page);
- if (unlikely(ret)) {
- read_cache_pages_invalidate_pages(mapping, pages);
- break;
- }
- task_io_account_read(PAGE_SIZE);
- }
- return ret;
-}
-
-EXPORT_SYMBOL(read_cache_pages);
-
-static void read_pages(struct readahead_control *rac, struct list_head *pages,
- bool skip_page)
+static void read_pages(struct readahead_control *rac)
{
const struct address_space_operations *aops = rac->mapping->a_ops;
struct page *page;
struct blk_plug plug;
if (!readahead_count(rac))
- goto out;
+ return;
blk_start_plug(&plug);
@@ -234,7 +157,7 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages,
aops->readahead(rac);
/*
* Clean up the remaining pages. The sizes in ->ra
- * maybe be used to size next read-ahead, so make sure
+ * may be used to size the next readahead, so make sure
* they accurately reflect what happened.
*/
while ((page = readahead_page(rac))) {
@@ -246,13 +169,6 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages,
unlock_page(page);
put_page(page);
}
- } else if (aops->readpages) {
- aops->readpages(rac->file, rac->mapping, pages,
- readahead_count(rac));
- /* Clean up the remaining pages */
- put_pages_list(pages);
- rac->_index += rac->_nr_pages;
- rac->_nr_pages = 0;
} else {
while ((page = readahead_page(rac))) {
aops->readpage(rac->file, page);
@@ -262,12 +178,7 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages,
blk_finish_plug(&plug);
- BUG_ON(pages && !list_empty(pages));
BUG_ON(readahead_count(rac));
-
-out:
- if (skip_page)
- rac->_index++;
}
/**
@@ -289,7 +200,6 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
{
struct address_space *mapping = ractl->mapping;
unsigned long index = readahead_index(ractl);
- LIST_HEAD(page_pool);
gfp_t gfp_mask = readahead_gfp_mask(mapping);
unsigned long i;
@@ -321,7 +231,8 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
* have a stable reference to this page, and it's
* not worth getting one just for that.
*/
- read_pages(ractl, &page_pool, true);
+ read_pages(ractl);
+ ractl->_index++;
i = ractl->_index + ractl->_nr_pages - index - 1;
continue;
}
@@ -329,13 +240,11 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
folio = filemap_alloc_folio(gfp_mask, 0);
if (!folio)
break;
- if (mapping->a_ops->readpages) {
- folio->index = index + i;
- list_add(&folio->lru, &page_pool);
- } else if (filemap_add_folio(mapping, folio, index + i,
+ if (filemap_add_folio(mapping, folio, index + i,
gfp_mask) < 0) {
folio_put(folio);
- read_pages(ractl, &page_pool, true);
+ read_pages(ractl);
+ ractl->_index++;
i = ractl->_index + ractl->_nr_pages - index - 1;
continue;
}
@@ -349,7 +258,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
* uptodate then the caller will launch readpage again, and
* will then handle the error.
*/
- read_pages(ractl, &page_pool, false);
+ read_pages(ractl);
filemap_invalidate_unlock_shared(mapping);
memalloc_nofs_restore(nofs);
}
@@ -394,8 +303,7 @@ void force_page_cache_ra(struct readahead_control *ractl,
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
unsigned long max_pages, index;
- if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages &&
- !mapping->a_ops->readahead))
+ if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readahead))
return;
/*
@@ -512,7 +420,7 @@ static pgoff_t count_history_pages(struct address_space *mapping,
}
/*
- * page cache context based read-ahead
+ * page cache context based readahead
*/
static int try_context_readahead(struct address_space *mapping,
struct file_ra_state *ra,
@@ -624,7 +532,7 @@ void page_cache_ra_order(struct readahead_control *ractl,
ra->async_size += index - limit - 1;
}
- read_pages(ractl, NULL, false);
+ read_pages(ractl);
/*
* If there were already pages in the page cache, then we may have
@@ -763,9 +671,9 @@ void page_cache_sync_ra(struct readahead_control *ractl,
bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);
/*
- * Even if read-ahead is disabled, issue this request as read-ahead
+ * Even if readahead is disabled, issue this request as readahead
* as we'll need it to satisfy the requested range. The forced
- * read-ahead will do the right thing and limit the read to just the
+ * readahead will do the right thing and limit the read to just the
* requested range, which we'll set to 1 page for this case.
*/
if (!ractl->ra->ra_pages || blk_cgroup_congested()) {
@@ -781,7 +689,6 @@ void page_cache_sync_ra(struct readahead_control *ractl,
return;
}
- /* do read-ahead */
ondemand_readahead(ractl, NULL, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_sync_ra);
@@ -789,7 +696,7 @@ EXPORT_SYMBOL_GPL(page_cache_sync_ra);
void page_cache_async_ra(struct readahead_control *ractl,
struct folio *folio, unsigned long req_count)
{
- /* no read-ahead */
+ /* no readahead */
if (!ractl->ra->ra_pages)
return;
@@ -804,7 +711,6 @@ void page_cache_async_ra(struct readahead_control *ractl,
if (blk_cgroup_congested())
return;
- /* do read-ahead */
ondemand_readahead(ractl, folio, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_async_ra);
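The rewritten documentation above describes ->readahead() in terms of readahead_folio(). A minimal sketch of a conforming implementation, assuming a filesystem-specific my_read_folio_async() helper that submits the I/O and unlocks the folio on completion:

static void myfs_readahead(struct readahead_control *rac)
{
	struct folio *folio;

	while ((folio = readahead_folio(rac))) {
		/*
		 * readahead_folio() has already consumed the page-cache
		 * reference; the folio is locked and must be unlocked once
		 * the read completes or submission fails.
		 */
		if (my_read_folio_async(rac->file, folio) < 0)
			folio_unlock(folio);	/* retried later via ->readpage() */
	}
}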
diff --git a/mm/rmap.c b/mm/rmap.c
index 615b5d323ee2..fedb82371efe 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -76,7 +76,9 @@
#include <asm/tlbflush.h>
+#define CREATE_TRACE_POINTS
#include <trace/events/tlb.h>
+#include <trace/events/migrate.h>
#include "internal.h"
@@ -1236,14 +1238,14 @@ void page_add_new_anon_rmap(struct page *page,
void page_add_file_rmap(struct page *page,
struct vm_area_struct *vma, bool compound)
{
- int i, nr = 1;
+ int i, nr = 0;
VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
lock_page_memcg(page);
if (compound && PageTransHuge(page)) {
int nr_pages = thp_nr_pages(page);
- for (i = 0, nr = 0; i < nr_pages; i++) {
+ for (i = 0; i < nr_pages; i++) {
if (atomic_inc_and_test(&page[i]._mapcount))
nr++;
}
@@ -1271,11 +1273,12 @@ void page_add_file_rmap(struct page *page,
VM_WARN_ON_ONCE(!PageLocked(page));
SetPageDoubleMap(compound_head(page));
}
- if (!atomic_inc_and_test(&page->_mapcount))
- goto out;
+ if (atomic_inc_and_test(&page->_mapcount))
+ nr++;
}
- __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
out:
+ if (nr)
+ __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
unlock_page_memcg(page);
mlock_vma_page(page, vma, compound);
@@ -1283,7 +1286,7 @@ out:
static void page_remove_file_rmap(struct page *page, bool compound)
{
- int i, nr = 1;
+ int i, nr = 0;
VM_BUG_ON_PAGE(compound && !PageHead(page), page);
@@ -1298,12 +1301,12 @@ static void page_remove_file_rmap(struct page *page, bool compound)
if (compound && PageTransHuge(page)) {
int nr_pages = thp_nr_pages(page);
- for (i = 0, nr = 0; i < nr_pages; i++) {
+ for (i = 0; i < nr_pages; i++) {
if (atomic_add_negative(-1, &page[i]._mapcount))
nr++;
}
if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
- return;
+ goto out;
if (PageSwapBacked(page))
__mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
-nr_pages);
@@ -1311,16 +1314,12 @@ static void page_remove_file_rmap(struct page *page, bool compound)
__mod_lruvec_page_state(page, NR_FILE_PMDMAPPED,
-nr_pages);
} else {
- if (!atomic_add_negative(-1, &page->_mapcount))
- return;
+ if (atomic_add_negative(-1, &page->_mapcount))
+ nr++;
}
-
- /*
- * We use the irq-unsafe __{inc|mod}_lruvec_page_state because
- * these counters are not modified in interrupt context, and
- * pte lock(a spinlock) is held, which implies preemption disabled.
- */
- __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr);
+out:
+ if (nr)
+ __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr);
}
static void page_remove_anon_compound_rmap(struct page *page)
@@ -1589,7 +1588,30 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
/* MADV_FREE page check */
if (!folio_test_swapbacked(folio)) {
- if (!folio_test_dirty(folio)) {
+ int ref_count, map_count;
+
+ /*
+ * Synchronize with gup_pte_range():
+ * - clear PTE; barrier; read refcount
+ * - inc refcount; barrier; read PTE
+ */
+ smp_mb();
+
+ ref_count = folio_ref_count(folio);
+ map_count = folio_mapcount(folio);
+
+ /*
+ * Order reads for page refcount and dirty flag
+ * (see comments in __remove_mapping()).
+ */
+ smp_rmb();
+
+ /*
+ * The only page refs must be one from isolation
+ * plus the rmap(s) (dropped by discard:).
+ */
+ if (ref_count == 1 + map_count &&
+ !folio_test_dirty(folio)) {
/* Invalidate as we cleared the pte */
mmu_notifier_invalidate_range(mm,
address, address + PAGE_SIZE);
@@ -1661,7 +1683,7 @@ discard:
*/
page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
if (vma->vm_flags & VM_LOCKED)
- mlock_page_drain(smp_processor_id());
+ mlock_page_drain_local();
folio_put(folio);
}
@@ -1852,6 +1874,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
if (pte_swp_uffd_wp(pteval))
swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
+ trace_set_migration_pte(pvmw.address, pte_val(swp_pte),
+ compound_order(&folio->page));
/*
* No need to invalidate here it will synchronize on
* against the special swap migration pte.
@@ -1920,6 +1944,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
if (pte_uffd_wp(pteval))
swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, address, pvmw.pte, swp_pte);
+ trace_set_migration_pte(address, pte_val(swp_pte),
+ compound_order(&folio->page));
/*
* No need to invalidate here it will synchronize on
* against the special swap migration pte.
@@ -1935,7 +1961,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
*/
page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
if (vma->vm_flags & VM_LOCKED)
- mlock_page_drain(smp_processor_id());
+ mlock_page_drain_local();
folio_put(folio);
}
diff --git a/mm/slab.c b/mm/slab.c
index d9dec7a8fd79..b04e40078bdf 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3422,6 +3422,7 @@ static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp,
if (is_kfence_address(objp)) {
kmemleak_free_recursive(objp, cachep->flags);
+ memcg_slab_free_hook(cachep, &objp, 1);
__kfence_free(objp);
return;
}
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 23f2ab0713b7..6ee64d6208b3 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -807,7 +807,7 @@ void __init setup_kmalloc_cache_index_table(void)
unsigned int i;
BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
- (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
+ !is_power_of_2(KMALLOC_MIN_SIZE));
for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
unsigned int elem = size_index_elem(i);
diff --git a/mm/slob.c b/mm/slob.c
index 8a8795520361..dfa6808dff36 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -714,7 +714,7 @@ int __kmem_cache_shrink(struct kmem_cache *d)
return 0;
}
-struct kmem_cache kmem_cache_boot = {
+static struct kmem_cache kmem_cache_boot = {
.name = "kmem_cache",
.size = sizeof(struct kmem_cache),
.flags = SLAB_PANIC,
diff --git a/mm/slub.c b/mm/slub.c
index 07cdd999c3fe..74d92aa4a3a2 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1788,8 +1788,8 @@ static void *setup_object(struct kmem_cache *s, struct slab *slab,
/*
* Slab allocation and freeing
*/
-static inline struct slab *alloc_slab_page(struct kmem_cache *s,
- gfp_t flags, int node, struct kmem_cache_order_objects oo)
+static inline struct slab *alloc_slab_page(gfp_t flags, int node,
+ struct kmem_cache_order_objects oo)
{
struct folio *folio;
struct slab *slab;
@@ -1941,7 +1941,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL);
- slab = alloc_slab_page(s, alloc_gfp, node, oo);
+ slab = alloc_slab_page(alloc_gfp, node, oo);
if (unlikely(!slab)) {
oo = s->min;
alloc_gfp = flags;
@@ -1949,7 +1949,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
* Allocation may have failed due to fragmentation.
* Try a lower order alloc if possible
*/
- slab = alloc_slab_page(s, alloc_gfp, node, oo);
+ slab = alloc_slab_page(alloc_gfp, node, oo);
if (unlikely(!slab))
goto out;
stat(s, ORDER_FALLBACK);
@@ -2348,10 +2348,10 @@ static void init_kmem_cache_cpus(struct kmem_cache *s)
static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
void *freelist)
{
- enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
+ enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE, M_FULL_NOLIST };
struct kmem_cache_node *n = get_node(s, slab_nid(slab));
- int lock = 0, free_delta = 0;
- enum slab_modes l = M_NONE, m = M_NONE;
+ int free_delta = 0;
+ enum slab_modes mode = M_NONE;
void *nextfree, *freelist_iter, *freelist_tail;
int tail = DEACTIVATE_TO_HEAD;
unsigned long flags = 0;
@@ -2393,14 +2393,10 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
* Ensure that the slab is unfrozen while the list presence
* reflects the actual number of objects during unfreeze.
*
- * We setup the list membership and then perform a cmpxchg
- * with the count. If there is a mismatch then the slab
- * is not unfrozen but the slab is on the wrong list.
- *
- * Then we restart the process which may have to remove
- * the slab from the list that we just put it on again
- * because the number of objects in the slab may have
- * changed.
+	 * We first perform the cmpxchg while holding the lock, and insert
+	 * into the list only when it succeeds. If there is a mismatch, the
+	 * slab is not unfrozen and the number of objects in it may have
+	 * changed; in that case, release the lock and retry the cmpxchg.
*/
redo:
@@ -2419,61 +2415,52 @@ redo:
new.frozen = 0;
- if (!new.inuse && n->nr_partial >= s->min_partial)
- m = M_FREE;
- else if (new.freelist) {
- m = M_PARTIAL;
- if (!lock) {
- lock = 1;
- /*
- * Taking the spinlock removes the possibility that
- * acquire_slab() will see a slab that is frozen
- */
- spin_lock_irqsave(&n->list_lock, flags);
- }
+ if (!new.inuse && n->nr_partial >= s->min_partial) {
+ mode = M_FREE;
+ } else if (new.freelist) {
+ mode = M_PARTIAL;
+ /*
+ * Taking the spinlock removes the possibility that
+ * acquire_slab() will see a slab that is frozen
+ */
+ spin_lock_irqsave(&n->list_lock, flags);
+ } else if (kmem_cache_debug_flags(s, SLAB_STORE_USER)) {
+ mode = M_FULL;
+ /*
+ * This also ensures that the scanning of full
+		 * slabs by diagnostic functions will not see
+ * any frozen slabs.
+ */
+ spin_lock_irqsave(&n->list_lock, flags);
} else {
- m = M_FULL;
- if (kmem_cache_debug_flags(s, SLAB_STORE_USER) && !lock) {
- lock = 1;
- /*
- * This also ensures that the scanning of full
- * slabs from diagnostic functions will not see
- * any frozen slabs.
- */
- spin_lock_irqsave(&n->list_lock, flags);
- }
+ mode = M_FULL_NOLIST;
}
- if (l != m) {
- if (l == M_PARTIAL)
- remove_partial(n, slab);
- else if (l == M_FULL)
- remove_full(s, n, slab);
-
- if (m == M_PARTIAL)
- add_partial(n, slab, tail);
- else if (m == M_FULL)
- add_full(s, n, slab);
- }
- l = m;
if (!cmpxchg_double_slab(s, slab,
old.freelist, old.counters,
new.freelist, new.counters,
- "unfreezing slab"))
+ "unfreezing slab")) {
+ if (mode == M_PARTIAL || mode == M_FULL)
+ spin_unlock_irqrestore(&n->list_lock, flags);
goto redo;
+ }
- if (lock)
- spin_unlock_irqrestore(&n->list_lock, flags);
- if (m == M_PARTIAL)
+ if (mode == M_PARTIAL) {
+ add_partial(n, slab, tail);
+ spin_unlock_irqrestore(&n->list_lock, flags);
stat(s, tail);
- else if (m == M_FULL)
- stat(s, DEACTIVATE_FULL);
- else if (m == M_FREE) {
+ } else if (mode == M_FREE) {
stat(s, DEACTIVATE_EMPTY);
discard_slab(s, slab);
stat(s, FREE_SLAB);
+ } else if (mode == M_FULL) {
+ add_full(s, n, slab);
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ stat(s, DEACTIVATE_FULL);
+ } else if (mode == M_FULL_NOLIST) {
+ stat(s, DEACTIVATE_FULL);
}
}
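
In outline, the reworked unfreeze path above now decides the target list first, takes n->list_lock only when a list insertion will be needed, performs the cmpxchg, and only then touches the list. A condensed sketch (identifiers taken from the hunk; not a drop-in replacement):

	redo:
		if (!new.inuse && n->nr_partial >= s->min_partial) {
			mode = M_FREE;			/* slab will be discarded */
		} else if (new.freelist) {
			mode = M_PARTIAL;
			spin_lock_irqsave(&n->list_lock, flags);
		} else if (kmem_cache_debug_flags(s, SLAB_STORE_USER)) {
			mode = M_FULL;			/* tracked on the full list */
			spin_lock_irqsave(&n->list_lock, flags);
		} else {
			mode = M_FULL_NOLIST;		/* full, but not list-tracked */
		}

		if (!cmpxchg_double_slab(s, slab, old.freelist, old.counters,
					 new.freelist, new.counters,
					 "unfreezing slab")) {
			if (mode == M_PARTIAL || mode == M_FULL)
				spin_unlock_irqrestore(&n->list_lock, flags);
			goto redo;			/* counters changed, retry */
		}

		/* Success: add to the chosen list (if any), unlock, update stats. */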
@@ -4014,15 +4001,6 @@ static int init_kmem_cache_nodes(struct kmem_cache *s)
return 1;
}
-static void set_min_partial(struct kmem_cache *s, unsigned long min)
-{
- if (min < MIN_PARTIAL)
- min = MIN_PARTIAL;
- else if (min > MAX_PARTIAL)
- min = MAX_PARTIAL;
- s->min_partial = min;
-}
-
static void set_cpu_partial(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_CPU_PARTIAL
@@ -4060,7 +4038,7 @@ static void set_cpu_partial(struct kmem_cache *s)
* calculate_sizes() determines the order and the distribution of data within
* a slab object.
*/
-static int calculate_sizes(struct kmem_cache *s, int forced_order)
+static int calculate_sizes(struct kmem_cache *s)
{
slab_flags_t flags = s->flags;
unsigned int size = s->object_size;
@@ -4164,10 +4142,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
size = ALIGN(size, s->align);
s->size = size;
s->reciprocal_size = reciprocal_value(size);
- if (forced_order >= 0)
- order = forced_order;
- else
- order = calculate_order(size);
+ order = calculate_order(size);
if ((int)order < 0)
return 0;
@@ -4203,7 +4178,7 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
s->random = get_random_long();
#endif
- if (!calculate_sizes(s, -1))
+ if (!calculate_sizes(s))
goto error;
if (disable_higher_order_debug) {
/*
@@ -4213,7 +4188,7 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
if (get_order(s->size) > get_order(s->object_size)) {
s->flags &= ~DEBUG_METADATA_FLAGS;
s->offset = 0;
- if (!calculate_sizes(s, -1))
+ if (!calculate_sizes(s))
goto error;
}
}
@@ -4229,7 +4204,8 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
* The larger the object size is, the more slabs we want on the partial
* list to avoid pounding the page allocator excessively.
*/
- set_min_partial(s, ilog2(s->size) / 2);
+ s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2);
+ s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial);
set_cpu_partial(s);
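
With the open-coded clamp above and SLUB's historical bounds of MIN_PARTIAL == 5 and MAX_PARTIAL == 10 (values assumed here; both are defined in mm/slub.c), a 32-byte cache computes ilog2(32) / 2 == 2 and is raised to 5, while a 4 MiB cache computes 22 / 2 == 11 and is capped at 10. The same result could also be written with the generic clamp helper from <linux/minmax.h>, e.g.:

	s->min_partial = clamp_t(unsigned long, ilog2(s->size) / 2,
				 MIN_PARTIAL, MAX_PARTIAL);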
@@ -5358,12 +5334,10 @@ struct slab_attribute {
};
#define SLAB_ATTR_RO(_name) \
- static struct slab_attribute _name##_attr = \
- __ATTR(_name, 0400, _name##_show, NULL)
+ static struct slab_attribute _name##_attr = __ATTR_RO_MODE(_name, 0400)
#define SLAB_ATTR(_name) \
- static struct slab_attribute _name##_attr = \
- __ATTR(_name, 0600, _name##_show, _name##_store)
+ static struct slab_attribute _name##_attr = __ATTR_RW_MODE(_name, 0600)
static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
{
@@ -5410,7 +5384,7 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
if (err)
return err;
- set_min_partial(s, min);
+ s->min_partial = min;
return length;
}
SLAB_ATTR(min_partial);
diff --git a/mm/swap.c b/mm/swap.c
index 5b30045207e1..7e320ec08c6a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -97,7 +97,6 @@ static void __page_cache_release(struct page *page)
mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
}
- __ClearPageWaiters(page);
}
static void __put_single_page(struct page *page)
@@ -152,7 +151,6 @@ void put_pages_list(struct list_head *pages)
continue;
}
/* Cannot be PageLRU because it's passed to us using the lru */
- __ClearPageWaiters(page);
}
free_unref_page_list(pages);
@@ -626,7 +624,6 @@ void lru_add_drain_cpu(int cpu)
pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
activate_page_drain(cpu);
- mlock_page_drain(cpu);
}
/**
@@ -708,6 +705,7 @@ void lru_add_drain(void)
local_lock(&lru_pvecs.lock);
lru_add_drain_cpu(smp_processor_id());
local_unlock(&lru_pvecs.lock);
+ mlock_page_drain_local();
}
/*
@@ -722,6 +720,7 @@ static void lru_add_and_bh_lrus_drain(void)
lru_add_drain_cpu(smp_processor_id());
local_unlock(&lru_pvecs.lock);
invalidate_bh_lrus_cpu();
+ mlock_page_drain_local();
}
void lru_add_drain_cpu_zone(struct zone *zone)
@@ -730,6 +729,7 @@ void lru_add_drain_cpu_zone(struct zone *zone)
lru_add_drain_cpu(smp_processor_id());
drain_local_pages(zone);
local_unlock(&lru_pvecs.lock);
+ mlock_page_drain_local();
}
#ifdef CONFIG_SMP
@@ -971,8 +971,6 @@ void release_pages(struct page **pages, int nr)
count_vm_event(UNEVICTABLE_PGCLEARED);
}
- __ClearPageWaiters(page);
-
list_add(&page->lru, &pages_to_free);
}
if (lruvec)
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index 7f34343c075a..5a9442979a18 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -167,14 +167,12 @@ unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
int swap_cgroup_swapon(int type, unsigned long max_pages)
{
void *array;
- unsigned long array_size;
unsigned long length;
struct swap_cgroup_ctrl *ctrl;
length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
- array_size = length * sizeof(void *);
- array = vzalloc(array_size);
+ array = vcalloc(length, sizeof(void *));
if (!array)
goto nomem;
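
The conversion above delegates the element-count multiplication to vcalloc(), one of the array helpers added to mm/util.c later in this diff. Unlike the open-coded length * sizeof(void *), the multiplication is overflow-checked, so an oversized request fails cleanly instead of under-allocating. A sketch of the check it relies on (see __vmalloc_array() below):

	size_t bytes;

	if (unlikely(check_mul_overflow(length, sizeof(void *), &bytes)))
		return NULL;	/* request would wrap: fail instead of under-allocating */
	array = __vmalloc(bytes, GFP_KERNEL | __GFP_ZERO);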
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 33c7abb16610..63c61f8b2611 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1167,16 +1167,6 @@ out:
return NULL;
}
-static struct swap_info_struct *swap_info_get(swp_entry_t entry)
-{
- struct swap_info_struct *p;
-
- p = _swap_info_get(entry);
- if (p)
- spin_lock(&p->lock);
- return p;
-}
-
static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
struct swap_info_struct *q)
{
@@ -1601,100 +1591,6 @@ static bool page_swapped(struct page *page)
return false;
}
-static int page_trans_huge_map_swapcount(struct page *page,
- int *total_swapcount)
-{
- int i, map_swapcount, _total_swapcount;
- unsigned long offset = 0;
- struct swap_info_struct *si;
- struct swap_cluster_info *ci = NULL;
- unsigned char *map = NULL;
- int swapcount = 0;
-
- /* hugetlbfs shouldn't call it */
- VM_BUG_ON_PAGE(PageHuge(page), page);
-
- if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
- if (PageSwapCache(page))
- swapcount = page_swapcount(page);
- if (total_swapcount)
- *total_swapcount = swapcount;
- return swapcount + page_trans_huge_mapcount(page);
- }
-
- page = compound_head(page);
-
- _total_swapcount = map_swapcount = 0;
- if (PageSwapCache(page)) {
- swp_entry_t entry;
-
- entry.val = page_private(page);
- si = _swap_info_get(entry);
- if (si) {
- map = si->swap_map;
- offset = swp_offset(entry);
- }
- }
- if (map)
- ci = lock_cluster(si, offset);
- for (i = 0; i < HPAGE_PMD_NR; i++) {
- int mapcount = atomic_read(&page[i]._mapcount) + 1;
- if (map) {
- swapcount = swap_count(map[offset + i]);
- _total_swapcount += swapcount;
- }
- map_swapcount = max(map_swapcount, mapcount + swapcount);
- }
- unlock_cluster(ci);
-
- if (PageDoubleMap(page))
- map_swapcount -= 1;
-
- if (total_swapcount)
- *total_swapcount = _total_swapcount;
-
- return map_swapcount + compound_mapcount(page);
-}
-
-/*
- * We can write to an anon page without COW if there are no other references
- * to it. And as a side-effect, free up its swap: because the old content
- * on disk will never be read, and seeking back there to write new content
- * later would only waste time away from clustering.
- */
-bool reuse_swap_page(struct page *page)
-{
- int count, total_swapcount;
-
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- if (unlikely(PageKsm(page)))
- return false;
- count = page_trans_huge_map_swapcount(page, &total_swapcount);
- if (count == 1 && PageSwapCache(page) &&
- (likely(!PageTransCompound(page)) ||
- /* The remaining swap count will be freed soon */
- total_swapcount == page_swapcount(page))) {
- if (!PageWriteback(page)) {
- page = compound_head(page);
- delete_from_swap_cache(page);
- SetPageDirty(page);
- } else {
- swp_entry_t entry;
- struct swap_info_struct *p;
-
- entry.val = page_private(page);
- p = swap_info_get(entry);
- if (p->flags & SWP_STABLE_WRITES) {
- spin_unlock(&p->lock);
- return false;
- }
- spin_unlock(&p->lock);
- }
- }
-
- return count <= 1;
-}
-
/*
* If swap is getting full, or if there are no more mappings of this page,
* then try_to_free_swap is called to free its swap space.
diff --git a/mm/util.c b/mm/util.c
index 1e2728736398..54e5e761a9a9 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -649,6 +649,56 @@ void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
}
EXPORT_SYMBOL(kvrealloc);
+/**
+ * __vmalloc_array - allocate memory for a virtually contiguous array.
+ * @n: number of elements.
+ * @size: element size.
+ * @flags: the type of memory to allocate (see kmalloc).
+ */
+void *__vmalloc_array(size_t n, size_t size, gfp_t flags)
+{
+ size_t bytes;
+
+ if (unlikely(check_mul_overflow(n, size, &bytes)))
+ return NULL;
+ return __vmalloc(bytes, flags);
+}
+EXPORT_SYMBOL(__vmalloc_array);
+
+/**
+ * vmalloc_array - allocate memory for a virtually contiguous array.
+ * @n: number of elements.
+ * @size: element size.
+ */
+void *vmalloc_array(size_t n, size_t size)
+{
+ return __vmalloc_array(n, size, GFP_KERNEL);
+}
+EXPORT_SYMBOL(vmalloc_array);
+
+/**
+ * __vcalloc - allocate and zero memory for a virtually contiguous array.
+ * @n: number of elements.
+ * @size: element size.
+ * @flags: the type of memory to allocate (see kmalloc).
+ */
+void *__vcalloc(size_t n, size_t size, gfp_t flags)
+{
+ return __vmalloc_array(n, size, flags | __GFP_ZERO);
+}
+EXPORT_SYMBOL(__vcalloc);
+
+/**
+ * vcalloc - allocate and zero memory for a virtually contiguous array.
+ * @n: number of elements.
+ * @size: element size.
+ */
+void *vcalloc(size_t n, size_t size)
+{
+ return __vmalloc_array(n, size, GFP_KERNEL | __GFP_ZERO);
+}
+EXPORT_SYMBOL(vcalloc);
+
/* Neutral page->mapping pointer to address_space or anon_vma or other */
void *page_rmapping(struct page *page)
{
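
A hypothetical caller of the new helpers (the struct and element count below are illustrative only): the allocation is zeroed, overflow-checked, virtually contiguous, and released with vfree().

	#include <linux/types.h>
	#include <linux/vmalloc.h>

	struct foo_entry {
		u64 key;
		void *payload;
	};

	static struct foo_entry *alloc_foo_table(size_t nr_entries)
	{
		/* Returns NULL on allocation failure or nr_entries * size overflow. */
		return vcalloc(nr_entries, sizeof(struct foo_entry));
	}

	static void free_foo_table(struct foo_entry *table)
	{
		vfree(table);
	}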
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 99e0f3e8d1a5..e163372d3967 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -74,7 +74,7 @@ static const bool vmap_allow_huge = false;
bool is_vmalloc_addr(const void *x)
{
- unsigned long addr = (unsigned long)x;
+ unsigned long addr = (unsigned long)kasan_reset_tag(x);
return addr >= VMALLOC_START && addr < VMALLOC_END;
}
@@ -631,7 +631,7 @@ int is_vmalloc_or_module_addr(const void *x)
* just put it in the vmalloc space.
*/
#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
- unsigned long addr = (unsigned long)x;
+ unsigned long addr = (unsigned long)kasan_reset_tag(x);
if (addr >= MODULES_VADDR && addr < MODULES_END)
return 1;
#endif
@@ -795,6 +795,8 @@ static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
struct vmap_area *va = NULL;
struct rb_node *n = vmap_area_root.rb_node;
+ addr = (unsigned long)kasan_reset_tag((void *)addr);
+
while (n) {
struct vmap_area *tmp;
@@ -816,6 +818,8 @@ static struct vmap_area *__find_vmap_area(unsigned long addr)
{
struct rb_node *n = vmap_area_root.rb_node;
+ addr = (unsigned long)kasan_reset_tag((void *)addr);
+
while (n) {
struct vmap_area *va;
@@ -2166,7 +2170,7 @@ EXPORT_SYMBOL_GPL(vm_unmap_aliases);
void vm_unmap_ram(const void *mem, unsigned int count)
{
unsigned long size = (unsigned long)count << PAGE_SHIFT;
- unsigned long addr = (unsigned long)mem;
+ unsigned long addr = (unsigned long)kasan_reset_tag(mem);
struct vmap_area *va;
might_sleep();
@@ -2227,14 +2231,19 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
mem = (void *)addr;
}
- kasan_unpoison_vmalloc(mem, size);
-
if (vmap_pages_range(addr, addr + size, PAGE_KERNEL,
pages, PAGE_SHIFT) < 0) {
vm_unmap_ram(mem, count);
return NULL;
}
+ /*
+ * Mark the pages as accessible, now that they are mapped.
+ * With hardware tag-based KASAN, marking is skipped for
+ * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
+ */
+ mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL);
+
return mem;
}
EXPORT_SYMBOL(vm_map_ram);
@@ -2460,10 +2469,20 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
return NULL;
}
- kasan_unpoison_vmalloc((void *)va->va_start, requested_size);
-
setup_vmalloc_vm(area, va, flags, caller);
+ /*
+ * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
+ * best-effort approach, as they can be mapped outside of vmalloc code.
+ * For VM_ALLOC mappings, the pages are marked as accessible after
+ * getting mapped in __vmalloc_node_range().
+ * With hardware tag-based KASAN, marking is skipped for
+ * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
+ */
+ if (!(flags & VM_ALLOC))
+ area->addr = kasan_unpoison_vmalloc(area->addr, requested_size,
+ KASAN_VMALLOC_PROT_NORMAL);
+
return area;
}
@@ -2547,7 +2566,7 @@ struct vm_struct *remove_vm_area(const void *addr)
va->vm = NULL;
spin_unlock(&vmap_area_lock);
- kasan_free_shadow(vm);
+ kasan_free_module_shadow(vm);
free_unmap_vmap_area(va);
return vm;
@@ -3071,7 +3090,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
const void *caller)
{
struct vm_struct *area;
- void *addr;
+ void *ret;
+ kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
unsigned long real_size = size;
unsigned long real_align = align;
unsigned int shift = PAGE_SHIFT;
@@ -3124,11 +3144,51 @@ again:
goto fail;
}
- addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
- if (!addr)
+ /*
+ * Prepare arguments for __vmalloc_area_node() and
+ * kasan_unpoison_vmalloc().
+ */
+ if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) {
+ if (kasan_hw_tags_enabled()) {
+ /*
+ * Modify protection bits to allow tagging.
+ * This must be done before mapping.
+ */
+ prot = arch_vmap_pgprot_tagged(prot);
+
+ /*
+ * Skip page_alloc poisoning and zeroing for physical
+ * pages backing VM_ALLOC mapping. Memory is instead
+ * poisoned and zeroed by kasan_unpoison_vmalloc().
+ */
+ gfp_mask |= __GFP_SKIP_KASAN_UNPOISON | __GFP_SKIP_ZERO;
+ }
+
+ /* Take note that the mapping is PAGE_KERNEL. */
+ kasan_flags |= KASAN_VMALLOC_PROT_NORMAL;
+ }
+
+ /* Allocate physical pages and map them into vmalloc space. */
+ ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
+ if (!ret)
goto fail;
/*
+ * Mark the pages as accessible, now that they are mapped.
+ * The init condition should match the one in post_alloc_hook()
+ * (except for the should_skip_init() check) to make sure that memory
+ * is initialized under the same conditions regardless of the enabled
+ * KASAN mode.
+ * Tag-based KASAN modes only assign tags to normal non-executable
+ * allocations, see __kasan_unpoison_vmalloc().
+ */
+ kasan_flags |= KASAN_VMALLOC_VM_ALLOC;
+ if (!want_init_on_free() && want_init_on_alloc(gfp_mask))
+ kasan_flags |= KASAN_VMALLOC_INIT;
+ /* KASAN_VMALLOC_PROT_NORMAL already set if required. */
+ area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags);
+
+ /*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
* flag. It means that vm_struct is not fully initialized.
* Now, it is fully initialized, so remove this flag here.
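
Condensed, the VM_ALLOC path above assembles the KASAN flags as follows (a paraphrase of the hunk using the same identifiers; not a drop-in function):

	kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;

	if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) {
		if (kasan_hw_tags_enabled()) {
			prot = arch_vmap_pgprot_tagged(prot);	/* allow memory tagging */
			gfp_mask |= __GFP_SKIP_KASAN_UNPOISON | __GFP_SKIP_ZERO;
		}
		kasan_flags |= KASAN_VMALLOC_PROT_NORMAL;	/* normal, non-executable */
	}

	/* ... __vmalloc_area_node() allocates and maps the pages ... */

	kasan_flags |= KASAN_VMALLOC_VM_ALLOC;
	if (!want_init_on_free() && want_init_on_alloc(gfp_mask))
		kasan_flags |= KASAN_VMALLOC_INIT;
	area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags);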
@@ -3139,7 +3199,7 @@ again:
if (!(vm_flags & VM_DEFER_KMEMLEAK))
kmemleak_vmalloc(area, size, gfp_mask);
- return addr;
+ return area->addr;
fail:
if (shift > PAGE_SHIFT) {
@@ -3424,6 +3484,8 @@ long vread(char *buf, char *addr, unsigned long count)
unsigned long buflen = count;
unsigned long n;
+ addr = kasan_reset_tag(addr);
+
/* Don't allow overflow */
if ((unsigned long) addr + count < count)
count = -(unsigned long) addr;
@@ -3809,9 +3871,6 @@ retry:
for (area = 0; area < nr_vms; area++) {
if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
goto err_free_shadow;
-
- kasan_unpoison_vmalloc((void *)vas[area]->va_start,
- sizes[area]);
}
/* insert all vm's */
@@ -3824,6 +3883,16 @@ retry:
}
spin_unlock(&vmap_area_lock);
+ /*
+ * Mark allocated areas as accessible. Do it now as a best-effort
+ * approach, as they can be mapped outside of vmalloc code.
+ * With hardware tag-based KASAN, marking is skipped for
+ * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
+ */
+ for (area = 0; area < nr_vms; area++)
+ vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr,
+ vms[area]->size, KASAN_VMALLOC_PROT_NORMAL);
+
kfree(vas);
return vms;