summary | refs | log | tree | commit | diff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r-- mm/Kconfig | 4
-rw-r--r-- mm/damon/core.c | 7
-rw-r--r-- mm/damon/paddr.c | 3
-rw-r--r-- mm/execmem.c | 39
-rw-r--r-- mm/filemap.c | 40
-rw-r--r-- mm/gup.c | 6
-rw-r--r-- mm/huge_memory.c | 2
-rw-r--r-- mm/hugetlb.c | 8
-rw-r--r-- mm/memcontrol-v1.c | 6
-rw-r--r-- mm/memcontrol.c | 13
-rw-r--r-- mm/memory.c | 2
-rw-r--r-- mm/mempolicy.c | 31
-rw-r--r-- mm/migrate.c | 10
-rw-r--r-- mm/page_alloc.c | 14
-rw-r--r-- mm/percpu.c | 4
-rw-r--r-- mm/shmem.c | 8
-rw-r--r-- mm/slab.h | 34
-rw-r--r-- mm/slab_common.c | 48
-rw-r--r-- mm/slub.c | 336
-rw-r--r-- mm/swap_cgroup.c | 7
-rw-r--r-- mm/usercopy.c | 18
-rw-r--r-- mm/util.c | 162
-rw-r--r-- mm/vma.c | 3
-rw-r--r-- mm/vmstat.c | 2
24 files changed, 488 insertions, 319 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 1b501db06417..0b7f4bb5cb80 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -242,6 +242,10 @@ menu "Slab allocator options"
config SLUB
def_bool y
+config KVFREE_RCU_BATCHED
+ def_bool y
+ depends on !SLUB_TINY && !TINY_RCU
+
config SLUB_TINY
bool "Configure for minimal memory footprint"
depends on EXPERT
diff --git a/mm/damon/core.c b/mm/damon/core.c
index c7b981308862..384935ef4e65 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -373,6 +373,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
* or damon_attrs are updated.
*/
scheme->next_apply_sis = 0;
+ scheme->walk_completed = false;
INIT_LIST_HEAD(&scheme->filters);
scheme->stat = (struct damos_stat){};
INIT_LIST_HEAD(&scheme->list);
@@ -1429,9 +1430,13 @@ static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t,
{
struct damos_filter *filter;
+ s->core_filters_allowed = false;
damos_for_each_filter(filter, s) {
- if (damos_filter_match(ctx, t, r, filter))
+ if (damos_filter_match(ctx, t, r, filter)) {
+ if (filter->allow)
+ s->core_filters_allowed = true;
return !filter->allow;
+ }
}
return false;
}
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 0f9ae14f884d..c834aa217835 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -236,6 +236,9 @@ static bool damos_pa_filter_out(struct damos *scheme, struct folio *folio)
{
struct damos_filter *filter;
+ if (scheme->core_filters_allowed)
+ return false;
+
damos_for_each_filter(filter, scheme) {
if (damos_pa_filter_match(filter, folio))
return !filter->allow;
diff --git a/mm/execmem.c b/mm/execmem.c
index 317b6a8d35be..e6c4f5076ca8 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -257,7 +257,6 @@ out_unlock:
static int execmem_cache_populate(struct execmem_range *range, size_t size)
{
unsigned long vm_flags = VM_ALLOW_HUGE_VMAP;
- unsigned long start, end;
struct vm_struct *vm;
size_t alloc_size;
int err = -ENOMEM;
@@ -275,26 +274,18 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size)
/* fill memory with instructions that will trap */
execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true);
- start = (unsigned long)p;
- end = start + alloc_size;
-
- vunmap_range(start, end);
-
- err = execmem_set_direct_map_valid(vm, false);
- if (err)
- goto err_free_mem;
-
- err = vmap_pages_range_noflush(start, end, range->pgprot, vm->pages,
- PMD_SHIFT);
+ err = set_memory_rox((unsigned long)p, vm->nr_pages);
if (err)
goto err_free_mem;
err = execmem_cache_add(p, alloc_size);
if (err)
- goto err_free_mem;
+ goto err_reset_direct_map;
return 0;
+err_reset_direct_map:
+ execmem_set_direct_map_valid(vm, true);
err_free_mem:
vfree(p);
return err;
@@ -344,6 +335,28 @@ static bool execmem_cache_free(void *ptr)
return true;
}
+
+int execmem_make_temp_rw(void *ptr, size_t size)
+{
+ unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ unsigned long addr = (unsigned long)ptr;
+ int ret;
+
+ ret = set_memory_nx(addr, nr);
+ if (ret)
+ return ret;
+
+ return set_memory_rw(addr, nr);
+}
+
+int execmem_restore_rox(void *ptr, size_t size)
+{
+ unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ unsigned long addr = (unsigned long)ptr;
+
+ return set_memory_rox(addr, nr);
+}
+
#else /* CONFIG_ARCH_HAS_EXECMEM_ROX */
static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
{
diff --git a/mm/filemap.c b/mm/filemap.c
index 6d616bb9001e..e9404290f2c6 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1985,8 +1985,19 @@ no_page:
if (err == -EEXIST)
goto repeat;
- if (err)
+ if (err) {
+ /*
+ * When NOWAIT I/O fails to allocate folios this could
+ * be due to a nonblocking memory allocation and not
+ * because the system actually is out of memory.
+ * Return -EAGAIN so that the caller retries in a
+ * blocking fashion instead of propagating -ENOMEM
+ * to the application.
+ */
+ if ((fgp_flags & FGP_NOWAIT) && err == -ENOMEM)
+ err = -EAGAIN;
return ERR_PTR(err);
+ }
/*
* filemap_add_folio locks the page, and for mmap
* we expect an unlocked page.
@@ -4083,17 +4094,6 @@ retry:
bytes = min(chunk - offset, bytes);
balance_dirty_pages_ratelimited(mapping);
- /*
- * Bring in the user page that we will copy from _first_.
- * Otherwise there's a nasty deadlock on copying from the
- * same page as we're writing to, without it being marked
- * up-to-date.
- */
- if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
- status = -EFAULT;
- break;
- }
-
if (fatal_signal_pending(current)) {
status = -EINTR;
break;
@@ -4111,6 +4111,12 @@ retry:
if (mapping_writably_mapped(mapping))
flush_dcache_folio(folio);
+ /*
+ * Faults here on mmap()s can recurse into arbitrary
+ * filesystem code. Lots of locks are held that can
+ * deadlock. Use an atomic copy to avoid deadlocking
+ * in page fault handling.
+ */
copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
flush_dcache_folio(folio);
@@ -4136,6 +4142,16 @@ retry:
bytes = copied;
goto retry;
}
+
+ /*
+ * 'folio' is now unlocked and faults on it can be
+ * handled. Ensure forward progress by trying to
+ * fault it in now.
+ */
+ if (fault_in_iov_iter_readable(i, bytes) == bytes) {
+ status = -EFAULT;
+ break;
+ }
} else {
pos += status;
written += status;
diff --git a/mm/gup.c b/mm/gup.c
index 3883b307780e..855ab860f88b 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2254,6 +2254,7 @@ EXPORT_SYMBOL(fault_in_readable);
/**
* get_dump_page() - pin user page in memory while writing it to core dump
* @addr: user address
+ * @locked: a pointer to an int denoting whether the mmap sem is held
*
* Returns struct page pointer of user page pinned for dump,
* to be freed afterwards by put_page().
@@ -2266,13 +2267,12 @@ EXPORT_SYMBOL(fault_in_readable);
* Called without mmap_lock (takes and releases the mmap_lock by itself).
*/
#ifdef CONFIG_ELF_CORE
-struct page *get_dump_page(unsigned long addr)
+struct page *get_dump_page(unsigned long addr, int *locked)
{
struct page *page;
- int locked = 0;
int ret;
- ret = __get_user_pages_locked(current->mm, addr, 1, &page, &locked,
+ ret = __get_user_pages_locked(current->mm, addr, 1, &page, locked,
FOLL_FORCE | FOLL_DUMP | FOLL_GET);
return (ret == 1) ? page : NULL;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3d3ebdc002d5..373781b21e5c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3304,7 +3304,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
folio_account_cleaned(tail,
inode_to_wb(folio->mapping->host));
__filemap_remove_folio(tail, NULL);
- folio_put(tail);
+ folio_put_refs(tail, folio_nr_pages(tail));
} else if (!folio_test_anon(folio)) {
__xa_store(&folio->mapping->i_pages, tail->index,
tail, 0);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 97930d44d460..318624c96584 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2135,6 +2135,8 @@ retry:
if (!folio_ref_count(folio)) {
struct hstate *h = folio_hstate(folio);
+ bool adjust_surplus = false;
+
if (!available_huge_pages(h))
goto out;
@@ -2157,7 +2159,9 @@ retry:
goto retry;
}
- remove_hugetlb_folio(h, folio, false);
+ if (h->surplus_huge_pages_node[folio_nid(folio)])
+ adjust_surplus = true;
+ remove_hugetlb_folio(h, folio, adjust_surplus);
h->max_huge_pages--;
spin_unlock_irq(&hugetlb_lock);
@@ -2177,7 +2181,7 @@ retry:
rc = hugetlb_vmemmap_restore_folio(h, folio);
if (rc) {
spin_lock_irq(&hugetlb_lock);
- add_hugetlb_folio(h, folio, false);
+ add_hugetlb_folio(h, folio, adjust_surplus);
h->max_huge_pages++;
goto out;
}
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 2be6b9112808..2e9fa431bbf5 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -1855,9 +1855,11 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
if (val > MAX_SWAPPINESS)
return -EINVAL;
- if (!mem_cgroup_is_root(memcg))
+ if (!mem_cgroup_is_root(memcg)) {
+ pr_info_once("Per memcg swappiness does not exist in cgroup v2. "
+ "See memory.reclaim or memory.swap.max there\n ");
WRITE_ONCE(memcg->swappiness, val);
- else
+ } else
WRITE_ONCE(vm_swappiness, val);
return 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4de6acb9b8ec..a037ec92881d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1921,9 +1921,18 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
struct memcg_stock_pcp *stock;
+ struct obj_cgroup *old;
+ unsigned long flags;
stock = &per_cpu(memcg_stock, cpu);
+
+ /* drain_obj_stock requires stock_lock */
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ old = drain_obj_stock(stock);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+
drain_stock(stock);
+ obj_cgroup_put(old);
return 0;
}
@@ -4993,7 +5002,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
- swap_cgroup_record(folio, entry);
+ swap_cgroup_record(folio, mem_cgroup_id(swap_memcg), entry);
folio_unqueue_deferred_split(folio);
folio->memcg_data = 0;
@@ -5055,7 +5064,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
mem_cgroup_id_get_many(memcg, nr_pages - 1);
mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
- swap_cgroup_record(folio, entry);
+ swap_cgroup_record(folio, mem_cgroup_id(memcg), entry);
return 0;
}
diff --git a/mm/memory.c b/mm/memory.c
index fb7b8dc75167..4f6d9766a046 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6834,10 +6834,8 @@ void __might_fault(const char *file, int line)
if (pagefault_disabled())
return;
__might_sleep(file, line);
-#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
if (current->mm)
might_lock_read(&current->mm->mmap_lock);
-#endif
}
EXPORT_SYMBOL(__might_fault);
#endif
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index bbaadbeeb291..a9eea051b0d6 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -196,6 +196,37 @@ int numa_nearest_node(int node, unsigned int state)
}
EXPORT_SYMBOL_GPL(numa_nearest_node);
+/**
+ * nearest_node_nodemask - Find the node in @mask at the nearest distance
+ * from @node.
+ *
+ * @node: a valid node ID to start the search from.
+ * @mask: a pointer to a nodemask representing the allowed nodes.
+ *
+ * This function iterates over all nodes in @mask and calculates the
+ * distance from the starting @node, then it returns the node ID that is
+ * the closest to @node, or MAX_NUMNODES if no node is found.
+ *
+ * Note that @node must be a valid node ID usable with node_distance(),
+ * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
+ * or unexpected behavior.
+ */
+int nearest_node_nodemask(int node, nodemask_t *mask)
+{
+ int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES;
+
+ for_each_node_mask(n, *mask) {
+ dist = node_distance(node, n);
+ if (dist < min_dist) {
+ min_dist = dist;
+ min_node = n;
+ }
+ }
+
+ return min_node;
+}
+EXPORT_SYMBOL_GPL(nearest_node_nodemask);
+
struct mempolicy *get_task_policy(struct task_struct *p)
{
struct mempolicy *pol = p->mempolicy;
diff --git a/mm/migrate.c b/mm/migrate.c
index fb19a18892c8..97f0edf0c032 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -518,15 +518,13 @@ static int __folio_migrate_mapping(struct address_space *mapping,
if (folio_test_anon(folio) && folio_test_large(folio))
mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
folio_ref_add(newfolio, nr); /* add cache reference */
- if (folio_test_swapbacked(folio)) {
+ if (folio_test_swapbacked(folio))
__folio_set_swapbacked(newfolio);
- if (folio_test_swapcache(folio)) {
- folio_set_swapcache(newfolio);
- newfolio->private = folio_get_private(folio);
- }
+ if (folio_test_swapcache(folio)) {
+ folio_set_swapcache(newfolio);
+ newfolio->private = folio_get_private(folio);
entries = nr;
} else {
- VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
entries = 1;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 94917c729120..542d25f77be8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7004,7 +7004,7 @@ static inline bool has_unaccepted_memory(void)
static bool cond_accept_memory(struct zone *zone, unsigned int order)
{
- long to_accept;
+ long to_accept, wmark;
bool ret = false;
if (!has_unaccepted_memory())
@@ -7013,8 +7013,18 @@ static bool cond_accept_memory(struct zone *zone, unsigned int order)
if (list_empty(&zone->unaccepted_pages))
return false;
+ wmark = promo_wmark_pages(zone);
+
+ /*
+ * Watermarks have not been initialized yet.
+ *
+ * Accepting one MAX_ORDER page to ensure progress.
+ */
+ if (!wmark)
+ return try_to_accept_memory_one(zone);
+
/* How much to accept to get to promo watermark? */
- to_accept = promo_wmark_pages(zone) -
+ to_accept = wmark -
(zone_page_state(zone, NR_FREE_PAGES) -
__zone_watermark_unusable_free(zone, order, 0) -
zone_page_state(zone, NR_UNACCEPTED));
diff --git a/mm/percpu.c b/mm/percpu.c
index ac61e3fc5f15..7b5835356d1e 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -3071,7 +3071,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
continue;
}
/* copy and return the unused part */
- memcpy(ptr, __per_cpu_load, ai->static_size);
+ memcpy(ptr, __per_cpu_start, ai->static_size);
pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum);
}
}
@@ -3240,7 +3240,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t
flush_cache_vmap_early(unit_addr, unit_addr + ai->unit_size);
/* copy static data */
- memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
+ memcpy((void *)unit_addr, __per_cpu_start, ai->static_size);
}
/* we're ready, commit */
diff --git a/mm/shmem.c b/mm/shmem.c
index 1ede0800e846..ab61c8bb20e1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3912,16 +3912,16 @@ out_iput:
return error;
}
-static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int error;
error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0);
if (error)
- return error;
+ return ERR_PTR(error);
inc_nlink(dir);
- return 0;
+ return NULL;
}
static int shmem_create(struct mnt_idmap *idmap, struct inode *dir,
diff --git a/mm/slab.h b/mm/slab.h
index e9fd9bf0bfa6..05a21dc796e0 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -457,39 +457,17 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s)
return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT));
}
-/* Legal flag mask for kmem_cache_create(), for various configurations */
#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
SLAB_CACHE_DMA32 | SLAB_PANIC | \
- SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS )
+ SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \
+ SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
+ SLAB_TEMPORARY | SLAB_ACCOUNT | \
+ SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE)
-#ifdef CONFIG_SLUB_DEBUG
#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
SLAB_TRACE | SLAB_CONSISTENCY_CHECKS)
-#else
-#define SLAB_DEBUG_FLAGS (0)
-#endif
-#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
- SLAB_TEMPORARY | SLAB_ACCOUNT | \
- SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE)
-
-/* Common flags available with current configuration */
-#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
-
-/* Common flags permitted for kmem_cache_create */
-#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \
- SLAB_RED_ZONE | \
- SLAB_POISON | \
- SLAB_STORE_USER | \
- SLAB_TRACE | \
- SLAB_CONSISTENCY_CHECKS | \
- SLAB_NOLEAKTRACE | \
- SLAB_RECLAIM_ACCOUNT | \
- SLAB_TEMPORARY | \
- SLAB_ACCOUNT | \
- SLAB_KMALLOC | \
- SLAB_NO_MERGE | \
- SLAB_NO_USER_FLAGS)
+#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS)
bool __kmem_cache_empty(struct kmem_cache *);
int __kmem_cache_shutdown(struct kmem_cache *);
@@ -604,6 +582,8 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
void **p, int objects, struct slabobj_ext *obj_exts);
#endif
+void kvfree_rcu_cb(struct rcu_head *head);
+
size_t __ksize(const void *objp);
static inline size_t slab_ksize(const struct kmem_cache *s)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 4c9f0a87f733..5be257e03c7c 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -298,6 +298,8 @@ struct kmem_cache *__kmem_cache_create_args(const char *name,
static_branch_enable(&slub_debug_enabled);
if (flags & SLAB_STORE_USER)
stack_depot_init();
+#else
+ flags &= ~SLAB_DEBUG_FLAGS;
#endif
mutex_lock(&slab_mutex);
@@ -307,20 +309,11 @@ struct kmem_cache *__kmem_cache_create_args(const char *name,
goto out_unlock;
}
- /* Refuse requests with allocator specific flags */
if (flags & ~SLAB_FLAGS_PERMITTED) {
err = -EINVAL;
goto out_unlock;
}
- /*
- * Some allocators will constraint the set of valid flags to a subset
- * of all flags. We expect them to define CACHE_CREATE_MASK in this
- * case, and we'll just provide them with a sanitized version of the
- * passed flags.
- */
- flags &= CACHE_CREATE_MASK;
-
/* Fail closed on bad usersize of useroffset values. */
if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) ||
WARN_ON(!args->usersize && args->useroffset) ||
@@ -1284,6 +1277,29 @@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
EXPORT_TRACEPOINT_SYMBOL(kfree);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
+#ifndef CONFIG_KVFREE_RCU_BATCHED
+
+void kvfree_call_rcu(struct rcu_head *head, void *ptr)
+{
+ if (head) {
+ kasan_record_aux_stack(ptr);
+ call_rcu(head, kvfree_rcu_cb);
+ return;
+ }
+
+ // kvfree_rcu(one_arg) call.
+ might_sleep();
+ synchronize_rcu();
+ kvfree(ptr);
+}
+EXPORT_SYMBOL_GPL(kvfree_call_rcu);
+
+void __init kvfree_rcu_init(void)
+{
+}
+
+#else /* CONFIG_KVFREE_RCU_BATCHED */
+
/*
* This rcu parameter is runtime-read-only. It reflects
* a minimum allowed number of objects which can be cached
@@ -1534,8 +1550,7 @@ kvfree_rcu_list(struct rcu_head *head)
rcu_lock_acquire(&rcu_callback_map);
trace_rcu_invoke_kvfree_callback("slab", head, offset);
- if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
- kvfree(ptr);
+ kvfree(ptr);
rcu_lock_release(&rcu_callback_map);
cond_resched_tasks_rcu_qs();
@@ -1863,8 +1878,6 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
return true;
}
-#if !defined(CONFIG_TINY_RCU)
-
static enum hrtimer_restart
schedule_page_work_fn(struct hrtimer *t)
{
@@ -1889,8 +1902,8 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp)
&krcp->page_cache_work,
msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
} else {
- hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- krcp->hrtimer.function = schedule_page_work_fn;
+ hrtimer_setup(&krcp->hrtimer, schedule_page_work_fn, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
}
}
@@ -2073,8 +2086,6 @@ void kvfree_rcu_barrier(void)
}
EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
-#endif /* #if !defined(CONFIG_TINY_RCU) */
-
static unsigned long
kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
@@ -2168,3 +2179,6 @@ void __init kvfree_rcu_init(void)
shrinker_register(kfree_rcu_shrinker);
}
+
+#endif /* CONFIG_KVFREE_RCU_BATCHED */
+
diff --git a/mm/slub.c b/mm/slub.c
index 1f50129dcfb3..5eac408e818e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -19,6 +19,7 @@
#include <linux/bitops.h>
#include <linux/slab.h>
#include "slab.h"
+#include <linux/vmalloc.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kasan.h>
@@ -1017,22 +1018,31 @@ void skip_orig_size_check(struct kmem_cache *s, const void *object)
set_orig_size(s, (void *)object, s->object_size);
}
-static void slab_bug(struct kmem_cache *s, char *fmt, ...)
+static void __slab_bug(struct kmem_cache *s, const char *fmt, va_list argsp)
{
struct va_format vaf;
va_list args;
- va_start(args, fmt);
+ va_copy(args, argsp);
vaf.fmt = fmt;
vaf.va = &args;
pr_err("=============================================================================\n");
- pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
+ pr_err("BUG %s (%s): %pV\n", s ? s->name : "<unknown>", print_tainted(), &vaf);
pr_err("-----------------------------------------------------------------------------\n\n");
va_end(args);
}
+static void slab_bug(struct kmem_cache *s, const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ __slab_bug(s, fmt, args);
+ va_end(args);
+}
+
__printf(2, 3)
-static void slab_fix(struct kmem_cache *s, char *fmt, ...)
+static void slab_fix(struct kmem_cache *s, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -1085,19 +1095,19 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
/* Beginning of the filler is the free pointer */
print_section(KERN_ERR, "Padding ", p + off,
size_from_object(s) - off);
-
- dump_stack();
}
static void object_err(struct kmem_cache *s, struct slab *slab,
- u8 *object, char *reason)
+ u8 *object, const char *reason)
{
if (slab_add_kunit_errors())
return;
- slab_bug(s, "%s", reason);
+ slab_bug(s, reason);
print_trailer(s, slab, object);
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+
+ WARN_ON(1);
}
static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
@@ -1114,22 +1124,30 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
return false;
}
+static void __slab_err(struct slab *slab)
+{
+ if (slab_in_kunit_test())
+ return;
+
+ print_slab_info(slab);
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+
+ WARN_ON(1);
+}
+
static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab,
const char *fmt, ...)
{
va_list args;
- char buf[100];
if (slab_add_kunit_errors())
return;
va_start(args, fmt);
- vsnprintf(buf, sizeof(buf), fmt, args);
+ __slab_bug(s, fmt, args);
va_end(args);
- slab_bug(s, "%s", buf);
- print_slab_info(slab);
- dump_stack();
- add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+
+ __slab_err(slab);
}
static void init_object(struct kmem_cache *s, void *object, u8 val)
@@ -1166,7 +1184,7 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
s->inuse - poison_size);
}
-static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
+static void restore_bytes(struct kmem_cache *s, const char *message, u8 data,
void *from, void *to)
{
slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data);
@@ -1181,8 +1199,8 @@ static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
static pad_check_attributes int
check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
- u8 *object, char *what,
- u8 *start, unsigned int value, unsigned int bytes)
+ u8 *object, const char *what, u8 *start, unsigned int value,
+ unsigned int bytes, bool slab_obj_print)
{
u8 *fault;
u8 *end;
@@ -1201,10 +1219,11 @@ check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
if (slab_add_kunit_errors())
goto skip_bug_print;
- slab_bug(s, "%s overwritten", what);
- pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
- fault, end - 1, fault - addr,
- fault[0], value);
+ pr_err("[%s overwritten] 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
+ what, fault, end - 1, fault - addr, fault[0], value);
+
+ if (slab_obj_print)
+ object_err(s, slab, object, "Object corrupt");
skip_bug_print:
restore_bytes(s, what, value, fault, end);
@@ -1268,7 +1287,7 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
return 1;
return check_bytes_and_report(s, slab, p, "Object padding",
- p + off, POISON_INUSE, size_from_object(s) - off);
+ p + off, POISON_INUSE, size_from_object(s) - off, true);
}
/* Check the pad bytes at the end of a slab page */
@@ -1301,9 +1320,10 @@ slab_pad_check(struct kmem_cache *s, struct slab *slab)
while (end > fault && end[-1] == POISON_INUSE)
end--;
- slab_err(s, slab, "Padding overwritten. 0x%p-0x%p @offset=%tu",
- fault, end - 1, fault - start);
+ slab_bug(s, "Padding overwritten. 0x%p-0x%p @offset=%tu",
+ fault, end - 1, fault - start);
print_section(KERN_ERR, "Padding ", pad, remainder);
+ __slab_err(slab);
restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
}
@@ -1318,11 +1338,11 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
if (s->flags & SLAB_RED_ZONE) {
if (!check_bytes_and_report(s, slab, object, "Left Redzone",
- object - s->red_left_pad, val, s->red_left_pad))
+ object - s->red_left_pad, val, s->red_left_pad, ret))
ret = 0;
if (!check_bytes_and_report(s, slab, object, "Right Redzone",
- endobject, val, s->inuse - s->object_size))
+ endobject, val, s->inuse - s->object_size, ret))
ret = 0;
if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) {
@@ -1331,7 +1351,7 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
if (s->object_size > orig_size &&
!check_bytes_and_report(s, slab, object,
"kmalloc Redzone", p + orig_size,
- val, s->object_size - orig_size)) {
+ val, s->object_size - orig_size, ret)) {
ret = 0;
}
}
@@ -1339,7 +1359,7 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
if (!check_bytes_and_report(s, slab, p, "Alignment padding",
endobject, POISON_INUSE,
- s->inuse - s->object_size))
+ s->inuse - s->object_size, ret))
ret = 0;
}
}
@@ -1355,11 +1375,11 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
if (kasan_meta_size < s->object_size - 1 &&
!check_bytes_and_report(s, slab, p, "Poison",
p + kasan_meta_size, POISON_FREE,
- s->object_size - kasan_meta_size - 1))
+ s->object_size - kasan_meta_size - 1, ret))
ret = 0;
if (kasan_meta_size < s->object_size &&
!check_bytes_and_report(s, slab, p, "End Poison",
- p + s->object_size - 1, POISON_END, 1))
+ p + s->object_size - 1, POISON_END, 1, ret))
ret = 0;
}
/*
@@ -1385,11 +1405,6 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
ret = 0;
}
- if (!ret && !slab_in_kunit_test()) {
- print_trailer(s, slab, object);
- add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
- }
-
return ret;
}
@@ -1427,7 +1442,7 @@ static int check_slab(struct kmem_cache *s, struct slab *slab)
* Determine if a certain object in a slab is on the freelist. Must hold the
* slab lock to guarantee that the chains are in a consistent state.
*/
-static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
+static bool on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
{
int nr = 0;
void *fp;
@@ -1437,26 +1452,34 @@ static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
fp = slab->freelist;
while (fp && nr <= slab->objects) {
if (fp == search)
- return 1;
+ return true;
if (!check_valid_pointer(s, slab, fp)) {
if (object) {
object_err(s, slab, object,
"Freechain corrupt");
set_freepointer(s, object, NULL);
+ break;
} else {
slab_err(s, slab, "Freepointer corrupt");
slab->freelist = NULL;
slab->inuse = slab->objects;
slab_fix(s, "Freelist cleared");
- return 0;
+ return false;
}
- break;
}
object = fp;
fp = get_freepointer(s, object);
nr++;
}
+ if (nr > slab->objects) {
+ slab_err(s, slab, "Freelist cycle detected");
+ slab->freelist = NULL;
+ slab->inuse = slab->objects;
+ slab_fix(s, "Freelist cleared");
+ return false;
+ }
+
max_objects = order_objects(slab_order(slab), s->size);
if (max_objects > MAX_OBJS_PER_PAGE)
max_objects = MAX_OBJS_PER_PAGE;
@@ -1624,12 +1647,12 @@ static inline int free_consistency_checks(struct kmem_cache *s,
slab_err(s, slab, "Attempt to free object(0x%p) outside of slab",
object);
} else if (!slab->slab_cache) {
- pr_err("SLUB <none>: no slab for object 0x%p.\n",
- object);
- dump_stack();
- } else
+ slab_err(NULL, slab, "No slab cache for object 0x%p",
+ object);
+ } else {
object_err(s, slab, object,
- "page slab pointer corrupt.");
+ "page slab pointer corrupt.");
+ }
return 0;
}
return 1;
@@ -4241,6 +4264,7 @@ static void *___kmalloc_large_node(size_t size, gfp_t flags, int node)
ptr = folio_address(folio);
lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
PAGE_SIZE << order);
+ __folio_set_large_kmalloc(folio);
}
ptr = kasan_kmalloc_large(ptr, size, flags);
@@ -4716,6 +4740,11 @@ static void free_large_kmalloc(struct folio *folio, void *object)
{
unsigned int order = folio_order(folio);
+ if (WARN_ON_ONCE(!folio_test_large_kmalloc(folio))) {
+ dump_page(&folio->page, "Not a kmalloc allocation");
+ return;
+ }
+
if (WARN_ON_ONCE(order == 0))
pr_warn_once("object pointer: 0x%p\n", object);
@@ -4725,9 +4754,55 @@ static void free_large_kmalloc(struct folio *folio, void *object)
lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
-(PAGE_SIZE << order));
+ __folio_clear_large_kmalloc(folio);
folio_put(folio);
}
+/*
+ * Given an rcu_head embedded within an object obtained from kvmalloc at an
+ * offset < 4k, free the object in question.
+ */
+void kvfree_rcu_cb(struct rcu_head *head)
+{
+ void *obj = head;
+ struct folio *folio;
+ struct slab *slab;
+ struct kmem_cache *s;
+ void *slab_addr;
+
+ if (is_vmalloc_addr(obj)) {
+ obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
+ vfree(obj);
+ return;
+ }
+
+ folio = virt_to_folio(obj);
+ if (!folio_test_slab(folio)) {
+ /*
+ * The rcu_head offset is always less than the page size, so
+ * there is no need to consider the folio order.
+ */
+ obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
+ free_large_kmalloc(folio, obj);
+ return;
+ }
+
+ slab = folio_slab(folio);
+ s = slab->slab_cache;
+ slab_addr = folio_address(folio);
+
+ if (is_kfence_address(obj)) {
+ obj = kfence_object_start(obj);
+ } else {
+ unsigned int idx = __obj_to_index(s, slab_addr, obj);
+
+ obj = slab_addr + s->size * idx;
+ obj = fixup_red_left(s, obj);
+ }
+
+ slab_free(s, slab, obj, _RET_IP_);
+}
+
/**
* kfree - free previously allocated memory
* @object: pointer returned by kmalloc() or kmem_cache_alloc()
@@ -4878,6 +4953,168 @@ void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags)
}
EXPORT_SYMBOL(krealloc_noprof);
+static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
+{
+ /*
+ * We want to attempt a large physically contiguous block first because
+ * it is less likely to fragment multiple larger blocks and therefore
+ * contributes less to long-term fragmentation than the vmalloc fallback.
+ * However make sure that larger requests are not too disruptive - no
+ * OOM killer and no allocation failure warnings as we have a fallback.
+ */
+ if (size > PAGE_SIZE) {
+ flags |= __GFP_NOWARN;
+
+ if (!(flags & __GFP_RETRY_MAYFAIL))
+ flags |= __GFP_NORETRY;
+
+ /* nofail semantic is implemented by the vmalloc fallback */
+ flags &= ~__GFP_NOFAIL;
+ }
+
+ return flags;
+}
+
+/**
+ * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon
+ * failure, fall back to non-contiguous (vmalloc) allocation.
+ * @size: size of the request.
+ * @b: which set of kmalloc buckets to allocate from.
+ * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
+ * @node: numa node to allocate from
+ *
+ * Uses kmalloc to get the memory but if the allocation fails then falls back
+ * to the vmalloc allocator. Use kvfree for freeing the memory.
+ *
+ * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier.
+ * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
+ * preferable to the vmalloc fallback, due to visible performance drawbacks.
+ *
+ * Return: pointer to the allocated memory or %NULL in case of failure
+ */
+void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
+{
+ void *ret;
+
+ /*
+ * It doesn't really make sense to fallback to vmalloc for sub page
+ * requests
+ */
+ ret = __do_kmalloc_node(size, PASS_BUCKET_PARAM(b),
+ kmalloc_gfp_adjust(flags, size),
+ node, _RET_IP_);
+ if (ret || size <= PAGE_SIZE)
+ return ret;
+
+ /* non-sleeping allocations are not supported by vmalloc */
+ if (!gfpflags_allow_blocking(flags))
+ return NULL;
+
+ /* Don't even allow crazy sizes */
+ if (unlikely(size > INT_MAX)) {
+ WARN_ON_ONCE(!(flags & __GFP_NOWARN));
+ return NULL;
+ }
+
+ /*
+ * kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
+ * since the callers already cannot assume anything
+ * about the resulting pointer, and cannot play
+ * protection games.
+ */
+ return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
+ flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
+ node, __builtin_return_address(0));
+}
+EXPORT_SYMBOL(__kvmalloc_node_noprof);
+
+/**
+ * kvfree() - Free memory.
+ * @addr: Pointer to allocated memory.
+ *
+ * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
+ * It is slightly more efficient to use kfree() or vfree() if you are certain
+ * that you know which one to use.
+ *
+ * Context: Either preemptible task context or not-NMI interrupt.
+ */
+void kvfree(const void *addr)
+{
+ if (is_vmalloc_addr(addr))
+ vfree(addr);
+ else
+ kfree(addr);
+}
+EXPORT_SYMBOL(kvfree);
+
+/**
+ * kvfree_sensitive - Free a data object containing sensitive information.
+ * @addr: address of the data object to be freed.
+ * @len: length of the data object.
+ *
+ * Use the special memzero_explicit() function to clear the content of a
+ * kvmalloc'ed object containing sensitive data to make sure that the
+ * compiler won't optimize out the data clearing.
+ */
+void kvfree_sensitive(const void *addr, size_t len)
+{
+ if (likely(!ZERO_OR_NULL_PTR(addr))) {
+ memzero_explicit((void *)addr, len);
+ kvfree(addr);
+ }
+}
+EXPORT_SYMBOL(kvfree_sensitive);
+
+/**
+ * kvrealloc - reallocate memory; contents remain unchanged
+ * @p: object to reallocate memory for
+ * @size: the size to reallocate
+ * @flags: the flags for the page level allocator
+ *
+ * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0
+ * and @p is not a %NULL pointer, the object pointed to is freed.
+ *
+ * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
+ * initial memory allocation, every subsequent call to this API for the same
+ * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
+ * __GFP_ZERO is not fully honored by this API.
+ *
+ * In any case, the contents of the object pointed to are preserved up to the
+ * lesser of the new and old sizes.
+ *
+ * This function must not be called concurrently with itself or kvfree() for the
+ * same memory allocation.
+ *
+ * Return: pointer to the allocated memory or %NULL in case of error
+ */
+void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags)
+{
+ void *n;
+
+ if (is_vmalloc_addr(p))
+ return vrealloc_noprof(p, size, flags);
+
+ n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size));
+ if (!n) {
+ /* We failed to krealloc(), fall back to kvmalloc(). */
+ n = kvmalloc_noprof(size, flags);
+ if (!n)
+ return NULL;
+
+ if (p) {
+ /* We already know that `p` is not a vmalloc address. */
+ kasan_disable_current();
+ memcpy(n, kasan_reset_tag(p), ksize(p));
+ kasan_enable_current();
+
+ kfree(p);
+ }
+ }
+
+ return n;
+}
+EXPORT_SYMBOL(kvrealloc_noprof);
+
struct detached_freelist {
struct slab *slab;
void *tail;
@@ -5570,14 +5807,14 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s)
return !!oo_objects(s->oo);
}
-static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
- const char *text)
+static void list_slab_objects(struct kmem_cache *s, struct slab *slab)
{
#ifdef CONFIG_SLUB_DEBUG
void *addr = slab_address(slab);
void *p;
- slab_err(s, slab, text, s->name);
+ if (!slab_add_kunit_errors())
+ slab_bug(s, "Objects remaining on __kmem_cache_shutdown()");
spin_lock(&object_map_lock);
__fill_map(object_map, s, slab);
@@ -5592,6 +5829,8 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
}
}
spin_unlock(&object_map_lock);
+
+ __slab_err(slab);
#endif
}
@@ -5612,8 +5851,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
remove_partial(n, slab);
list_add(&slab->slab_list, &discard);
} else {
- list_slab_objects(s, slab,
- "Objects remaining in %s on __kmem_cache_shutdown()");
+ list_slab_objects(s, slab);
}
}
spin_unlock_irq(&n->list_lock);
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index be39078f255b..1007c30f12e2 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -58,9 +58,11 @@ static unsigned short __swap_cgroup_id_xchg(struct swap_cgroup *map,
* entries must not have been charged
*
* @folio: the folio that the swap entry belongs to
+ * @id: mem_cgroup ID to be recorded
* @ent: the first swap entry to be recorded
*/
-void swap_cgroup_record(struct folio *folio, swp_entry_t ent)
+void swap_cgroup_record(struct folio *folio, unsigned short id,
+ swp_entry_t ent)
{
unsigned int nr_ents = folio_nr_pages(folio);
struct swap_cgroup *map;
@@ -72,8 +74,7 @@ void swap_cgroup_record(struct folio *folio, swp_entry_t ent)
map = swap_cgroup_ctrl[swp_type(ent)].map;
do {
- old = __swap_cgroup_id_xchg(map, offset,
- mem_cgroup_id(folio_memcg(folio)));
+ old = __swap_cgroup_id_xchg(map, offset, id);
VM_BUG_ON(old);
} while (++offset != end);
}
diff --git a/mm/usercopy.c b/mm/usercopy.c
index 83c164aba6e0..dbdcc43964fb 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -17,7 +17,7 @@
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
-#include <linux/thread_info.h>
+#include <linux/ucopysize.h>
#include <linux/vmalloc.h>
#include <linux/atomic.h>
#include <linux/jump_label.h>
@@ -201,7 +201,9 @@ static inline void check_heap_object(const void *ptr, unsigned long n,
}
}
-static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks);
+DEFINE_STATIC_KEY_MAYBE_RO(CONFIG_HARDENED_USERCOPY_DEFAULT_ON,
+ validate_usercopy_range);
+EXPORT_SYMBOL(validate_usercopy_range);
/*
* Validates that the given object is:
@@ -212,9 +214,6 @@ static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks);
*/
void __check_object_size(const void *ptr, unsigned long n, bool to_user)
{
- if (static_branch_unlikely(&bypass_usercopy_checks))
- return;
-
/* Skip all tests if size is zero. */
if (!n)
return;
@@ -255,7 +254,8 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user)
}
EXPORT_SYMBOL(__check_object_size);
-static bool enable_checks __initdata = true;
+static bool enable_checks __initdata =
+ IS_ENABLED(CONFIG_HARDENED_USERCOPY_DEFAULT_ON);
static int __init parse_hardened_usercopy(char *str)
{
@@ -269,8 +269,10 @@ __setup("hardened_usercopy=", parse_hardened_usercopy);
static int __init set_hardened_usercopy(void)
{
- if (enable_checks == false)
- static_branch_enable(&bypass_usercopy_checks);
+ if (enable_checks)
+ static_branch_enable(&validate_usercopy_range);
+ else
+ static_branch_disable(&validate_usercopy_range);
return 1;
}
diff --git a/mm/util.c b/mm/util.c
index 8c965474d329..e7d81371032b 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -615,168 +615,6 @@ unsigned long vm_mmap(struct file *file, unsigned long addr,
}
EXPORT_SYMBOL(vm_mmap);
-static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
-{
- /*
- * We want to attempt a large physically contiguous block first because
- * it is less likely to fragment multiple larger blocks and therefore
- * contribute to a long term fragmentation less than vmalloc fallback.
- * However make sure that larger requests are not too disruptive - no
- * OOM killer and no allocation failure warnings as we have a fallback.
- */
- if (size > PAGE_SIZE) {
- flags |= __GFP_NOWARN;
-
- if (!(flags & __GFP_RETRY_MAYFAIL))
- flags |= __GFP_NORETRY;
-
- /* nofail semantic is implemented by the vmalloc fallback */
- flags &= ~__GFP_NOFAIL;
- }
-
- return flags;
-}
-
-/**
- * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon
- * failure, fall back to non-contiguous (vmalloc) allocation.
- * @size: size of the request.
- * @b: which set of kmalloc buckets to allocate from.
- * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
- * @node: numa node to allocate from
- *
- * Uses kmalloc to get the memory but if the allocation fails then falls back
- * to the vmalloc allocator. Use kvfree for freeing the memory.
- *
- * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier.
- * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
- * preferable to the vmalloc fallback, due to visible performance drawbacks.
- *
- * Return: pointer to the allocated memory of %NULL in case of failure
- */
-void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
-{
- void *ret;
-
- /*
- * It doesn't really make sense to fallback to vmalloc for sub page
- * requests
- */
- ret = __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, b),
- kmalloc_gfp_adjust(flags, size),
- node);
- if (ret || size <= PAGE_SIZE)
- return ret;
-
- /* non-sleeping allocations are not supported by vmalloc */
- if (!gfpflags_allow_blocking(flags))
- return NULL;
-
- /* Don't even allow crazy sizes */
- if (unlikely(size > INT_MAX)) {
- WARN_ON_ONCE(!(flags & __GFP_NOWARN));
- return NULL;
- }
-
- /*
- * kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
- * since the callers already cannot assume anything
- * about the resulting pointer, and cannot play
- * protection games.
- */
- return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
- flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
- node, __builtin_return_address(0));
-}
-EXPORT_SYMBOL(__kvmalloc_node_noprof);
-
-/**
- * kvfree() - Free memory.
- * @addr: Pointer to allocated memory.
- *
- * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
- * It is slightly more efficient to use kfree() or vfree() if you are certain
- * that you know which one to use.
- *
- * Context: Either preemptible task context or not-NMI interrupt.
- */
-void kvfree(const void *addr)
-{
- if (is_vmalloc_addr(addr))
- vfree(addr);
- else
- kfree(addr);
-}
-EXPORT_SYMBOL(kvfree);
-
-/**
- * kvfree_sensitive - Free a data object containing sensitive information.
- * @addr: address of the data object to be freed.
- * @len: length of the data object.
- *
- * Use the special memzero_explicit() function to clear the content of a
- * kvmalloc'ed object containing sensitive data to make sure that the
- * compiler won't optimize out the data clearing.
- */
-void kvfree_sensitive(const void *addr, size_t len)
-{
- if (likely(!ZERO_OR_NULL_PTR(addr))) {
- memzero_explicit((void *)addr, len);
- kvfree(addr);
- }
-}
-EXPORT_SYMBOL(kvfree_sensitive);
-
-/**
- * kvrealloc - reallocate memory; contents remain unchanged
- * @p: object to reallocate memory for
- * @size: the size to reallocate
- * @flags: the flags for the page level allocator
- *
- * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0
- * and @p is not a %NULL pointer, the object pointed to is freed.
- *
- * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
- * initial memory allocation, every subsequent call to this API for the same
- * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
- * __GFP_ZERO is not fully honored by this API.
- *
- * In any case, the contents of the object pointed to are preserved up to the
- * lesser of the new and old sizes.
- *
- * This function must not be called concurrently with itself or kvfree() for the
- * same memory allocation.
- *
- * Return: pointer to the allocated memory or %NULL in case of error
- */
-void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags)
-{
- void *n;
-
- if (is_vmalloc_addr(p))
- return vrealloc_noprof(p, size, flags);
-
- n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size));
- if (!n) {
- /* We failed to krealloc(), fall back to kvmalloc(). */
- n = kvmalloc_noprof(size, flags);
- if (!n)
- return NULL;
-
- if (p) {
- /* We already know that `p` is not a vmalloc address. */
- kasan_disable_current();
- memcpy(n, kasan_reset_tag(p), ksize(p));
- kasan_enable_current();
-
- kfree(p);
- }
- }
-
- return n;
-}
-EXPORT_SYMBOL(kvrealloc_noprof);
-
/**
* __vmalloc_array - allocate memory for a virtually contiguous array.
* @n: number of elements.
diff --git a/mm/vma.c b/mm/vma.c
index 96bcb372c90e..71ca012c616c 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -2381,7 +2381,8 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
* vma_merge_new_range() calls khugepaged_enter_vma() too, the below
* call covers the non-merge case.
*/
- khugepaged_enter_vma(vma, map->flags);
+ if (!vma_is_anonymous(vma))
+ khugepaged_enter_vma(vma, map->flags);
ksm_add_vma(vma);
*vmap = vma;
return 0;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 16bfe1c694dd..88998725f1c5 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1435,6 +1435,8 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_X86
"direct_map_level2_splits",
"direct_map_level3_splits",
+ "direct_map_level2_collapses",
+ "direct_map_level3_collapses",
#endif
#ifdef CONFIG_PER_VMA_LOCK_STATS
"vma_lock_success",