From 6d2453c3dbc5f70eafc1c866289a90a1fc57ce18 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 24 Aug 2022 12:51:00 +0900 Subject: drivers/block/zram/zram_drv.c: do not keep dangling zcomp pointer after zram reset We do all reset operations under write lock, so we don't need to save ->disksize and ->comp to stack variables. Another thing is that ->comp is freed during zram reset, but comp pointer is not NULL-ed, so zram keeps the freed pointer value. Link: https://lkml.kernel.org/r/20220824035100.971816-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 226ea76cc819..add43f6c8bc0 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1710,9 +1710,6 @@ out: static void zram_reset_device(struct zram *zram) { - struct zcomp *comp; - u64 disksize; - down_write(&zram->init_lock); zram->limit_pages = 0; @@ -1722,17 +1719,15 @@ static void zram_reset_device(struct zram *zram) return; } - comp = zram->comp; - disksize = zram->disksize; - zram->disksize = 0; - set_capacity_and_notify(zram->disk, 0); part_stat_set_all(zram->disk->part0, 0); /* I/O operation under all of CPU are done so let's free */ - zram_meta_free(zram, disksize); + zram_meta_free(zram, zram->disksize); + zram->disksize = 0; memset(&zram->stats, 0, sizeof(zram->stats)); - zcomp_destroy(comp); + zcomp_destroy(zram->comp); + zram->comp = NULL; reset_bdev(zram); up_write(&zram->init_lock); -- cgit v1.2.3 From 641608f36244eb01028c2b30a961c7242144d96a Mon Sep 17 00:00:00 2001 From: Alexey Romanov Date: Wed, 24 Aug 2022 14:31:17 +0300 Subject: zram: don't retry compress incompressible page It doesn't make sense for us to retry to compress an uncompressible page (comp_len == PAGE_SIZE) in zsmalloc slowpath, because we will be storing it uncompressed anyway. We can avoid wasting time on another compression attempt. It is enough to take lock (zcomp_stream_get) and execute the code below. Link: https://lkml.kernel.org/r/20220824113117.78849-1-avromanov@sberdevices.ru Signed-off-by: Alexey Romanov Signed-off-by: Dmitry Rokosov Reviewed-by: Sergey Senozhatsky Cc: Alexey Romanov Cc: Dmitry Rokosov Cc: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index add43f6c8bc0..607f4634c27d 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1410,9 +1410,19 @@ compress_again: handle = zs_malloc(zram->mem_pool, comp_len, GFP_NOIO | __GFP_HIGHMEM | __GFP_MOVABLE); - if (!IS_ERR((void *)handle)) + if (IS_ERR((void *)handle)) + return PTR_ERR((void *)handle); + + if (comp_len != PAGE_SIZE) goto compress_again; - return PTR_ERR((void *)handle); + /* + * If the page is not compressible, you need to acquire the lock and + * execute the code below. The zcomp_stream_get() call is needed to + * disable the cpu hotplug and grab the zstrm buffer back. + * It is necessary that the dereferencing of the zstrm variable below + * occurs correctly. + */ + zstrm = zcomp_stream_get(zram->comp); } alloced_pages = zs_get_total_pages(zram->mem_pool); -- cgit v1.2.3 From 639118d1571f70b1157b4bb5ac574b0ab0f38099 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Sat, 27 Aug 2022 19:20:43 +0800 Subject: mm: kill is_memblock_offlined() Directly check state of struct memory_block, no need a single function. Link: https://lkml.kernel.org/r/20220827112043.187028-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- drivers/base/memory.c | 6 ------ include/linux/memory_hotplug.h | 2 -- mm/memory_hotplug.c | 3 +-- 3 files changed, 1 insertion(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index bc60c9cd3230..9aa0da991cfb 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -869,12 +869,6 @@ void remove_memory_block_devices(unsigned long start, unsigned long size) } } -/* return true if the memory block is offlined, otherwise, return false */ -bool is_memblock_offlined(struct memory_block *mem) -{ - return mem->state == MEM_OFFLINE; -} - static struct attribute *memory_root_attrs[] = { #ifdef CONFIG_ARCH_MEMORY_PROBE &dev_attr_probe.attr, diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index e0b2209ab71c..54675791bc50 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -11,7 +11,6 @@ struct page; struct zone; struct pglist_data; struct mem_section; -struct memory_block; struct memory_group; struct resource; struct vmem_altmap; @@ -333,7 +332,6 @@ extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, extern void remove_pfn_range_from_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); -extern bool is_memblock_offlined(struct memory_block *mem); extern int sparse_add_section(int nid, unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap, struct dev_pagemap *pgmap); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fad6d1f2262a..dc727aee4ad3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1969,11 +1969,10 @@ failed_removal: static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) { - int ret = !is_memblock_offlined(mem); int *nid = arg; *nid = mem->nid; - if (unlikely(ret)) { + if (unlikely(mem->state != MEM_OFFLINE)) { phys_addr_t beginpa, endpa; beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); -- cgit v1.2.3 From 7b88bda3761b95856cf97822efe8281c8100067b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 18 Aug 2022 18:40:36 +0530 Subject: mm/demotion/dax/kmem: set node's abstract distance to MEMTIER_DEFAULT_DAX_ADISTANCE By default, all nodes are assigned to the default memory tier which is the memory tier designated for nodes with DRAM Set dax kmem device node's tier to slower memory tier by assigning abstract distance to MEMTIER_DEFAULT_DAX_ADISTANCE. Low-level drivers like papr_scm or ACPI NFIT can initialize memory device type to a more accurate value based on device tree details or HMAT. If the kernel doesn't find the memory type initialized, a default slower memory type is assigned by the kmem driver. [aneesh.kumar@linux.ibm.com: assign correct memory type for multiple dax devices with the same node affinity] Link: https://lkml.kernel.org/r/20220826100224.542312-1-aneesh.kumar@linux.ibm.com Link: https://lkml.kernel.org/r/20220818131042.113280-5-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: "Huang, Ying" Acked-by: Wei Xu Cc: Alistair Popple Cc: Bharata B Rao Cc: Dan Williams Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Hesham Almatary Cc: Jagdish Gediya Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Tim Chen Cc: Yang Shi Cc: SeongJae Park Signed-off-by: Andrew Morton --- drivers/dax/kmem.c | 42 ++++++++++++++-- include/linux/memory-tiers.h | 42 +++++++++++++++- mm/memory-tiers.c | 114 ++++++++++++++++++++++++++++++++++--------- 3 files changed, 171 insertions(+), 27 deletions(-) (limited to 'drivers') diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c index a37622060fff..4852a2dbdb27 100644 --- a/drivers/dax/kmem.c +++ b/drivers/dax/kmem.c @@ -11,9 +11,17 @@ #include #include #include +#include #include "dax-private.h" #include "bus.h" +/* + * Default abstract distance assigned to the NUMA node onlined + * by DAX/kmem if the low level platform driver didn't initialize + * one for this NUMA node. + */ +#define MEMTIER_DEFAULT_DAX_ADISTANCE (MEMTIER_ADISTANCE_DRAM * 5) + /* Memory resource name used for add_memory_driver_managed(). */ static const char *kmem_name; /* Set if any memory will remain added when the driver will be unloaded. */ @@ -41,6 +49,7 @@ struct dax_kmem_data { struct resource *res[]; }; +static struct memory_dev_type *dax_slowmem_type; static int dev_dax_kmem_probe(struct dev_dax *dev_dax) { struct device *dev = &dev_dax->dev; @@ -79,11 +88,13 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax) return -EINVAL; } + init_node_memory_type(numa_node, dax_slowmem_type); + + rc = -ENOMEM; data = kzalloc(struct_size(data, res, dev_dax->nr_range), GFP_KERNEL); if (!data) - return -ENOMEM; + goto err_dax_kmem_data; - rc = -ENOMEM; data->res_name = kstrdup(dev_name(dev), GFP_KERNEL); if (!data->res_name) goto err_res_name; @@ -155,6 +166,8 @@ err_reg_mgid: kfree(data->res_name); err_res_name: kfree(data); +err_dax_kmem_data: + clear_node_memory_type(numa_node, dax_slowmem_type); return rc; } @@ -162,6 +175,7 @@ err_res_name: static void dev_dax_kmem_remove(struct dev_dax *dev_dax) { int i, success = 0; + int node = dev_dax->target_node; struct device *dev = &dev_dax->dev; struct dax_kmem_data *data = dev_get_drvdata(dev); @@ -198,6 +212,14 @@ static void dev_dax_kmem_remove(struct dev_dax *dev_dax) kfree(data->res_name); kfree(data); dev_set_drvdata(dev, NULL); + /* + * Clear the memtype association on successful unplug. + * If not, we have memory blocks left which can be + * offlined/onlined later. We need to keep memory_dev_type + * for that. This implies this reference will be around + * till next reboot. + */ + clear_node_memory_type(node, dax_slowmem_type); } } #else @@ -228,9 +250,22 @@ static int __init dax_kmem_init(void) if (!kmem_name) return -ENOMEM; + dax_slowmem_type = alloc_memory_type(MEMTIER_DEFAULT_DAX_ADISTANCE); + if (IS_ERR(dax_slowmem_type)) { + rc = PTR_ERR(dax_slowmem_type); + goto err_dax_slowmem_type; + } + rc = dax_driver_register(&device_dax_kmem_driver); if (rc) - kfree_const(kmem_name); + goto error_dax_driver; + + return rc; + +error_dax_driver: + destroy_memory_type(dax_slowmem_type); +err_dax_slowmem_type: + kfree_const(kmem_name); return rc; } @@ -239,6 +274,7 @@ static void __exit dax_kmem_exit(void) dax_driver_unregister(&device_dax_kmem_driver); if (!any_hotremove_failed) kfree_const(kmem_name); + destroy_memory_type(dax_slowmem_type); } MODULE_AUTHOR("Intel Corporation"); diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 7ae6308b2191..49d281866dca 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -2,6 +2,9 @@ #ifndef _LINUX_MEMORY_TIERS_H #define _LINUX_MEMORY_TIERS_H +#include +#include +#include /* * Each tier cover a abstrace distance chunk size of 128 */ @@ -16,12 +19,49 @@ #define MEMTIER_ADISTANCE_DRAM ((4 * MEMTIER_CHUNK_SIZE) + (MEMTIER_CHUNK_SIZE >> 1)) #define MEMTIER_HOTPLUG_PRIO 100 +struct memory_tier; +struct memory_dev_type { + /* list of memory types that are part of same tier as this type */ + struct list_head tier_sibiling; + /* abstract distance for this specific memory type */ + int adistance; + /* Nodes of same abstract distance */ + nodemask_t nodes; + struct kref kref; + struct memory_tier *memtier; +}; + #ifdef CONFIG_NUMA -#include extern bool numa_demotion_enabled; +struct memory_dev_type *alloc_memory_type(int adistance); +void destroy_memory_type(struct memory_dev_type *memtype); +void init_node_memory_type(int node, struct memory_dev_type *default_type); +void clear_node_memory_type(int node, struct memory_dev_type *memtype); #else #define numa_demotion_enabled false +/* + * CONFIG_NUMA implementation returns non NULL error. + */ +static inline struct memory_dev_type *alloc_memory_type(int adistance) +{ + return NULL; +} + +static inline void destroy_memory_type(struct memory_dev_type *memtype) +{ + +} + +static inline void init_node_memory_type(int node, struct memory_dev_type *default_type) +{ + +} + +static inline void clear_node_memory_type(int node, struct memory_dev_type *memtype) +{ + +} #endif /* CONFIG_NUMA */ #endif /* _LINUX_MEMORY_TIERS_H */ diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index f74e077533ac..babc10a5cc13 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -1,6 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -#include -#include #include #include #include @@ -21,27 +19,15 @@ struct memory_tier { int adistance_start; }; -struct memory_dev_type { - /* list of memory types that are part of same tier as this type */ - struct list_head tier_sibiling; - /* abstract distance for this specific memory type */ - int adistance; - /* Nodes of same abstract distance */ - nodemask_t nodes; - struct memory_tier *memtier; +struct node_memory_type_map { + struct memory_dev_type *memtype; + int map_count; }; static DEFINE_MUTEX(memory_tier_lock); static LIST_HEAD(memory_tiers); -static struct memory_dev_type *node_memory_types[MAX_NUMNODES]; -/* - * For now we can have 4 faster memory tiers with smaller adistance - * than default DRAM tier. - */ -static struct memory_dev_type default_dram_type = { - .adistance = MEMTIER_ADISTANCE_DRAM, - .tier_sibiling = LIST_HEAD_INIT(default_dram_type.tier_sibiling), -}; +static struct node_memory_type_map node_memory_types[MAX_NUMNODES]; +static struct memory_dev_type *default_dram_type; static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype) { @@ -87,6 +73,24 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty return new_memtier; } +static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype) +{ + if (!node_memory_types[node].memtype) + node_memory_types[node].memtype = memtype; + /* + * for each device getting added in the same NUMA node + * with this specific memtype, bump the map count. We + * Only take memtype device reference once, so that + * changing a node memtype can be done by droping the + * only reference count taken here. + */ + + if (node_memory_types[node].memtype == memtype) { + if (!node_memory_types[node].map_count++) + kref_get(&memtype->kref); + } +} + static struct memory_tier *set_node_memory_tier(int node) { struct memory_tier *memtier; @@ -97,10 +101,9 @@ static struct memory_tier *set_node_memory_tier(int node) if (!node_state(node, N_MEMORY)) return ERR_PTR(-EINVAL); - if (!node_memory_types[node]) - node_memory_types[node] = &default_dram_type; + __init_node_memory_type(node, default_dram_type); - memtype = node_memory_types[node]; + memtype = node_memory_types[node].memtype; node_set(node, memtype->nodes); memtier = find_create_memory_tier(memtype); return memtier; @@ -131,7 +134,7 @@ static bool clear_node_memory_tier(int node) if (memtier) { struct memory_dev_type *memtype; - memtype = node_memory_types[node]; + memtype = node_memory_types[node].memtype; node_clear(node, memtype->nodes); if (nodes_empty(memtype->nodes)) { list_del_init(&memtype->tier_sibiling); @@ -144,6 +147,63 @@ static bool clear_node_memory_tier(int node) return cleared; } +static void release_memtype(struct kref *kref) +{ + struct memory_dev_type *memtype; + + memtype = container_of(kref, struct memory_dev_type, kref); + kfree(memtype); +} + +struct memory_dev_type *alloc_memory_type(int adistance) +{ + struct memory_dev_type *memtype; + + memtype = kmalloc(sizeof(*memtype), GFP_KERNEL); + if (!memtype) + return ERR_PTR(-ENOMEM); + + memtype->adistance = adistance; + INIT_LIST_HEAD(&memtype->tier_sibiling); + memtype->nodes = NODE_MASK_NONE; + memtype->memtier = NULL; + kref_init(&memtype->kref); + return memtype; +} +EXPORT_SYMBOL_GPL(alloc_memory_type); + +void destroy_memory_type(struct memory_dev_type *memtype) +{ + kref_put(&memtype->kref, release_memtype); +} +EXPORT_SYMBOL_GPL(destroy_memory_type); + +void init_node_memory_type(int node, struct memory_dev_type *memtype) +{ + + mutex_lock(&memory_tier_lock); + __init_node_memory_type(node, memtype); + mutex_unlock(&memory_tier_lock); +} +EXPORT_SYMBOL_GPL(init_node_memory_type); + +void clear_node_memory_type(int node, struct memory_dev_type *memtype) +{ + mutex_lock(&memory_tier_lock); + if (node_memory_types[node].memtype == memtype) + node_memory_types[node].map_count--; + /* + * If we umapped all the attached devices to this node, + * clear the node memory type. + */ + if (!node_memory_types[node].map_count) { + node_memory_types[node].memtype = NULL; + kref_put(&memtype->kref, release_memtype); + } + mutex_unlock(&memory_tier_lock); +} +EXPORT_SYMBOL_GPL(clear_node_memory_type); + static int __meminit memtier_hotplug_callback(struct notifier_block *self, unsigned long action, void *_arg) { @@ -178,6 +238,14 @@ static int __init memory_tier_init(void) struct memory_tier *memtier; mutex_lock(&memory_tier_lock); + /* + * For now we can have 4 faster memory tiers with smaller adistance + * than default DRAM tier. + */ + default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM); + if (!default_dram_type) + panic("%s() failed to allocate default DRAM tier\n", __func__); + /* * Look at all the existing N_MEMORY nodes and add them to * default memory tier or to a tier if we already have memory -- cgit v1.2.3 From d4af56c5c7c6781ca6ca8075e2cf5bc119ed33d1 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:45 +0000 Subject: mm: start tracking VMAs with maple tree Start tracking the VMAs with the new maple tree structure in parallel with the rb_tree. Add debug and trace events for maple tree operations and duplicate the rb_tree that is created on forks into the maple tree. The maple tree is added to the mm_struct including the mm_init struct, added support in required mm/mmap functions, added tracking in kernel/fork for process forking, and used to find the unmapped_area and checked against what the rbtree finds. This also moves the mmap_lock() in exit_mmap() since the oom reaper call does walk the VMAs. Otherwise lockdep will be unhappy if oom happens. When splitting a vma fails due to allocations of the maple tree nodes, the error path in __split_vma() calls new->vm_ops->close(new). The page accounting for hugetlb is actually in the close() operation, so it accounts for the removal of 1/2 of the VMA which was not adjusted. This results in a negative exit value. To avoid the negative charge, set vm_start = vm_end and vm_pgoff = 0. There is also a potential accounting issue in special mappings from insert_vm_struct() failing to allocate, so reverse the charge there in the failure scenario. Link: https://lkml.kernel.org/r/20220906194824.2110408-9-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/kernel/tboot.c | 1 + drivers/firmware/efi/efi.c | 1 + include/linux/mm.h | 5 + include/linux/mm_types.h | 3 + include/trace/events/mmap.h | 73 +++++++++ kernel/fork.c | 20 ++- mm/init-mm.c | 2 + mm/mmap.c | 353 +++++++++++++++++++++++++++++++++++++++----- mm/nommu.c | 13 ++ 9 files changed, 435 insertions(+), 36 deletions(-) (limited to 'drivers') diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 3bacd935f840..e01544202651 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -96,6 +96,7 @@ void __init tboot_probe(void) static pgd_t *tboot_pg_dir; static struct mm_struct tboot_mm = { .mm_rb = RB_ROOT, + .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, tboot_mm.mmap_lock), .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index e4080ad96089..7b6a815b79d3 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -58,6 +58,7 @@ static unsigned long __initdata rt_prop = EFI_INVALID_TABLE_ADDR; struct mm_struct efi_mm = { .mm_rb = RB_ROOT, + .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, efi_mm.mmap_lock), .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), .write_protect_seq = SEQCNT_ZERO(efi_mm.write_protect_seq), diff --git a/include/linux/mm.h b/include/linux/mm.h index 7cc9ffc19e7f..896d04248e66 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2567,6 +2567,8 @@ extern bool arch_has_descending_max_zone_pfns(void); /* nommu.c */ extern atomic_long_t mmap_pages_allocated; extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); +/* mmap.c */ +void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas); /* interval_tree.c */ void vma_interval_tree_insert(struct vm_area_struct *node, @@ -2630,6 +2632,9 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **, bool *need_rmap_locks); extern void exit_mmap(struct mm_struct *); +void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas); +void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas); + static inline int check_data_rlimit(unsigned long rlim, unsigned long new, unsigned long start, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e1797813cc2c..425bc5f7d477 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -486,6 +487,7 @@ struct kioctx_table; struct mm_struct { struct { struct vm_area_struct *mmap; /* list of VMAs */ + struct maple_tree mm_mt; struct rb_root mm_rb; u64 vmacache_seqnum; /* per-thread vmacache */ #ifdef CONFIG_MMU @@ -697,6 +699,7 @@ struct mm_struct { unsigned long cpu_bitmap[]; }; +#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN) extern struct mm_struct init_mm; /* Pointer magic because the dynamic array size confuses some compilers. */ diff --git a/include/trace/events/mmap.h b/include/trace/events/mmap.h index 4661f7ba07c0..216de5f03621 100644 --- a/include/trace/events/mmap.h +++ b/include/trace/events/mmap.h @@ -42,6 +42,79 @@ TRACE_EVENT(vm_unmapped_area, __entry->low_limit, __entry->high_limit, __entry->align_mask, __entry->align_offset) ); + +TRACE_EVENT(vma_mas_szero, + TP_PROTO(struct maple_tree *mt, unsigned long start, + unsigned long end), + + TP_ARGS(mt, start, end), + + TP_STRUCT__entry( + __field(struct maple_tree *, mt) + __field(unsigned long, start) + __field(unsigned long, end) + ), + + TP_fast_assign( + __entry->mt = mt; + __entry->start = start; + __entry->end = end; + ), + + TP_printk("mt_mod %p, (NULL), SNULL, %lu, %lu,", + __entry->mt, + (unsigned long) __entry->start, + (unsigned long) __entry->end + ) +); + +TRACE_EVENT(vma_store, + TP_PROTO(struct maple_tree *mt, struct vm_area_struct *vma), + + TP_ARGS(mt, vma), + + TP_STRUCT__entry( + __field(struct maple_tree *, mt) + __field(struct vm_area_struct *, vma) + __field(unsigned long, vm_start) + __field(unsigned long, vm_end) + ), + + TP_fast_assign( + __entry->mt = mt; + __entry->vma = vma; + __entry->vm_start = vma->vm_start; + __entry->vm_end = vma->vm_end - 1; + ), + + TP_printk("mt_mod %p, (%p), STORE, %lu, %lu,", + __entry->mt, __entry->vma, + (unsigned long) __entry->vm_start, + (unsigned long) __entry->vm_end + ) +); + + +TRACE_EVENT(exit_mmap, + TP_PROTO(struct mm_struct *mm), + + TP_ARGS(mm), + + TP_STRUCT__entry( + __field(struct mm_struct *, mm) + __field(struct maple_tree *, mt) + ), + + TP_fast_assign( + __entry->mm = mm; + __entry->mt = &mm->mm_mt; + ), + + TP_printk("mt_mod %p, DESTROY\n", + __entry->mt + ) +); + #endif /* This part must be outside protection */ diff --git a/kernel/fork.c b/kernel/fork.c index d2da065442af..273364207f17 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -585,6 +585,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, int retval; unsigned long charge; LIST_HEAD(uf); + MA_STATE(mas, &mm->mm_mt, 0, 0); uprobe_start_dup_mmap(); if (mmap_write_lock_killable(oldmm)) { @@ -614,6 +615,10 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, goto out; khugepaged_fork(mm, oldmm); + retval = mas_expected_entries(&mas, oldmm->map_count); + if (retval) + goto out; + prev = NULL; for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { struct file *file; @@ -629,7 +634,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, */ if (fatal_signal_pending(current)) { retval = -EINTR; - goto out; + goto loop_out; } if (mpnt->vm_flags & VM_ACCOUNT) { unsigned long len = vma_pages(mpnt); @@ -694,6 +699,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, rb_link = &tmp->vm_rb.rb_right; rb_parent = &tmp->vm_rb; + /* Link the vma into the MT */ + mas.index = tmp->vm_start; + mas.last = tmp->vm_end - 1; + mas_store(&mas, tmp); + mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) retval = copy_page_range(tmp, mpnt); @@ -702,10 +712,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, tmp->vm_ops->open(tmp); if (retval) - goto out; + goto loop_out; } /* a new mm has just been created */ retval = arch_dup_mmap(oldmm, mm); +loop_out: + mas_destroy(&mas); out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); @@ -721,7 +733,7 @@ fail_nomem_policy: fail_nomem: retval = -ENOMEM; vm_unacct_memory(charge); - goto out; + goto loop_out; } static inline int mm_alloc_pgd(struct mm_struct *mm) @@ -1111,6 +1123,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, { mm->mmap = NULL; mm->mm_rb = RB_ROOT; + mt_init_flags(&mm->mm_mt, MM_MT_FLAGS); + mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock); mm->vmacache_seqnum = 0; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); diff --git a/mm/init-mm.c b/mm/init-mm.c index fbe7844d0912..b912b0f2eced 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include #include @@ -29,6 +30,7 @@ */ struct mm_struct init_mm = { .mm_rb = RB_ROOT, + .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, init_mm.mmap_lock), .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), diff --git a/mm/mmap.c b/mm/mmap.c index dd25a2aa94f7..5115eea6a0e6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -334,7 +334,71 @@ static int browse_rb(struct mm_struct *mm) } return bug ? -1 : i; } +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) +extern void mt_validate(struct maple_tree *mt); +extern void mt_dump(const struct maple_tree *mt); +/* Validate the maple tree */ +static void validate_mm_mt(struct mm_struct *mm) +{ + struct maple_tree *mt = &mm->mm_mt; + struct vm_area_struct *vma_mt, *vma = mm->mmap; + + MA_STATE(mas, mt, 0, 0); + + mt_validate(&mm->mm_mt); + mas_for_each(&mas, vma_mt, ULONG_MAX) { + if (xa_is_zero(vma_mt)) + continue; + + if (!vma) + break; + + if ((vma != vma_mt) || + (vma->vm_start != vma_mt->vm_start) || + (vma->vm_end != vma_mt->vm_end) || + (vma->vm_start != mas.index) || + (vma->vm_end - 1 != mas.last)) { + pr_emerg("issue in %s\n", current->comm); + dump_stack(); +#ifdef CONFIG_DEBUG_VM + dump_vma(vma_mt); + pr_emerg("and next in rb\n"); + dump_vma(vma->vm_next); +#endif + pr_emerg("mt piv: %p %lu - %lu\n", vma_mt, + mas.index, mas.last); + pr_emerg("mt vma: %p %lu - %lu\n", vma_mt, + vma_mt->vm_start, vma_mt->vm_end); + pr_emerg("rb vma: %p %lu - %lu\n", vma, + vma->vm_start, vma->vm_end); + pr_emerg("rb->next = %p %lu - %lu\n", vma->vm_next, + vma->vm_next->vm_start, vma->vm_next->vm_end); + + mt_dump(mas.tree); + if (vma_mt->vm_end != mas.last + 1) { + pr_err("vma: %p vma_mt %lu-%lu\tmt %lu-%lu\n", + mm, vma_mt->vm_start, vma_mt->vm_end, + mas.index, mas.last); + mt_dump(mas.tree); + } + VM_BUG_ON_MM(vma_mt->vm_end != mas.last + 1, mm); + if (vma_mt->vm_start != mas.index) { + pr_err("vma: %p vma_mt %p %lu - %lu doesn't match\n", + mm, vma_mt, vma_mt->vm_start, vma_mt->vm_end); + mt_dump(mas.tree); + } + VM_BUG_ON_MM(vma_mt->vm_start != mas.index, mm); + } + VM_BUG_ON(vma != vma_mt); + vma = vma->vm_next; + + } + VM_BUG_ON(vma); +} +#else +#define validate_mm_mt(root) do { } while (0) +#endif static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) { struct rb_node *nd; @@ -389,6 +453,7 @@ static void validate_mm(struct mm_struct *mm) } #else #define validate_mm_rb(root, ignore) do { } while (0) +#define validate_mm_mt(root) do { } while (0) #define validate_mm(mm) do { } while (0) #endif @@ -633,6 +698,56 @@ static void __vma_link_file(struct vm_area_struct *vma) } } +/* + * vma_mas_store() - Store a VMA in the maple tree. + * @vma: The vm_area_struct + * @mas: The maple state + * + * Efficient way to store a VMA in the maple tree when the @mas has already + * walked to the correct location. + * + * Note: the end address is inclusive in the maple tree. + */ +void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas) +{ + trace_vma_store(mas->tree, vma); + mas_set_range(mas, vma->vm_start, vma->vm_end - 1); + mas_store_prealloc(mas, vma); +} + +/* + * vma_mas_remove() - Remove a VMA from the maple tree. + * @vma: The vm_area_struct + * @mas: The maple state + * + * Efficient way to remove a VMA from the maple tree when the @mas has already + * been established and points to the correct location. + * Note: the end address is inclusive in the maple tree. + */ +void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas) +{ + trace_vma_mas_szero(mas->tree, vma->vm_start, vma->vm_end - 1); + mas->index = vma->vm_start; + mas->last = vma->vm_end - 1; + mas_store_prealloc(mas, NULL); +} + +/* + * vma_mas_szero() - Set a given range to zero. Used when modifying a + * vm_area_struct start or end. + * + * @mm: The struct_mm + * @start: The start address to zero + * @end: The end address to zero. + */ +static inline void vma_mas_szero(struct ma_state *mas, unsigned long start, + unsigned long end) +{ + trace_vma_mas_szero(mas->tree, start, end - 1); + mas_set_range(mas, start, end - 1); + mas_store_prealloc(mas, NULL); +} + static void __vma_link(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node **rb_link, @@ -642,17 +757,22 @@ __vma_link(struct mm_struct *mm, struct vm_area_struct *vma, __vma_link_rb(mm, vma, rb_link, rb_parent); } -static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, +static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node **rb_link, struct rb_node *rb_parent) { + MA_STATE(mas, &mm->mm_mt, 0, 0); struct address_space *mapping = NULL; + if (mas_preallocate(&mas, vma, GFP_KERNEL)) + return -ENOMEM; + if (vma->vm_file) { mapping = vma->vm_file->f_mapping; i_mmap_lock_write(mapping); } + vma_mas_store(vma, &mas); __vma_link(mm, vma, prev, rb_link, rb_parent); __vma_link_file(vma); @@ -661,13 +781,15 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, mm->map_count++; validate_mm(mm); + return 0; } /* * Helper for vma_adjust() in the split_vma insert case: insert a vma into the * mm's list and rbtree. It has already been inserted into the interval tree. */ -static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) +static void __insert_vm_struct(struct mm_struct *mm, struct ma_state *mas, + struct vm_area_struct *vma) { struct vm_area_struct *prev; struct rb_node **rb_link, *rb_parent; @@ -675,7 +797,10 @@ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) if (find_vma_links(mm, vma->vm_start, vma->vm_end, &prev, &rb_link, &rb_parent)) BUG(); - __vma_link(mm, vma, prev, rb_link, rb_parent); + + vma_mas_store(vma, mas); + __vma_link_list(mm, vma, prev); + __vma_link_rb(mm, vma, rb_link, rb_parent); mm->map_count++; } @@ -702,6 +827,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *next = vma->vm_next, *orig_vma = vma; + struct vm_area_struct *next_next; struct address_space *mapping = NULL; struct rb_root_cached *root = NULL; struct anon_vma *anon_vma = NULL; @@ -709,10 +835,13 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, bool start_changed = false, end_changed = false; long adjust_next = 0; int remove_next = 0; + MA_STATE(mas, &mm->mm_mt, 0, 0); + struct vm_area_struct *exporter = NULL, *importer = NULL; - if (next && !insert) { - struct vm_area_struct *exporter = NULL, *importer = NULL; + validate_mm(mm); + validate_mm_mt(mm); + if (next && !insert) { if (end >= next->vm_end) { /* * vma expands, overlapping all the next, and @@ -741,10 +870,11 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * remove_next == 1 is case 1 or 7. */ remove_next = 1 + (end > next->vm_end); + if (remove_next == 2) + next_next = find_vma(mm, next->vm_end); + VM_WARN_ON(remove_next == 2 && end != next->vm_next->vm_end); - /* trim end to next, for case 6 first pass */ - end = next->vm_end; } exporter = next; @@ -792,9 +922,11 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, return error; } } -again: - vma_adjust_trans_huge(orig_vma, start, end, adjust_next); + if (mas_preallocate(&mas, vma, GFP_KERNEL)) + return -ENOMEM; + + vma_adjust_trans_huge(orig_vma, start, end, adjust_next); if (file) { mapping = file->f_mapping; root = &mapping->i_mmap; @@ -835,17 +967,28 @@ again: } if (start != vma->vm_start) { + unsigned long old_start = vma->vm_start; vma->vm_start = start; + if (old_start < start) + vma_mas_szero(&mas, old_start, start); start_changed = true; } if (end != vma->vm_end) { + unsigned long old_end = vma->vm_end; vma->vm_end = end; + if (old_end > end) + vma_mas_szero(&mas, end, old_end); end_changed = true; } + + if (end_changed || start_changed) + vma_mas_store(vma, &mas); + vma->vm_pgoff = pgoff; if (adjust_next) { next->vm_start += adjust_next; next->vm_pgoff += adjust_next >> PAGE_SHIFT; + vma_mas_store(next, &mas); } if (file) { @@ -859,10 +1002,14 @@ again: /* * vma_merge has merged next into vma, and needs * us to remove next before dropping the locks. + * Since we have expanded over this vma, the maple tree will + * have overwritten by storing the value */ - if (remove_next != 3) + if (remove_next != 3) { __vma_unlink(mm, next, next); - else + if (remove_next == 2) + __vma_unlink(mm, next_next, next_next); + } else { /* * vma is not before next if they've been * swapped. @@ -873,15 +1020,19 @@ again: * "vma"). */ __vma_unlink(mm, next, vma); - if (file) + } + if (file) { __remove_shared_vm_struct(next, file, mapping); + if (remove_next == 2) + __remove_shared_vm_struct(next_next, file, mapping); + } } else if (insert) { /* * split_vma has split insert from vma, and needs * us to insert it before dropping the locks * (it may either follow vma or precede it). */ - __insert_vm_struct(mm, insert); + __insert_vm_struct(mm, &mas, insert); } else { if (start_changed) vma_gap_update(vma); @@ -909,6 +1060,7 @@ again: } if (remove_next) { +again: if (file) { uprobe_munmap(next, next->vm_start, next->vm_end); fput(file); @@ -930,7 +1082,7 @@ again: * "next->vm_prev->vm_end" changed and the * "vma->vm_next" gap must be updated. */ - next = vma->vm_next; + next = next_next; } else { /* * For the scope of the comment "next" and @@ -946,7 +1098,6 @@ again: } if (remove_next == 2) { remove_next = 1; - end = next->vm_end; goto again; } else if (next) @@ -978,6 +1129,7 @@ again: uprobe_mmap(insert); validate_mm(mm); + validate_mm_mt(mm); return 0; } @@ -1131,6 +1283,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, struct vm_area_struct *area, *next; int err; + validate_mm_mt(mm); /* * We later require that vma->vm_flags == vm_flags, * so this tests vma->vm_flags & VM_SPECIAL, too. @@ -1206,6 +1359,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, khugepaged_enter_vma(area, vm_flags); return area; } + validate_mm_mt(mm); return NULL; } @@ -1688,6 +1842,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, struct rb_node **rb_link, *rb_parent; unsigned long charged = 0; + validate_mm_mt(mm); /* Check against address space limit. */ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { unsigned long nr_pages; @@ -1802,7 +1957,13 @@ unsigned long mmap_region(struct file *file, unsigned long addr, goto free_vma; } - vma_link(mm, vma, prev, rb_link, rb_parent); + if (vma_link(mm, vma, prev, rb_link, rb_parent)) { + error = -ENOMEM; + if (file) + goto unmap_and_free_vma; + else + goto free_vma; + } /* * vma_merge() calls khugepaged_enter_vma() either, the below @@ -1842,6 +2003,7 @@ out: vma_set_page_prot(vma); + validate_mm_mt(mm); return addr; unmap_and_free_vma: @@ -1857,6 +2019,7 @@ free_vma: unacct_error: if (charged) vm_unacct_memory(charged); + validate_mm_mt(mm); return error; } @@ -1873,12 +2036,19 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long length, low_limit, high_limit, gap_start, gap_end; + unsigned long gap; + MA_STATE(mas, &mm->mm_mt, 0, 0); /* Adjust search length to account for worst case alignment overhead */ length = info->length + info->align_mask; if (length < info->length) return -ENOMEM; + mas_empty_area(&mas, info->low_limit, info->high_limit - 1, + length); + gap = mas.index; + gap += (info->align_offset - gap) & info->align_mask; + /* Adjust search limits by the desired length */ if (info->high_limit < length) return -ENOMEM; @@ -1960,20 +2130,31 @@ found: VM_BUG_ON(gap_start + info->length > info->high_limit); VM_BUG_ON(gap_start + info->length > gap_end); + + VM_BUG_ON(gap != gap_start); return gap_start; } static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; + struct vm_area_struct *vma = NULL; unsigned long length, low_limit, high_limit, gap_start, gap_end; + unsigned long gap; + + MA_STATE(mas, &mm->mm_mt, 0, 0); + validate_mm_mt(mm); /* Adjust search length to account for worst case alignment overhead */ length = info->length + info->align_mask; if (length < info->length) return -ENOMEM; + mas_empty_area_rev(&mas, info->low_limit, info->high_limit - 1, + length); + gap = mas.last + 1 - info->length; + gap -= (gap - info->align_offset) & info->align_mask; + /* * Adjust search limits by the desired length. * See implementation comment at top of unmapped_area(). @@ -2059,6 +2240,32 @@ found_highest: VM_BUG_ON(gap_end < info->low_limit); VM_BUG_ON(gap_end < gap_start); + + if (gap != gap_end) { + pr_err("%s: %p Gap was found: mt %lu gap_end %lu\n", __func__, + mm, gap, gap_end); + pr_err("window was %lu - %lu size %lu\n", info->high_limit, + info->low_limit, length); + pr_err("mas.min %lu max %lu mas.last %lu\n", mas.min, mas.max, + mas.last); + pr_err("mas.index %lu align mask %lu offset %lu\n", mas.index, + info->align_mask, info->align_offset); + pr_err("rb_find_vma find on %lu => %p (%p)\n", mas.index, + find_vma(mm, mas.index), vma); +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) + mt_dump(&mm->mm_mt); +#endif + { + struct vm_area_struct *dv = mm->mmap; + + while (dv) { + pr_err("vma %p %lu-%lu\n", dv, dv->vm_start, dv->vm_end); + dv = dv->vm_next; + } + } + VM_BUG_ON(gap != gap_end); + } + return gap_end; } @@ -2284,7 +2491,6 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) vmacache_update(addr, vma); return vma; } - EXPORT_SYMBOL(find_vma); /* @@ -2357,7 +2563,9 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) struct vm_area_struct *next; unsigned long gap_addr; int error = 0; + MA_STATE(mas, &mm->mm_mt, 0, 0); + validate_mm_mt(mm); if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; @@ -2381,9 +2589,14 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* Check that both stack segments have the same anon_vma? */ } + if (mas_preallocate(&mas, vma, GFP_KERNEL)) + return -ENOMEM; + /* We must make sure the anon_vma is allocated. */ - if (unlikely(anon_vma_prepare(vma))) + if (unlikely(anon_vma_prepare(vma))) { + mas_destroy(&mas); return -ENOMEM; + } /* * vma->vm_start/vm_end cannot change under us because the caller @@ -2420,6 +2633,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; + /* Overwrite old entry in mtree. */ + vma_mas_store(vma, &mas); anon_vma_interval_tree_post_update_vma(vma); if (vma->vm_next) vma_gap_update(vma->vm_next); @@ -2434,6 +2649,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma(vma, vma->vm_flags); validate_mm(mm); + validate_mm_mt(mm); + mas_destroy(&mas); return error; } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ @@ -2447,7 +2664,9 @@ int expand_downwards(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *prev; int error = 0; + MA_STATE(mas, &mm->mm_mt, 0, 0); + validate_mm(mm); address &= PAGE_MASK; if (address < mmap_min_addr) return -EPERM; @@ -2461,9 +2680,14 @@ int expand_downwards(struct vm_area_struct *vma, return -ENOMEM; } + if (mas_preallocate(&mas, vma, GFP_KERNEL)) + return -ENOMEM; + /* We must make sure the anon_vma is allocated. */ - if (unlikely(anon_vma_prepare(vma))) + if (unlikely(anon_vma_prepare(vma))) { + mas_destroy(&mas); return -ENOMEM; + } /* * vma->vm_start/vm_end cannot change under us because the caller @@ -2501,6 +2725,8 @@ int expand_downwards(struct vm_area_struct *vma, anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; + /* Overwrite old entry in mtree. */ + vma_mas_store(vma, &mas); anon_vma_interval_tree_post_update_vma(vma); vma_gap_update(vma); spin_unlock(&mm->page_table_lock); @@ -2512,6 +2738,7 @@ int expand_downwards(struct vm_area_struct *vma, anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma(vma, vma->vm_flags); validate_mm(mm); + mas_destroy(&mas); return error; } @@ -2633,14 +2860,17 @@ static void unmap_region(struct mm_struct *mm, * vma list as we go.. */ static bool -detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, unsigned long end) +detach_vmas_to_be_unmapped(struct mm_struct *mm, struct ma_state *mas, + struct vm_area_struct *vma, struct vm_area_struct *prev, + unsigned long end) { struct vm_area_struct **insertion_point; struct vm_area_struct *tail_vma = NULL; insertion_point = (prev ? &prev->vm_next : &mm->mmap); vma->vm_prev = NULL; + mas_set_range(mas, vma->vm_start, end - 1); + mas_store_prealloc(mas, NULL); do { vma_rb_erase(vma, &mm->mm_rb); if (vma->vm_flags & VM_LOCKED) @@ -2681,6 +2911,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, { struct vm_area_struct *new; int err; + validate_mm_mt(mm); if (vma->vm_ops && vma->vm_ops->may_split) { err = vma->vm_ops->may_split(vma, addr); @@ -2723,6 +2954,9 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (!err) return 0; + /* Avoid vm accounting in close() operation */ + new->vm_start = new->vm_end; + new->vm_pgoff = 0; /* Clean everything up if vma_adjust failed. */ if (new->vm_ops && new->vm_ops->close) new->vm_ops->close(new); @@ -2733,6 +2967,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, mpol_put(vma_policy(new)); out_free_vma: vm_area_free(new); + validate_mm_mt(mm); return err; } @@ -2759,6 +2994,8 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, { unsigned long end; struct vm_area_struct *vma, *prev, *last; + int error = -ENOMEM; + MA_STATE(mas, &mm->mm_mt, 0, 0); if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL; @@ -2779,6 +3016,9 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, vma = find_vma_intersection(mm, start, end); if (!vma) return 0; + + if (mas_preallocate(&mas, vma, GFP_KERNEL)) + return -ENOMEM; prev = vma->vm_prev; /* @@ -2789,7 +3029,6 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, * places tmp vma above, and higher split_vma places tmp vma below. */ if (start > vma->vm_start) { - int error; /* * Make sure that map_count on return from munmap() will @@ -2797,20 +3036,20 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, * its limit temporarily, to help free resources as expected. */ if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) - return -ENOMEM; + goto map_count_exceeded; error = __split_vma(mm, vma, start, 0); if (error) - return error; + goto split_failed; prev = vma; } /* Does it split the last one? */ last = find_vma(mm, end); if (last && end > last->vm_start) { - int error = __split_vma(mm, last, end, 1); + error = __split_vma(mm, last, end, 1); if (error) - return error; + goto split_failed; } vma = vma_next(mm, prev); @@ -2824,13 +3063,13 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, * split, despite we could. This is unlikely enough * failure that it's not worth optimizing it for. */ - int error = userfaultfd_unmap_prep(vma, start, end, uf); + error = userfaultfd_unmap_prep(vma, start, end, uf); if (error) - return error; + goto userfaultfd_error; } /* Detach vmas from rbtree */ - if (!detach_vmas_to_be_unmapped(mm, vma, prev, end)) + if (!detach_vmas_to_be_unmapped(mm, &mas, vma, prev, end)) downgrade = false; if (downgrade) @@ -2842,6 +3081,12 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, remove_vma_list(mm, vma); return downgrade ? 1 : 0; + +map_count_exceeded: +split_failed: +userfaultfd_error: + mas_destroy(&mas); + return error; } int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, @@ -2981,6 +3226,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla pgoff_t pgoff = addr >> PAGE_SHIFT; int error; unsigned long mapped_addr; + validate_mm_mt(mm); /* Until we need other flags, refuse anything except VM_EXEC. */ if ((flags & (~VM_EXEC)) != 0) @@ -3030,7 +3276,9 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla vma->vm_pgoff = pgoff; vma->vm_flags = flags; vma->vm_page_prot = vm_get_page_prot(flags); - vma_link(mm, vma, prev, rb_link, rb_parent); + if (vma_link(mm, vma, prev, rb_link, rb_parent)) + goto no_vma_link; + out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; @@ -3038,7 +3286,12 @@ out: if (flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); vma->vm_flags |= VM_SOFTDIRTY; + validate_mm_mt(mm); return 0; + +no_vma_link: + vm_area_free(vma); + return -ENOMEM; } int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) @@ -3127,6 +3380,9 @@ void exit_mmap(struct mm_struct *mm) vma = remove_vma(vma); cond_resched(); } + + trace_exit_mmap(mm); + __mt_destroy(&mm->mm_mt); mm->mmap = NULL; mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); @@ -3140,12 +3396,30 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { struct vm_area_struct *prev; struct rb_node **rb_link, *rb_parent; + unsigned long start = vma->vm_start; + struct vm_area_struct *overlap = NULL; + unsigned long charged = vma_pages(vma); if (find_vma_links(mm, vma->vm_start, vma->vm_end, &prev, &rb_link, &rb_parent)) + + if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) return -ENOMEM; + + overlap = mt_find(&mm->mm_mt, &start, vma->vm_end - 1); + if (overlap) { + + pr_err("Found vma ending at %lu\n", start - 1); + pr_err("vma : %lu => %lu-%lu\n", (unsigned long)overlap, + overlap->vm_start, overlap->vm_end - 1); +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) + mt_dump(&mm->mm_mt); +#endif + BUG(); + } + if ((vma->vm_flags & VM_ACCOUNT) && - security_vm_enough_memory_mm(mm, vma_pages(vma))) + security_vm_enough_memory_mm(mm, charged)) return -ENOMEM; /* @@ -3165,7 +3439,11 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; } - vma_link(mm, vma, prev, rb_link, rb_parent); + if (vma_link(mm, vma, prev, rb_link, rb_parent)) { + vm_unacct_memory(charged); + return -ENOMEM; + } + return 0; } @@ -3183,7 +3461,9 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, struct vm_area_struct *new_vma, *prev; struct rb_node **rb_link, *rb_parent; bool faulted_in_anon_vma = true; + unsigned long index = addr; + validate_mm_mt(mm); /* * If anonymous vma has not yet been faulted, update new pgoff * to match new location, to increase its chance of merging. @@ -3195,6 +3475,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) return NULL; /* should never get here */ + if (mt_find(&mm->mm_mt, &index, addr+len - 1)) + BUG(); new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); @@ -3238,6 +3520,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, vma_link(mm, new_vma, prev, rb_link, rb_parent); *need_rmap_locks = false; } + validate_mm_mt(mm); return new_vma; out_free_mempol: @@ -3245,6 +3528,7 @@ out_free_mempol: out_free_vma: vm_area_free(new_vma); out: + validate_mm_mt(mm); return NULL; } @@ -3381,6 +3665,7 @@ static struct vm_area_struct *__install_special_mapping( int ret; struct vm_area_struct *vma; + validate_mm_mt(mm); vma = vm_area_alloc(mm); if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); @@ -3403,10 +3688,12 @@ static struct vm_area_struct *__install_special_mapping( perf_event_mmap(vma); + validate_mm_mt(mm); return vma; out: vm_area_free(vma); + validate_mm_mt(mm); return ERR_PTR(ret); } diff --git a/mm/nommu.c b/mm/nommu.c index e819cbc21b39..c63793c53a82 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -545,6 +545,19 @@ static void put_nommu_region(struct vm_region *region) __put_nommu_region(region); } +void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas) +{ + mas_set_range(mas, vma->vm_start, vma->vm_end - 1); + mas_store_prealloc(mas, vma); +} + +void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas) +{ + mas->index = vma->vm_start; + mas->last = vma->vm_end - 1; + mas_store_prealloc(mas, NULL); +} + /* * add a VMA into a process's mm_struct in the appropriate place in the list * and tree and add to the address space's page tree also if not an anonymous -- cgit v1.2.3 From 524e00b36e8c547f5582eef3fb645a8d9fc5e3df Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:48 +0000 Subject: mm: remove rb tree. Remove the RB tree and start using the maple tree for vm_area_struct tracking. Drop validate_mm() calls in expand_upwards() and expand_downwards() as the lock is not held. Link: https://lkml.kernel.org/r/20220906194824.2110408-18-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/kernel/tboot.c | 1 - drivers/firmware/efi/efi.c | 1 - include/linux/mm.h | 2 - include/linux/mm_types.h | 14 -- kernel/fork.c | 8 - mm/init-mm.c | 2 - mm/mmap.c | 506 ++++++++++----------------------------------- mm/nommu.c | 87 +++----- mm/util.c | 10 +- 9 files changed, 144 insertions(+), 487 deletions(-) (limited to 'drivers') diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index e01544202651..4c1bcb6053fc 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -95,7 +95,6 @@ void __init tboot_probe(void) static pgd_t *tboot_pg_dir; static struct mm_struct tboot_mm = { - .mm_rb = RB_ROOT, .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, tboot_mm.mmap_lock), .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 7b6a815b79d3..042a3ef4db1c 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -57,7 +57,6 @@ static unsigned long __initdata mem_reserve = EFI_INVALID_TABLE_ADDR; static unsigned long __initdata rt_prop = EFI_INVALID_TABLE_ADDR; struct mm_struct efi_mm = { - .mm_rb = RB_ROOT, .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, efi_mm.mmap_lock), .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), diff --git a/include/linux/mm.h b/include/linux/mm.h index 646ea4d3bd74..dfce1aaa7a64 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2654,8 +2654,6 @@ extern int __split_vma(struct mm_struct *, struct vm_area_struct *, extern int split_vma(struct mm_struct *, struct vm_area_struct *, unsigned long addr, int new_below); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); -extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, - struct rb_node **, struct rb_node *); extern void unlink_file_vma(struct vm_area_struct *); extern struct vm_area_struct *copy_vma(struct vm_area_struct **, unsigned long addr, unsigned long len, pgoff_t pgoff, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d0b51fbdf5d4..ac747273c4d6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -410,19 +410,6 @@ struct vm_area_struct { /* linked list of VM areas per task, sorted by address */ struct vm_area_struct *vm_next, *vm_prev; - - struct rb_node vm_rb; - - /* - * Largest free memory gap in bytes to the left of this VMA. - * Either between this VMA and vma->vm_prev, or between one of the - * VMAs below us in the VMA rbtree and its ->vm_prev. This helps - * get_unmapped_area find a free area of the right size. - */ - unsigned long rb_subtree_gap; - - /* Second cache line starts here. */ - struct mm_struct *vm_mm; /* The address space we belong to. */ /* @@ -488,7 +475,6 @@ struct mm_struct { struct { struct vm_area_struct *mmap; /* list of VMAs */ struct maple_tree mm_mt; - struct rb_root mm_rb; u64 vmacache_seqnum; /* per-thread vmacache */ #ifdef CONFIG_MMU unsigned long (*get_unmapped_area) (struct file *filp, diff --git a/kernel/fork.c b/kernel/fork.c index 16970c346b5b..5f81c009bb20 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -581,7 +581,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { struct vm_area_struct *mpnt, *tmp, *prev, **pprev; - struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge = 0; LIST_HEAD(uf); @@ -608,8 +607,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, mm->exec_vm = oldmm->exec_vm; mm->stack_vm = oldmm->stack_vm; - rb_link = &mm->mm_rb.rb_node; - rb_parent = NULL; pprev = &mm->mmap; retval = ksm_fork(mm, oldmm); if (retval) @@ -701,10 +698,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, tmp->vm_prev = prev; prev = tmp; - __vma_link_rb(mm, tmp, rb_link, rb_parent); - rb_link = &tmp->vm_rb.rb_right; - rb_parent = &tmp->vm_rb; - /* Link the vma into the MT */ mas.index = tmp->vm_start; mas.last = tmp->vm_end - 1; @@ -1133,7 +1126,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { mm->mmap = NULL; - mm->mm_rb = RB_ROOT; mt_init_flags(&mm->mm_mt, MM_MT_FLAGS); mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock); mm->vmacache_seqnum = 0; diff --git a/mm/init-mm.c b/mm/init-mm.c index b912b0f2eced..c9327abb771c 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 #include -#include #include #include #include @@ -29,7 +28,6 @@ * and size this cpu_bitmask to NR_CPUS. */ struct mm_struct init_mm = { - .mm_rb = RB_ROOT, .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, init_mm.mmap_lock), .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), diff --git a/mm/mmap.c b/mm/mmap.c index 68ee2958c0be..f60d83c7f233 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #include @@ -247,93 +246,6 @@ out: return origbrk; } -static inline unsigned long vma_compute_gap(struct vm_area_struct *vma) -{ - unsigned long gap, prev_end; - - /* - * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we - * allow two stack_guard_gaps between them here, and when choosing - * an unmapped area; whereas when expanding we only require one. - * That's a little inconsistent, but keeps the code here simpler. - */ - gap = vm_start_gap(vma); - if (vma->vm_prev) { - prev_end = vm_end_gap(vma->vm_prev); - if (gap > prev_end) - gap -= prev_end; - else - gap = 0; - } - return gap; -} - -#ifdef CONFIG_DEBUG_VM_RB -static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma) -{ - unsigned long max = vma_compute_gap(vma), subtree_gap; - if (vma->vm_rb.rb_left) { - subtree_gap = rb_entry(vma->vm_rb.rb_left, - struct vm_area_struct, vm_rb)->rb_subtree_gap; - if (subtree_gap > max) - max = subtree_gap; - } - if (vma->vm_rb.rb_right) { - subtree_gap = rb_entry(vma->vm_rb.rb_right, - struct vm_area_struct, vm_rb)->rb_subtree_gap; - if (subtree_gap > max) - max = subtree_gap; - } - return max; -} - -static int browse_rb(struct mm_struct *mm) -{ - struct rb_root *root = &mm->mm_rb; - int i = 0, j, bug = 0; - struct rb_node *nd, *pn = NULL; - unsigned long prev = 0, pend = 0; - - for (nd = rb_first(root); nd; nd = rb_next(nd)) { - struct vm_area_struct *vma; - vma = rb_entry(nd, struct vm_area_struct, vm_rb); - if (vma->vm_start < prev) { - pr_emerg("vm_start %lx < prev %lx\n", - vma->vm_start, prev); - bug = 1; - } - if (vma->vm_start < pend) { - pr_emerg("vm_start %lx < pend %lx\n", - vma->vm_start, pend); - bug = 1; - } - if (vma->vm_start > vma->vm_end) { - pr_emerg("vm_start %lx > vm_end %lx\n", - vma->vm_start, vma->vm_end); - bug = 1; - } - spin_lock(&mm->page_table_lock); - if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { - pr_emerg("free gap %lx, correct %lx\n", - vma->rb_subtree_gap, - vma_compute_subtree_gap(vma)); - bug = 1; - } - spin_unlock(&mm->page_table_lock); - i++; - pn = nd; - prev = vma->vm_start; - pend = vma->vm_end; - } - j = 0; - for (nd = pn; nd; nd = rb_prev(nd)) - j++; - if (i != j) { - pr_emerg("backwards %d, forwards %d\n", j, i); - bug = 1; - } - return bug ? -1 : i; -} #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) extern void mt_validate(struct maple_tree *mt); extern void mt_dump(const struct maple_tree *mt); @@ -361,19 +273,25 @@ static void validate_mm_mt(struct mm_struct *mm) (vma->vm_end - 1 != mas.last)) { pr_emerg("issue in %s\n", current->comm); dump_stack(); -#ifdef CONFIG_DEBUG_VM dump_vma(vma_mt); - pr_emerg("and next in rb\n"); + pr_emerg("and vm_next\n"); dump_vma(vma->vm_next); -#endif pr_emerg("mt piv: %p %lu - %lu\n", vma_mt, mas.index, mas.last); pr_emerg("mt vma: %p %lu - %lu\n", vma_mt, vma_mt->vm_start, vma_mt->vm_end); - pr_emerg("rb vma: %p %lu - %lu\n", vma, + if (vma->vm_prev) { + pr_emerg("ll prev: %p %lu - %lu\n", + vma->vm_prev, vma->vm_prev->vm_start, + vma->vm_prev->vm_end); + } + pr_emerg("ll vma: %p %lu - %lu\n", vma, vma->vm_start, vma->vm_end); - pr_emerg("rb->next = %p %lu - %lu\n", vma->vm_next, - vma->vm_next->vm_start, vma->vm_next->vm_end); + if (vma->vm_next) { + pr_emerg("ll next: %p %lu - %lu\n", + vma->vm_next, vma->vm_next->vm_start, + vma->vm_next->vm_end); + } mt_dump(mas.tree); if (vma_mt->vm_end != mas.last + 1) { @@ -396,21 +314,6 @@ static void validate_mm_mt(struct mm_struct *mm) } VM_BUG_ON(vma); } -#else -#define validate_mm_mt(root) do { } while (0) -#endif -static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) -{ - struct rb_node *nd; - - for (nd = rb_first(root); nd; nd = rb_next(nd)) { - struct vm_area_struct *vma; - vma = rb_entry(nd, struct vm_area_struct, vm_rb); - VM_BUG_ON_VMA(vma != ignore && - vma->rb_subtree_gap != vma_compute_subtree_gap(vma), - vma); - } -} static void validate_mm(struct mm_struct *mm) { @@ -419,7 +322,10 @@ static void validate_mm(struct mm_struct *mm) unsigned long highest_address = 0; struct vm_area_struct *vma = mm->mmap; + validate_mm_mt(mm); + while (vma) { +#ifdef CONFIG_DEBUG_VM_RB struct anon_vma *anon_vma = vma->anon_vma; struct anon_vma_chain *avc; @@ -429,6 +335,7 @@ static void validate_mm(struct mm_struct *mm) anon_vma_interval_tree_verify(avc); anon_vma_unlock_read(anon_vma); } +#endif highest_address = vm_end_gap(vma); vma = vma->vm_next; @@ -443,80 +350,13 @@ static void validate_mm(struct mm_struct *mm) mm->highest_vm_end, highest_address); bug = 1; } - i = browse_rb(mm); - if (i != mm->map_count) { - if (i != -1) - pr_emerg("map_count %d rb %d\n", mm->map_count, i); - bug = 1; - } VM_BUG_ON_MM(bug, mm); } -#else -#define validate_mm_rb(root, ignore) do { } while (0) + +#else /* !CONFIG_DEBUG_VM_MAPLE_TREE */ #define validate_mm_mt(root) do { } while (0) #define validate_mm(mm) do { } while (0) -#endif - -RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks, - struct vm_area_struct, vm_rb, - unsigned long, rb_subtree_gap, vma_compute_gap) - -/* - * Update augmented rbtree rb_subtree_gap values after vma->vm_start or - * vma->vm_prev->vm_end values changed, without modifying the vma's position - * in the rbtree. - */ -static void vma_gap_update(struct vm_area_struct *vma) -{ - /* - * As it turns out, RB_DECLARE_CALLBACKS_MAX() already created - * a callback function that does exactly what we want. - */ - vma_gap_callbacks_propagate(&vma->vm_rb, NULL); -} - -static inline void vma_rb_insert(struct vm_area_struct *vma, - struct rb_root *root) -{ - /* All rb_subtree_gap values must be consistent prior to insertion */ - validate_mm_rb(root, NULL); - - rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); -} - -static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) -{ - /* - * Note rb_erase_augmented is a fairly large inline function, - * so make sure we instantiate it only once with our desired - * augmented rbtree callbacks. - */ - rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); -} - -static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma, - struct rb_root *root, - struct vm_area_struct *ignore) -{ - /* - * All rb_subtree_gap values must be consistent prior to erase, - * with the possible exception of - * - * a. the "next" vma being erased if next->vm_start was reduced in - * __vma_adjust() -> __vma_unlink() - * b. the vma being erased in detach_vmas_to_be_unmapped() -> - * vma_rb_erase() - */ - validate_mm_rb(root, ignore); - - __vma_rb_erase(vma, root); -} - -static __always_inline void vma_rb_erase(struct vm_area_struct *vma, - struct rb_root *root) -{ - vma_rb_erase_ignore(vma, root, vma); -} +#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ /* * vma has some anon_vma assigned, and is already inserted on that @@ -550,39 +390,26 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); } -static int find_vma_links(struct mm_struct *mm, unsigned long addr, - unsigned long end, struct vm_area_struct **pprev, - struct rb_node ***rb_link, struct rb_node **rb_parent) +/* + * range_has_overlap() - Check the @start - @end range for overlapping VMAs and + * sets up a pointer to the previous VMA + * @mm: the mm struct + * @start: the start address of the range + * @end: the end address of the range + * @pprev: the pointer to the pointer of the previous VMA + * + * Returns: True if there is an overlapping VMA, false otherwise + */ +static inline +bool range_has_overlap(struct mm_struct *mm, unsigned long start, + unsigned long end, struct vm_area_struct **pprev) { - struct rb_node **__rb_link, *__rb_parent, *rb_prev; - - mmap_assert_locked(mm); - __rb_link = &mm->mm_rb.rb_node; - rb_prev = __rb_parent = NULL; - - while (*__rb_link) { - struct vm_area_struct *vma_tmp; - - __rb_parent = *__rb_link; - vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); + struct vm_area_struct *existing; - if (vma_tmp->vm_end > addr) { - /* Fail if an existing vma overlaps the area */ - if (vma_tmp->vm_start < end) - return -ENOMEM; - __rb_link = &__rb_parent->rb_left; - } else { - rb_prev = __rb_parent; - __rb_link = &__rb_parent->rb_right; - } - } - - *pprev = NULL; - if (rb_prev) - *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); - *rb_link = __rb_link; - *rb_parent = __rb_parent; - return 0; + MA_STATE(mas, &mm->mm_mt, start, start); + existing = mas_find(&mas, end - 1); + *pprev = mas_prev(&mas, 0); + return existing ? true : false; } /* @@ -609,8 +436,6 @@ static inline struct vm_area_struct *__vma_next(struct mm_struct *mm, * @start: The start of the range. * @len: The length of the range. * @pprev: pointer to the pointer that will be set to previous vm_area_struct - * @rb_link: the rb_node - * @rb_parent: the parent rb_node * * Find all the vm_area_struct that overlap from @start to * @end and munmap them. Set @pprev to the previous vm_area_struct. @@ -619,14 +444,11 @@ static inline struct vm_area_struct *__vma_next(struct mm_struct *mm, */ static inline int munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len, - struct vm_area_struct **pprev, struct rb_node ***link, - struct rb_node **parent, struct list_head *uf) + struct vm_area_struct **pprev, struct list_head *uf) { - - while (find_vma_links(mm, start, start + len, pprev, link, parent)) + while (range_has_overlap(mm, start, start + len, pprev)) if (do_munmap(mm, start, len, uf)) return -ENOMEM; - return 0; } @@ -647,30 +469,6 @@ static unsigned long count_vma_pages_range(struct mm_struct *mm, return nr_pages; } -void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, - struct rb_node **rb_link, struct rb_node *rb_parent) -{ - /* Update tracking information for the gap following the new vma. */ - if (vma->vm_next) - vma_gap_update(vma->vm_next); - else - mm->highest_vm_end = vm_end_gap(vma); - - /* - * vma->vm_prev wasn't known when we followed the rbtree to find the - * correct insertion point for that vma. As a result, we could not - * update the vma vm_rb parents rb_subtree_gap values on the way down. - * So, we first insert the vma with a zero rb_subtree_gap value - * (to be consistent with what we did on the way down), and then - * immediately update the gap to the correct value. Finally we - * rebalance the rbtree after all augmented values have been set. - */ - rb_link_node(&vma->vm_rb, rb_parent, rb_link); - vma->rb_subtree_gap = 0; - vma_gap_update(vma); - vma_rb_insert(vma, &mm->mm_rb); -} - static void __vma_link_file(struct vm_area_struct *vma) { struct file *file; @@ -738,18 +536,8 @@ static inline void vma_mas_szero(struct ma_state *mas, unsigned long start, mas_store_prealloc(mas, NULL); } -static void -__vma_link(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, struct rb_node **rb_link, - struct rb_node *rb_parent) -{ - __vma_link_list(mm, vma, prev); - __vma_link_rb(mm, vma, rb_link, rb_parent); -} - static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, struct rb_node **rb_link, - struct rb_node *rb_parent) + struct vm_area_struct *prev) { MA_STATE(mas, &mm->mm_mt, 0, 0); struct address_space *mapping = NULL; @@ -763,7 +551,7 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma, } vma_mas_store(vma, &mas); - __vma_link(mm, vma, prev, rb_link, rb_parent); + __vma_link_list(mm, vma, prev); __vma_link_file(vma); if (mapping) @@ -776,34 +564,20 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma, /* * Helper for vma_adjust() in the split_vma insert case: insert a vma into the - * mm's list and rbtree. It has already been inserted into the interval tree. + * mm's list and the mm tree. It has already been inserted into the interval tree. */ static void __insert_vm_struct(struct mm_struct *mm, struct ma_state *mas, struct vm_area_struct *vma) { struct vm_area_struct *prev; - struct rb_node **rb_link, *rb_parent; - - if (find_vma_links(mm, vma->vm_start, vma->vm_end, - &prev, &rb_link, &rb_parent)) - BUG(); + mas_set(mas, vma->vm_start); + prev = mas_prev(mas, 0); vma_mas_store(vma, mas); __vma_link_list(mm, vma, prev); - __vma_link_rb(mm, vma, rb_link, rb_parent); mm->map_count++; } -static __always_inline void __vma_unlink(struct mm_struct *mm, - struct vm_area_struct *vma, - struct vm_area_struct *ignore) -{ - vma_rb_erase_ignore(vma, &mm->mm_rb, ignore); - __vma_unlink_list(mm, vma); - /* Kill the cache */ - vmacache_invalidate(mm); -} - /* * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that * is already present in an i_mmap tree without adjusting the tree. @@ -816,21 +590,18 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, struct vm_area_struct *expand) { struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *next = vma->vm_next, *orig_vma = vma; - struct vm_area_struct *next_next; + struct vm_area_struct *next_next, *next = find_vma(mm, vma->vm_end); + struct vm_area_struct *orig_vma = vma; struct address_space *mapping = NULL; struct rb_root_cached *root = NULL; struct anon_vma *anon_vma = NULL; struct file *file = vma->vm_file; - bool start_changed = false, end_changed = false; + bool vma_changed = false; long adjust_next = 0; int remove_next = 0; MA_STATE(mas, &mm->mm_mt, 0, 0); struct vm_area_struct *exporter = NULL, *importer = NULL; - validate_mm(mm); - validate_mm_mt(mm); - if (next && !insert) { if (end >= next->vm_end) { /* @@ -957,21 +728,21 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } if (start != vma->vm_start) { - unsigned long old_start = vma->vm_start; + if (vma->vm_start < start) + vma_mas_szero(&mas, vma->vm_start, start); + vma_changed = true; vma->vm_start = start; - if (old_start < start) - vma_mas_szero(&mas, old_start, start); - start_changed = true; } if (end != vma->vm_end) { - unsigned long old_end = vma->vm_end; + if (vma->vm_end > end) + vma_mas_szero(&mas, end, vma->vm_end); + vma_changed = true; vma->vm_end = end; - if (old_end > end) - vma_mas_szero(&mas, end, old_end); - end_changed = true; + if (!next) + mm->highest_vm_end = vm_end_gap(vma); } - if (end_changed || start_changed) + if (vma_changed) vma_mas_store(vma, &mas); vma->vm_pgoff = pgoff; @@ -995,22 +766,12 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * Since we have expanded over this vma, the maple tree will * have overwritten by storing the value */ - if (remove_next != 3) { - __vma_unlink(mm, next, next); - if (remove_next == 2) - __vma_unlink(mm, next_next, next_next); - } else { - /* - * vma is not before next if they've been - * swapped. - * - * pre-swap() next->vm_start was reduced so - * tell validate_mm_rb to ignore pre-swap() - * "next" (which is stored in post-swap() - * "vma"). - */ - __vma_unlink(mm, next, vma); - } + __vma_unlink_list(mm, next); + if (remove_next == 2) + __vma_unlink_list(mm, next_next); + /* Kill the cache */ + vmacache_invalidate(mm); + if (file) { __remove_shared_vm_struct(next, file, mapping); if (remove_next == 2) @@ -1023,15 +784,6 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * (it may either follow vma or precede it). */ __insert_vm_struct(mm, &mas, insert); - } else { - if (start_changed) - vma_gap_update(vma); - if (end_changed) { - if (!next) - mm->highest_vm_end = vm_end_gap(vma); - else if (!adjust_next) - vma_gap_update(next); - } } if (anon_vma) { @@ -1059,7 +811,10 @@ again: anon_vma_merge(vma, next); mm->map_count--; mpol_put(vma_policy(next)); + if (remove_next != 2) + BUG_ON(vma->vm_end < next->vm_end); vm_area_free(next); + /* * In mprotect's case 6 (see comments on vma_merge), * we must remove another next too. It would clutter @@ -1089,10 +844,7 @@ again: if (remove_next == 2) { remove_next = 1; goto again; - } - else if (next) - vma_gap_update(next); - else { + } else if (!next) { /* * If remove_next == 2 we obviously can't * reach this path. @@ -1119,8 +871,6 @@ again: uprobe_mmap(insert); validate_mm(mm); - validate_mm_mt(mm); - return 0; } @@ -1273,7 +1023,6 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, struct vm_area_struct *area, *next; int err; - validate_mm_mt(mm); /* * We later require that vma->vm_flags == vm_flags, * so this tests vma->vm_flags & VM_SPECIAL, too. @@ -1349,7 +1098,6 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, khugepaged_enter_vma(area, vm_flags); return area; } - validate_mm_mt(mm); return NULL; } @@ -1519,6 +1267,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags_t vm_flags; int pkey = 0; + validate_mm(mm); *populate = 0; if (!len) @@ -1829,10 +1578,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev, *merge; int error; - struct rb_node **rb_link, *rb_parent; unsigned long charged = 0; - validate_mm_mt(mm); /* Check against address space limit. */ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { unsigned long nr_pages; @@ -1848,8 +1595,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, return -ENOMEM; } - /* Clear old maps, set up prev, rb_link, rb_parent, and uf */ - if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf)) + /* Clear old maps, set up prev and uf */ + if (munmap_vma_range(mm, addr, len, &prev, uf)) return -ENOMEM; /* * Private writable mapping: check memory availability @@ -1947,7 +1694,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, goto free_vma; } - if (vma_link(mm, vma, prev, rb_link, rb_parent)) { + if (vma_link(mm, vma, prev)) { error = -ENOMEM; if (file) goto unmap_and_free_vma; @@ -1993,7 +1740,6 @@ out: vma_set_page_prot(vma); - validate_mm_mt(mm); return addr; unmap_and_free_vma: @@ -2009,7 +1755,6 @@ free_vma: unacct_error: if (charged) vm_unacct_memory(charged); - validate_mm_mt(mm); return error; } @@ -2367,7 +2112,6 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) int error = 0; MA_STATE(mas, &mm->mm_mt, 0, 0); - validate_mm_mt(mm); if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; @@ -2419,15 +2163,13 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) error = acct_stack_growth(vma, size, grow); if (!error) { /* - * vma_gap_update() doesn't support concurrent - * updates, but we only hold a shared mmap_lock - * lock here, so we need to protect against - * concurrent vma expansions. - * anon_vma_lock_write() doesn't help here, as - * we don't guarantee that all growable vmas - * in a mm share the same root anon vma. - * So, we reuse mm->page_table_lock to guard - * against concurrent vma expansions. + * We only hold a shared mmap_lock lock here, so + * we need to protect against concurrent vma + * expansions. anon_vma_lock_write() doesn't + * help here, as we don't guarantee that all + * growable vmas in a mm share the same root + * anon vma. So, we reuse mm->page_table_lock + * to guard against concurrent vma expansions. */ spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) @@ -2438,9 +2180,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* Overwrite old entry in mtree. */ vma_mas_store(vma, &mas); anon_vma_interval_tree_post_update_vma(vma); - if (vma->vm_next) - vma_gap_update(vma->vm_next); - else + if (!vma->vm_next) mm->highest_vm_end = vm_end_gap(vma); spin_unlock(&mm->page_table_lock); @@ -2450,8 +2190,6 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma(vma, vma->vm_flags); - validate_mm(mm); - validate_mm_mt(mm); mas_destroy(&mas); return error; } @@ -2460,15 +2198,13 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* * vma is the first one with address < vma->vm_start. Have to extend vma. */ -int expand_downwards(struct vm_area_struct *vma, - unsigned long address) +int expand_downwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *prev; int error = 0; MA_STATE(mas, &mm->mm_mt, 0, 0); - validate_mm(mm); address &= PAGE_MASK; if (address < mmap_min_addr) return -EPERM; @@ -2510,15 +2246,13 @@ int expand_downwards(struct vm_area_struct *vma, error = acct_stack_growth(vma, size, grow); if (!error) { /* - * vma_gap_update() doesn't support concurrent - * updates, but we only hold a shared mmap_lock - * lock here, so we need to protect against - * concurrent vma expansions. - * anon_vma_lock_write() doesn't help here, as - * we don't guarantee that all growable vmas - * in a mm share the same root anon vma. - * So, we reuse mm->page_table_lock to guard - * against concurrent vma expansions. + * We only hold a shared mmap_lock lock here, so + * we need to protect against concurrent vma + * expansions. anon_vma_lock_write() doesn't + * help here, as we don't guarantee that all + * growable vmas in a mm share the same root + * anon vma. So, we reuse mm->page_table_lock + * to guard against concurrent vma expansions. */ spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) @@ -2530,7 +2264,6 @@ int expand_downwards(struct vm_area_struct *vma, /* Overwrite old entry in mtree. */ vma_mas_store(vma, &mas); anon_vma_interval_tree_post_update_vma(vma); - vma_gap_update(vma); spin_unlock(&mm->page_table_lock); perf_event_mmap(vma); @@ -2539,7 +2272,6 @@ int expand_downwards(struct vm_area_struct *vma, } anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma(vma, vma->vm_flags); - validate_mm(mm); mas_destroy(&mas); return error; } @@ -2671,10 +2403,8 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct ma_state *mas, insertion_point = (prev ? &prev->vm_next : &mm->mmap); vma->vm_prev = NULL; - mas_set_range(mas, vma->vm_start, end - 1); - mas_store_prealloc(mas, NULL); + vma_mas_szero(mas, vma->vm_start, end); do { - vma_rb_erase(vma, &mm->mm_rb); if (vma->vm_flags & VM_LOCKED) mm->locked_vm -= vma_pages(vma); mm->map_count--; @@ -2682,10 +2412,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct ma_state *mas, vma = vma->vm_next; } while (vma && vma->vm_start < end); *insertion_point = vma; - if (vma) { + if (vma) vma->vm_prev = prev; - vma_gap_update(vma); - } else + else mm->highest_vm_end = prev ? vm_end_gap(prev) : 0; tail_vma->vm_next = NULL; @@ -2807,11 +2536,7 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, if (len == 0) return -EINVAL; - /* - * arch_unmap() might do unmaps itself. It must be called - * and finish any rbtree manipulation before this code - * runs and also starts to manipulate the rbtree. - */ + /* arch_unmap() might do unmaps itself. */ arch_unmap(mm, start, end); /* Find the first overlapping VMA where start < vma->vm_end */ @@ -2822,6 +2547,11 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, if (mas_preallocate(&mas, vma, GFP_KERNEL)) return -ENOMEM; prev = vma->vm_prev; + /* we have start < vma->vm_end */ + + /* if it doesn't overlap, we have nothing.. */ + if (vma->vm_start >= end) + return 0; /* * If we need to split any vma, do it now to save pain later. @@ -2882,6 +2612,8 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, /* Fix up all other VM information */ remove_vma_list(mm, vma); + + validate_mm(mm); return downgrade ? 1 : 0; map_count_exceeded: @@ -3020,11 +2752,11 @@ out: * anonymous maps. eventually we may be able to do some * brk-specific accounting here. */ -static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long flags, struct list_head *uf) +static int do_brk_flags(unsigned long addr, unsigned long len, + unsigned long flags, struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; - struct rb_node **rb_link, *rb_parent; pgoff_t pgoff = addr >> PAGE_SHIFT; int error; unsigned long mapped_addr; @@ -3043,8 +2775,8 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla if (error) return error; - /* Clear old maps, set up prev, rb_link, rb_parent, and uf */ - if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf)) + /* Clear old maps, set up prev and uf */ + if (munmap_vma_range(mm, addr, len, &prev, uf)) return -ENOMEM; /* Check against address space limits *after* clearing old maps... */ @@ -3078,7 +2810,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla vma->vm_pgoff = pgoff; vma->vm_flags = flags; vma->vm_page_prot = vm_get_page_prot(flags); - if (vma_link(mm, vma, prev, rb_link, rb_parent)) + if (vma_link(mm, vma, prev)) goto no_vma_link; out: @@ -3197,29 +2929,12 @@ void exit_mmap(struct mm_struct *mm) int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { struct vm_area_struct *prev; - struct rb_node **rb_link, *rb_parent; - unsigned long start = vma->vm_start; - struct vm_area_struct *overlap = NULL; unsigned long charged = vma_pages(vma); - if (find_vma_links(mm, vma->vm_start, vma->vm_end, - &prev, &rb_link, &rb_parent)) - if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) + if (range_has_overlap(mm, vma->vm_start, vma->vm_end, &prev)) return -ENOMEM; - overlap = mt_find(&mm->mm_mt, &start, vma->vm_end - 1); - if (overlap) { - - pr_err("Found vma ending at %lu\n", start - 1); - pr_err("vma : %lu => %lu-%lu\n", (unsigned long)overlap, - overlap->vm_start, overlap->vm_end - 1); -#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) - mt_dump(&mm->mm_mt); -#endif - BUG(); - } - if ((vma->vm_flags & VM_ACCOUNT) && security_vm_enough_memory_mm(mm, charged)) return -ENOMEM; @@ -3241,7 +2956,7 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; } - if (vma_link(mm, vma, prev, rb_link, rb_parent)) { + if (vma_link(mm, vma, prev)) { vm_unacct_memory(charged); return -ENOMEM; } @@ -3261,9 +2976,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, unsigned long vma_start = vma->vm_start; struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma, *prev; - struct rb_node **rb_link, *rb_parent; bool faulted_in_anon_vma = true; - unsigned long index = addr; validate_mm_mt(mm); /* @@ -3275,10 +2988,9 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, faulted_in_anon_vma = false; } - if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) + if (range_has_overlap(mm, addr, addr + len, &prev)) return NULL; /* should never get here */ - if (mt_find(&mm->mm_mt, &index, addr+len - 1)) - BUG(); + new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); @@ -3319,12 +3031,16 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, get_file(new_vma->vm_file); if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); - vma_link(mm, new_vma, prev, rb_link, rb_parent); + if (vma_link(mm, new_vma, prev)) + goto out_vma_link; *need_rmap_locks = false; } validate_mm_mt(mm); return new_vma; +out_vma_link: + if (new_vma->vm_ops && new_vma->vm_ops->close) + new_vma->vm_ops->close(new_vma); out_free_mempol: mpol_put(vma_policy(new_vma)); out_free_vma: diff --git a/mm/nommu.c b/mm/nommu.c index c63793c53a82..321c7e6718a8 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -566,9 +566,9 @@ void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas) */ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct *pvma, *prev; struct address_space *mapping; - struct rb_node **p, *parent, *rb_prev; + struct vm_area_struct *prev; + MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); BUG_ON(!vma->vm_region); @@ -586,42 +586,10 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) i_mmap_unlock_write(mapping); } + prev = mas_prev(&mas, 0); + mas_reset(&mas); /* add the VMA to the tree */ - parent = rb_prev = NULL; - p = &mm->mm_rb.rb_node; - while (*p) { - parent = *p; - pvma = rb_entry(parent, struct vm_area_struct, vm_rb); - - /* sort by: start addr, end addr, VMA struct addr in that order - * (the latter is necessary as we may get identical VMAs) */ - if (vma->vm_start < pvma->vm_start) - p = &(*p)->rb_left; - else if (vma->vm_start > pvma->vm_start) { - rb_prev = parent; - p = &(*p)->rb_right; - } else if (vma->vm_end < pvma->vm_end) - p = &(*p)->rb_left; - else if (vma->vm_end > pvma->vm_end) { - rb_prev = parent; - p = &(*p)->rb_right; - } else if (vma < pvma) - p = &(*p)->rb_left; - else if (vma > pvma) { - rb_prev = parent; - p = &(*p)->rb_right; - } else - BUG(); - } - - rb_link_node(&vma->vm_rb, parent, p); - rb_insert_color(&vma->vm_rb, &mm->mm_rb); - - /* add VMA to the VMA list also */ - prev = NULL; - if (rb_prev) - prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); - + vma_mas_store(vma, &mas); __vma_link_list(mm, vma, prev); } @@ -634,6 +602,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) struct address_space *mapping; struct mm_struct *mm = vma->vm_mm; struct task_struct *curr = current; + MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0); mm->map_count--; for (i = 0; i < VMACACHE_SIZE; i++) { @@ -656,8 +625,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) } /* remove from the MM's tree and list */ - rb_erase(&vma->vm_rb, &mm->mm_rb); - + vma_mas_remove(vma, &mas); __vma_unlink_list(mm, vma); } @@ -681,24 +649,19 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma; + MA_STATE(mas, &mm->mm_mt, addr, addr); /* check the cache first */ vma = vmacache_find(mm, addr); if (likely(vma)) return vma; - /* trawl the list (there may be multiple mappings in which addr - * resides) */ - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (vma->vm_start > addr) - return NULL; - if (vma->vm_end > addr) { - vmacache_update(addr, vma); - return vma; - } - } + vma = mas_walk(&mas); - return NULL; + if (vma) + vmacache_update(addr, vma); + + return vma; } EXPORT_SYMBOL(find_vma); @@ -730,26 +693,23 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, { struct vm_area_struct *vma; unsigned long end = addr + len; + MA_STATE(mas, &mm->mm_mt, addr, addr); /* check the cache first */ vma = vmacache_find_exact(mm, addr, end); if (vma) return vma; - /* trawl the list (there may be multiple mappings in which addr - * resides) */ - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (vma->vm_start < addr) - continue; - if (vma->vm_start > addr) - return NULL; - if (vma->vm_end == end) { - vmacache_update(addr, vma); - return vma; - } - } + vma = mas_walk(&mas); + if (!vma) + return NULL; + if (vma->vm_start != addr) + return NULL; + if (vma->vm_end != end) + return NULL; - return NULL; + vmacache_update(addr, vma); + return vma; } /* @@ -1546,6 +1506,7 @@ void exit_mmap(struct mm_struct *mm) delete_vma(mm, vma); cond_resched(); } + __mt_destroy(&mm->mm_mt); } int vm_brk(unsigned long addr, unsigned long len) diff --git a/mm/util.c b/mm/util.c index 8d944ce71e94..10effe256dfa 100644 --- a/mm/util.c +++ b/mm/util.c @@ -288,6 +288,8 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, vma->vm_next = next; if (next) next->vm_prev = vma; + else + mm->highest_vm_end = vm_end_gap(vma); } void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma) @@ -300,8 +302,14 @@ void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma) prev->vm_next = next; else mm->mmap = next; - if (next) + if (next) { next->vm_prev = prev; + } else { + if (prev) + mm->highest_vm_end = vm_end_gap(prev); + else + mm->highest_vm_end = 0; + } } /* Check if the vma is being used as a stack by this task */ -- cgit v1.2.3 From 7ccf089b262be2c7f2cdc6c08412a1939a812102 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:49 +0000 Subject: xen: use vma_lookup() in privcmd_ioctl_mmap() vma_lookup() walks the VMA tree for a specific value, find_vma() will search the tree after walking to a specific value. It is more efficient to only walk to the requested value since privcmd_ioctl_mmap() will exit the loop if vm_start != msg->va. Link: https://lkml.kernel.org/r/20220906194824.2110408-20-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/xen/privcmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index e88e8f6f0a33..fae50a24630b 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -282,7 +282,7 @@ static long privcmd_ioctl_mmap(struct file *file, void __user *udata) struct page, lru); struct privcmd_mmap_entry *msg = page_address(page); - vma = find_vma(mm, msg->va); + vma = vma_lookup(mm, msg->va); rc = -EINVAL; if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data) -- cgit v1.2.3 From d9fa0e37cdd47cfb71f5fa7a599ef5b9e32c55ed Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:55 +0000 Subject: cxl: remove vma linked list walk Use the VMA iterator instead. This requires a little restructuring of the surrounding code to hoist the mm to the caller. That turns cxl_prefault_one() into a trivial function, so call cxl_fault_segment() directly. Link: https://lkml.kernel.org/r/20220906194824.2110408-38-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/misc/cxl/fault.c | 45 +++++++++++++++------------------------------ 1 file changed, 15 insertions(+), 30 deletions(-) (limited to 'drivers') diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c index 60c829113299..2c64f55cf01f 100644 --- a/drivers/misc/cxl/fault.c +++ b/drivers/misc/cxl/fault.c @@ -280,22 +280,6 @@ void cxl_handle_fault(struct work_struct *fault_work) mmput(mm); } -static void cxl_prefault_one(struct cxl_context *ctx, u64 ea) -{ - struct mm_struct *mm; - - mm = get_mem_context(ctx); - if (mm == NULL) { - pr_devel("cxl_prefault_one unable to get mm %i\n", - pid_nr(ctx->pid)); - return; - } - - cxl_fault_segment(ctx, mm, ea); - - mmput(mm); -} - static u64 next_segment(u64 ea, u64 vsid) { if (vsid & SLB_VSID_B_1T) @@ -306,23 +290,16 @@ static u64 next_segment(u64 ea, u64 vsid) return ea + 1; } -static void cxl_prefault_vma(struct cxl_context *ctx) +static void cxl_prefault_vma(struct cxl_context *ctx, struct mm_struct *mm) { u64 ea, last_esid = 0; struct copro_slb slb; + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; int rc; - struct mm_struct *mm; - - mm = get_mem_context(ctx); - if (mm == NULL) { - pr_devel("cxl_prefault_vm unable to get mm %i\n", - pid_nr(ctx->pid)); - return; - } mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { for (ea = vma->vm_start; ea < vma->vm_end; ea = next_segment(ea, slb.vsid)) { rc = copro_calculate_slb(mm, ea, &slb); @@ -337,20 +314,28 @@ static void cxl_prefault_vma(struct cxl_context *ctx) } } mmap_read_unlock(mm); - - mmput(mm); } void cxl_prefault(struct cxl_context *ctx, u64 wed) { + struct mm_struct *mm = get_mem_context(ctx); + + if (mm == NULL) { + pr_devel("cxl_prefault unable to get mm %i\n", + pid_nr(ctx->pid)); + return; + } + switch (ctx->afu->prefault_mode) { case CXL_PREFAULT_WED: - cxl_prefault_one(ctx, wed); + cxl_fault_segment(ctx, mm, wed); break; case CXL_PREFAULT_ALL: - cxl_prefault_vma(ctx); + cxl_prefault_vma(ctx, mm); break; default: break; } + + mmput(mm); } -- cgit v1.2.3 From df724cedcfd7ce6638f40903144902a3e29fcec7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:55 +0000 Subject: optee: remove vma linked list walk Use the VMA iterator instead. Change the calling convention of __check_mem_type() to pass in the mm instead of the first vma in the range. Link: https://lkml.kernel.org/r/20220906194824.2110408-39-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/tee/optee/call.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/tee/optee/call.c b/drivers/tee/optee/call.c index 28f87cd8b3ed..290b1bb0e9cd 100644 --- a/drivers/tee/optee/call.c +++ b/drivers/tee/optee/call.c @@ -492,15 +492,18 @@ static bool is_normal_memory(pgprot_t p) #endif } -static int __check_mem_type(struct vm_area_struct *vma, unsigned long end) +static int __check_mem_type(struct mm_struct *mm, unsigned long start, + unsigned long end) { - while (vma && is_normal_memory(vma->vm_page_prot)) { - if (vma->vm_end >= end) - return 0; - vma = vma->vm_next; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, start); + + for_each_vma_range(vmi, vma, end) { + if (!is_normal_memory(vma->vm_page_prot)) + return -EINVAL; } - return -EINVAL; + return 0; } int optee_check_mem_type(unsigned long start, size_t num_pages) @@ -516,8 +519,7 @@ int optee_check_mem_type(unsigned long start, size_t num_pages) return 0; mmap_read_lock(mm); - rc = __check_mem_type(find_vma(mm, start), - start + num_pages * PAGE_SIZE); + rc = __check_mem_type(mm, start, start + num_pages * PAGE_SIZE); mmap_read_unlock(mm); return rc; -- cgit v1.2.3 From f683b9d613193362ceb954c216f663a43c027302 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:49:04 +0000 Subject: i915: use the VMA iterator Replace the linked list in probe_range() with the VMA iterator. Link: https://lkml.kernel.org/r/20220906194824.2110408-65-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c index 8423df021b71..d4398948f016 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c @@ -426,12 +426,11 @@ static const struct drm_i915_gem_object_ops i915_gem_userptr_ops = { static int probe_range(struct mm_struct *mm, unsigned long addr, unsigned long len) { - const unsigned long end = addr + len; + VMA_ITERATOR(vmi, mm, addr); struct vm_area_struct *vma; - int ret = -EFAULT; mmap_read_lock(mm); - for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) { + for_each_vma_range(vmi, vma, addr + len) { /* Check for holes, note that we also update the addr below */ if (vma->vm_start > addr) break; @@ -439,16 +438,13 @@ probe_range(struct mm_struct *mm, unsigned long addr, unsigned long len) if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) break; - if (vma->vm_end >= end) { - ret = 0; - break; - } - addr = vma->vm_end; } mmap_read_unlock(mm); - return ret; + if (vma) + return -EFAULT; + return 0; } /* -- cgit v1.2.3 From f635725c3905e755a8c3e2dc8cab7fcd0d38977f Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 13 Sep 2022 00:27:44 +0900 Subject: zram: do not waste zram_table_entry flags bits zram_table_entry::flags stores object size in the lower bits and zram pageflags in the upper bits. However, for some reason, we use 24 lower bits, while maximum zram object size is PAGE_SIZE, which requires PAGE_SHIFT bits (up to 16 on arm64). This wastes 24 - PAGE_SHIFT bits that we can use for additional zram pageflags instead. Also add a BUILD_BUG_ON() to alert us should we run out of bits in zram_table_entry::flags. Link: https://lkml.kernel.org/r/20220912152744.527438-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Reviewed-by: Brian Geffon Acked-by: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 2 ++ drivers/block/zram/zram_drv.h | 15 +++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 607f4634c27d..eb021db21ddf 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2131,6 +2131,8 @@ static int __init zram_init(void) { int ret; + BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > BITS_PER_LONG); + ret = cpuhp_setup_state_multi(CPUHP_ZCOMP_PREPARE, "block/zram:prepare", zcomp_cpu_up_prepare, zcomp_cpu_dead); if (ret < 0) diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 80c3b43b4828..a2bda53020fd 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -30,16 +30,15 @@ /* - * The lower ZRAM_FLAG_SHIFT bits of table.flags is for - * object size (excluding header), the higher bits is for - * zram_pageflags. + * ZRAM is mainly used for memory efficiency so we want to keep memory + * footprint small and thus squeeze size and zram pageflags into a flags + * member. The lower ZRAM_FLAG_SHIFT bits is for object size (excluding + * header), which cannot be larger than PAGE_SIZE (requiring PAGE_SHIFT + * bits), the higher bits are for zram_pageflags. * - * zram is mainly used for memory efficiency so we want to keep memory - * footprint small so we can squeeze size and flags into a field. - * The lower ZRAM_FLAG_SHIFT bits is for object size (excluding header), - * the higher bits is for zram_pageflags. + * We use BUILD_BUG_ON() to make sure that zram pageflags don't overflow. */ -#define ZRAM_FLAG_SHIFT 24 +#define ZRAM_FLAG_SHIFT (PAGE_SHIFT + 1) /* Flags for zram pages (table[page_no].flags) */ enum zram_pageflags { -- cgit v1.2.3 From f9bceb2f4114fe9a9725c922f9f1500d173d4763 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 14 Sep 2022 14:20:33 +0900 Subject: zram: keep comments within 80-columns limit Several trivial fixups (that I should have spotted during review). Link: https://lkml.kernel.org/r/20220914052033.838050-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index eb021db21ddf..43eeef2b9fbe 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -329,8 +329,8 @@ static ssize_t idle_store(struct device *dev, if (!sysfs_streq(buf, "all")) { /* - * If it did not parse as 'all' try to treat it as an integer when - * we have memory tracking enabled. + * If it did not parse as 'all' try to treat it as an integer + * when we have memory tracking enabled. */ u64 age_sec; @@ -345,7 +345,10 @@ static ssize_t idle_store(struct device *dev, if (!init_done(zram)) goto out_unlock; - /* A cutoff_time of 0 marks everything as idle, this is the "all" behavior */ + /* + * A cutoff_time of 0 marks everything as idle, this is the + * "all" behavior. + */ mark_idle(zram, cutoff_time); rv = len; @@ -1416,11 +1419,11 @@ compress_again: if (comp_len != PAGE_SIZE) goto compress_again; /* - * If the page is not compressible, you need to acquire the lock and - * execute the code below. The zcomp_stream_get() call is needed to - * disable the cpu hotplug and grab the zstrm buffer back. - * It is necessary that the dereferencing of the zstrm variable below - * occurs correctly. + * If the page is not compressible, you need to acquire the + * lock and execute the code below. The zcomp_stream_get() + * call is needed to disable the cpu hotplug and grab the + * zstrm buffer back. It is necessary that the dereferencing + * of the zstrm variable below occurs correctly. */ zstrm = zcomp_stream_get(zram->comp); } -- cgit v1.2.3 From b958d4d08fbfe938af24ea06ebbf839b48fa18a9 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 14 Sep 2022 15:26:02 +0800 Subject: mm: hugetlb: simplify per-node sysfs creation and removal Patch series "simplify handling of per-node sysfs creation and removal", v4. This patch (of 2): The following commit offload per-node sysfs creation and removal to a kworker and did not say why it is needed. And it also said "I don't know that this is absolutely required". It seems like the author was not sure as well. Since it only complicates the code, this patch will revert the changes to simplify the code. 39da08cb074c ("hugetlb: offload per node attribute registrations") We could use memory hotplug notifier to do per-node sysfs creation and removal instead of inserting those operations to node registration and unregistration. Then, it can reduce the code coupling between node.c and hugetlb.c. Also, it can simplify the code. Link: https://lkml.kernel.org/r/20220914072603.60293-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220914072603.60293-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Mike Kravetz Acked-by: David Hildenbrand Cc: Andi Kleen Cc: Greg Kroah-Hartman Cc: Muchun Song Cc: Oscar Salvador Cc: Rafael J. Wysocki Signed-off-by: Andrew Morton --- drivers/base/node.c | 139 ++------------------------------------------------- include/linux/node.h | 24 ++------- mm/hugetlb.c | 35 ++++++++----- 3 files changed, 30 insertions(+), 168 deletions(-) (limited to 'drivers') diff --git a/drivers/base/node.c b/drivers/base/node.c index eb0f43784c2b..ed391cb09999 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -587,64 +587,9 @@ static const struct attribute_group *node_dev_groups[] = { NULL }; -#ifdef CONFIG_HUGETLBFS -/* - * hugetlbfs per node attributes registration interface: - * When/if hugetlb[fs] subsystem initializes [sometime after this module], - * it will register its per node attributes for all online nodes with - * memory. It will also call register_hugetlbfs_with_node(), below, to - * register its attribute registration functions with this node driver. - * Once these hooks have been initialized, the node driver will call into - * the hugetlb module to [un]register attributes for hot-plugged nodes. - */ -static node_registration_func_t __hugetlb_register_node; -static node_registration_func_t __hugetlb_unregister_node; - -static inline bool hugetlb_register_node(struct node *node) -{ - if (__hugetlb_register_node && - node_state(node->dev.id, N_MEMORY)) { - __hugetlb_register_node(node); - return true; - } - return false; -} - -static inline void hugetlb_unregister_node(struct node *node) -{ - if (__hugetlb_unregister_node) - __hugetlb_unregister_node(node); -} - -void register_hugetlbfs_with_node(node_registration_func_t doregister, - node_registration_func_t unregister) -{ - __hugetlb_register_node = doregister; - __hugetlb_unregister_node = unregister; -} -#else -static inline void hugetlb_register_node(struct node *node) {} - -static inline void hugetlb_unregister_node(struct node *node) {} -#endif - static void node_device_release(struct device *dev) { - struct node *node = to_node(dev); - -#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_HUGETLBFS) - /* - * We schedule the work only when a memory section is - * onlined/offlined on this node. When we come here, - * all the memory on this node has been offlined, - * so we won't enqueue new work to this work. - * - * The work is using node->node_work, so we should - * flush work before freeing the memory. - */ - flush_work(&node->node_work); -#endif - kfree(node); + kfree(to_node(dev)); } /* @@ -665,11 +610,9 @@ static int register_node(struct node *node, int num) if (error) put_device(&node->dev); - else { - hugetlb_register_node(node); - + else compaction_register_node(node); - } + return error; } @@ -683,7 +626,6 @@ static int register_node(struct node *node, int num) void unregister_node(struct node *node) { compaction_unregister_node(node); - hugetlb_unregister_node(node); /* no-op, if memoryless node */ node_remove_accesses(node); node_remove_caches(node); device_unregister(&node->dev); @@ -905,74 +847,8 @@ void register_memory_blocks_under_node(int nid, unsigned long start_pfn, (void *)&nid, func); return; } - -#ifdef CONFIG_HUGETLBFS -/* - * Handle per node hstate attribute [un]registration on transistions - * to/from memoryless state. - */ -static void node_hugetlb_work(struct work_struct *work) -{ - struct node *node = container_of(work, struct node, node_work); - - /* - * We only get here when a node transitions to/from memoryless state. - * We can detect which transition occurred by examining whether the - * node has memory now. hugetlb_register_node() already check this - * so we try to register the attributes. If that fails, then the - * node has transitioned to memoryless, try to unregister the - * attributes. - */ - if (!hugetlb_register_node(node)) - hugetlb_unregister_node(node); -} - -static void init_node_hugetlb_work(int nid) -{ - INIT_WORK(&node_devices[nid]->node_work, node_hugetlb_work); -} - -static int node_memory_callback(struct notifier_block *self, - unsigned long action, void *arg) -{ - struct memory_notify *mnb = arg; - int nid = mnb->status_change_nid; - - switch (action) { - case MEM_ONLINE: - case MEM_OFFLINE: - /* - * offload per node hstate [un]registration to a work thread - * when transitioning to/from memoryless state. - */ - if (nid != NUMA_NO_NODE) - schedule_work(&node_devices[nid]->node_work); - break; - - case MEM_GOING_ONLINE: - case MEM_GOING_OFFLINE: - case MEM_CANCEL_ONLINE: - case MEM_CANCEL_OFFLINE: - default: - break; - } - - return NOTIFY_OK; -} -#endif /* CONFIG_HUGETLBFS */ #endif /* CONFIG_MEMORY_HOTPLUG */ -#if !defined(CONFIG_MEMORY_HOTPLUG) || !defined(CONFIG_HUGETLBFS) -static inline int node_memory_callback(struct notifier_block *self, - unsigned long action, void *arg) -{ - return NOTIFY_OK; -} - -static void init_node_hugetlb_work(int nid) { } - -#endif - int __register_one_node(int nid) { int error; @@ -991,8 +867,6 @@ int __register_one_node(int nid) } INIT_LIST_HEAD(&node_devices[nid]->access_list); - /* initialize work queue for memory hot plug */ - init_node_hugetlb_work(nid); node_init_caches(nid); return error; @@ -1063,13 +937,8 @@ static const struct attribute_group *cpu_root_attr_groups[] = { NULL, }; -#define NODE_CALLBACK_PRI 2 /* lower than SLAB */ void __init node_dev_init(void) { - static struct notifier_block node_memory_callback_nb = { - .notifier_call = node_memory_callback, - .priority = NODE_CALLBACK_PRI, - }; int ret, i; BUILD_BUG_ON(ARRAY_SIZE(node_state_attr) != NR_NODE_STATES); @@ -1079,8 +948,6 @@ void __init node_dev_init(void) if (ret) panic("%s() failed to register subsystem: %d\n", __func__, ret); - register_hotmemory_notifier(&node_memory_callback_nb); - /* * Create all node devices, which will properly link the node * to applicable memory block devices and already created cpu devices. diff --git a/include/linux/node.h b/include/linux/node.h index 9ec680dd607f..427a5975cf40 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -2,15 +2,15 @@ /* * include/linux/node.h - generic node definition * - * This is mainly for topological representation. We define the - * basic 'struct node' here, which can be embedded in per-arch + * This is mainly for topological representation. We define the + * basic 'struct node' here, which can be embedded in per-arch * definitions of processors. * * Basic handling of the devices is done in drivers/base/node.c - * and system devices are handled in drivers/base/sys.c. + * and system devices are handled in drivers/base/sys.c. * * Nodes are exported via driverfs in the class/node/devices/ - * directory. + * directory. */ #ifndef _LINUX_NODE_H_ #define _LINUX_NODE_H_ @@ -18,7 +18,6 @@ #include #include #include -#include /** * struct node_hmem_attrs - heterogeneous memory performance attributes @@ -84,10 +83,6 @@ static inline void node_set_perf_attrs(unsigned int nid, struct node { struct device dev; struct list_head access_list; - -#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_HUGETLBFS) - struct work_struct node_work; -#endif #ifdef CONFIG_HMEM_REPORTING struct list_head cache_attrs; struct device *cache_dev; @@ -96,7 +91,6 @@ struct node { struct memory_block; extern struct node *node_devices[]; -typedef void (*node_registration_func_t)(struct node *); #if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_NUMA) void register_memory_blocks_under_node(int nid, unsigned long start_pfn, @@ -144,11 +138,6 @@ extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk); extern int register_memory_node_under_compute_node(unsigned int mem_nid, unsigned int cpu_nid, unsigned access); - -#ifdef CONFIG_HUGETLBFS -extern void register_hugetlbfs_with_node(node_registration_func_t doregister, - node_registration_func_t unregister); -#endif #else static inline void node_dev_init(void) { @@ -176,11 +165,6 @@ static inline int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) static inline void unregister_memory_block_under_nodes(struct memory_block *mem_blk) { } - -static inline void register_hugetlbfs_with_node(node_registration_func_t reg, - node_registration_func_t unreg) -{ -} #endif #define to_node(device) container_of(device, struct node, dev) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6af123374e98..397f2988c37f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -4000,6 +4001,23 @@ static void hugetlb_register_node(struct node *node) } } +static int __meminit hugetlb_memory_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + struct memory_notify *mnb = arg; + int nid = mnb->status_change_nid; + + if (nid == NUMA_NO_NODE) + return NOTIFY_DONE; + + if (action == MEM_GOING_ONLINE) + hugetlb_register_node(node_devices[nid]); + else if (action == MEM_CANCEL_ONLINE || action == MEM_OFFLINE) + hugetlb_unregister_node(node_devices[nid]); + + return NOTIFY_OK; +} + /* * hugetlb init time: register hstate attributes for all registered node * devices of nodes that have memory. All on-line nodes should have @@ -4009,18 +4027,11 @@ static void __init hugetlb_register_all_nodes(void) { int nid; - for_each_node_state(nid, N_MEMORY) { - struct node *node = node_devices[nid]; - if (node->dev.id == nid) - hugetlb_register_node(node); - } - - /* - * Let the node device driver know we're here so it can - * [un]register hstate attributes on node hotplug. - */ - register_hugetlbfs_with_node(hugetlb_register_node, - hugetlb_unregister_node); + get_online_mems(); + hotplug_memory_notifier(hugetlb_memory_callback, 0); + for_each_node_state(nid, N_MEMORY) + hugetlb_register_node(node_devices[nid]); + put_online_mems(); } #else /* !CONFIG_NUMA */ -- cgit v1.2.3 From a4a00b451ef5e1deb959088e25e248f4ee399792 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 14 Sep 2022 15:26:03 +0800 Subject: mm: hugetlb: eliminate memory-less nodes handling The memory-notify-based approach aims to handle meory-less nodes, however, it just adds the complexity of code as pointed by David in thread [1]. The handling of memory-less nodes is introduced by commit 4faf8d950ec4 ("hugetlb: handle memory hot-plug events"). >From its commit message, we cannot find any necessity of handling this case. So, we can simply register/unregister sysfs entries in register_node/unregister_node to simlify the code. BTW, hotplug callback added because in hugetlb_register_all_nodes() we register sysfs nodes only for N_MEMORY nodes, seeing commit 9b5e5d0fdc91, which said it was a preparation for handling memory-less nodes via memory hotplug. Since we want to remove memory hotplug, so make sure we only register per-node sysfs for online (N_ONLINE) nodes in hugetlb_register_all_nodes(). https://lore.kernel.org/linux-mm/60933ffc-b850-976c-78a0-0ee6e0ea9ef0@redhat.com/ [1] Link: https://lkml.kernel.org/r/20220914072603.60293-3-songmuchun@bytedance.com Suggested-by: David Hildenbrand Signed-off-by: Muchun Song Acked-by: David Hildenbrand Cc: Andi Kleen Cc: Greg Kroah-Hartman Cc: Mike Kravetz Cc: Oscar Salvador Cc: Rafael J. Wysocki Signed-off-by: Andrew Morton --- drivers/base/node.c | 8 ++++-- include/linux/hugetlb.h | 14 ++++++++++ mm/hugetlb.c | 70 ++++++++++++++++++++----------------------------- 3 files changed, 49 insertions(+), 43 deletions(-) (limited to 'drivers') diff --git a/drivers/base/node.c b/drivers/base/node.c index ed391cb09999..80b1e91b9608 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -20,6 +20,7 @@ #include #include #include +#include static struct bus_type node_subsys = { .name = "node", @@ -608,10 +609,12 @@ static int register_node(struct node *node, int num) node->dev.groups = node_dev_groups; error = device_register(&node->dev); - if (error) + if (error) { put_device(&node->dev); - else + } else { + hugetlb_register_node(node); compaction_register_node(node); + } return error; } @@ -625,6 +628,7 @@ static int register_node(struct node *node, int num) */ void unregister_node(struct node *node) { + hugetlb_unregister_node(node); compaction_unregister_node(node); node_remove_accesses(node); node_remove_caches(node); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 57e72954a482..6d7f39754060 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -16,6 +16,7 @@ struct ctl_table; struct user_struct; struct mmu_gather; +struct node; #ifndef is_hugepd typedef struct { unsigned long pd; } hugepd_t; @@ -935,6 +936,11 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, } #endif +#ifdef CONFIG_NUMA +void hugetlb_register_node(struct node *node); +void hugetlb_unregister_node(struct node *node); +#endif + #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; @@ -1109,6 +1115,14 @@ static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { } + +static inline void hugetlb_register_node(struct node *node) +{ +} + +static inline void hugetlb_unregister_node(struct node *node) +{ +} #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 397f2988c37f..0b1ab5af939e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3871,24 +3871,8 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, return 0; } -static void __init hugetlb_sysfs_init(void) -{ - struct hstate *h; - int err; - - hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); - if (!hugepages_kobj) - return; - - for_each_hstate(h) { - err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, - hstate_kobjs, &hstate_attr_group); - if (err) - pr_err("HugeTLB: Unable to add hstate %s", h->name); - } -} - #ifdef CONFIG_NUMA +static bool hugetlb_sysfs_initialized __ro_after_init; /* * node_hstate/s - associate per node hstate attributes, via their kobjects, @@ -3944,7 +3928,7 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) * Unregister hstate attributes from a single node device. * No-op if no hstate attributes attached. */ -static void hugetlb_unregister_node(struct node *node) +void hugetlb_unregister_node(struct node *node) { struct hstate *h; struct node_hstate *nhs = &node_hstates[node->dev.id]; @@ -3974,12 +3958,15 @@ static void hugetlb_unregister_node(struct node *node) * Register hstate attributes for a single node device. * No-op if attributes already registered. */ -static void hugetlb_register_node(struct node *node) +void hugetlb_register_node(struct node *node) { struct hstate *h; struct node_hstate *nhs = &node_hstates[node->dev.id]; int err; + if (!hugetlb_sysfs_initialized) + return; + if (nhs->hugepages_kobj) return; /* already allocated */ @@ -4001,23 +3988,6 @@ static void hugetlb_register_node(struct node *node) } } -static int __meminit hugetlb_memory_callback(struct notifier_block *self, - unsigned long action, void *arg) -{ - struct memory_notify *mnb = arg; - int nid = mnb->status_change_nid; - - if (nid == NUMA_NO_NODE) - return NOTIFY_DONE; - - if (action == MEM_GOING_ONLINE) - hugetlb_register_node(node_devices[nid]); - else if (action == MEM_CANCEL_ONLINE || action == MEM_OFFLINE) - hugetlb_unregister_node(node_devices[nid]); - - return NOTIFY_OK; -} - /* * hugetlb init time: register hstate attributes for all registered node * devices of nodes that have memory. All on-line nodes should have @@ -4027,11 +3997,8 @@ static void __init hugetlb_register_all_nodes(void) { int nid; - get_online_mems(); - hotplug_memory_notifier(hugetlb_memory_callback, 0); - for_each_node_state(nid, N_MEMORY) + for_each_online_node(nid) hugetlb_register_node(node_devices[nid]); - put_online_mems(); } #else /* !CONFIG_NUMA */ @@ -4055,6 +4022,28 @@ static inline __init void hugetlb_cma_check(void) } #endif +static void __init hugetlb_sysfs_init(void) +{ + struct hstate *h; + int err; + + hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); + if (!hugepages_kobj) + return; + + for_each_hstate(h) { + err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, + hstate_kobjs, &hstate_attr_group); + if (err) + pr_err("HugeTLB: Unable to add hstate %s", h->name); + } + +#ifdef CONFIG_NUMA + hugetlb_sysfs_initialized = true; +#endif + hugetlb_register_all_nodes(); +} + static int __init hugetlb_init(void) { int i; @@ -4109,7 +4098,6 @@ static int __init hugetlb_init(void) report_hugepages(); hugetlb_sysfs_init(); - hugetlb_register_all_nodes(); hugetlb_cgroup_file_init(); #ifdef CONFIG_SMP -- cgit v1.2.3 From 6e9f05dc66f951e8812c84a3ef148b601e3f8f45 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:44 +0200 Subject: libnvdimm/pfn_dev: increase MAX_STRUCT_PAGE_SIZE KMSAN adds extra metadata fields to struct page, so it does not fit into 64 bytes anymore. This change leads to increased memory consumption of the nvdimm driver, regardless of whether the kernel is built with KMSAN or not. Link: https://lkml.kernel.org/r/20220915150417.722975-11-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- drivers/nvdimm/nd.h | 2 +- drivers/nvdimm/pfn_devs.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index ec5219680092..85ca5b4da3cf 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -652,7 +652,7 @@ void devm_namespace_disable(struct device *dev, struct nd_namespace_common *ndns); #if IS_ENABLED(CONFIG_ND_CLAIM) /* max struct page size independent of kernel config */ -#define MAX_STRUCT_PAGE_SIZE 64 +#define MAX_STRUCT_PAGE_SIZE 128 int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap); #else static inline int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 0e92ab4b3283..61af072ac98f 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -787,7 +787,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) * when populating the vmemmap. This *should* be equal to * PMD_SIZE for most architectures. * - * Also make sure size of struct page is less than 64. We + * Also make sure size of struct page is less than 128. We * want to make sure we use large enough size here so that * we don't have a dynamic reserve space depending on * struct page size. But we also want to make sure we notice -- cgit v1.2.3 From 79dbd006a6d6f51777ba4948046561b6d9270504 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:46 +0200 Subject: kmsan: disable instrumentation of unsupported common kernel code EFI stub cannot be linked with KMSAN runtime, so we disable instrumentation for it. Instrumenting kcov, stackdepot or lockdep leads to infinite recursion caused by instrumentation hooks calling instrumented code again. Link: https://lkml.kernel.org/r/20220915150417.722975-13-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- drivers/firmware/efi/libstub/Makefile | 1 + kernel/Makefile | 1 + kernel/locking/Makefile | 3 ++- lib/Makefile | 3 +++ 4 files changed, 7 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index d0537573501e..81432d0c904b 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -46,6 +46,7 @@ GCOV_PROFILE := n # Sanitizer runtimes are unavailable and cannot be linked here. KASAN_SANITIZE := n KCSAN_SANITIZE := n +KMSAN_SANITIZE := n UBSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y diff --git a/kernel/Makefile b/kernel/Makefile index 318789c728d3..d754e0be1176 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -38,6 +38,7 @@ KCOV_INSTRUMENT_kcov.o := n KASAN_SANITIZE_kcov.o := n KCSAN_SANITIZE_kcov.o := n UBSAN_SANITIZE_kcov.o := n +KMSAN_SANITIZE_kcov.o := n CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector # Don't instrument error handlers diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index d51cabf28f38..ea925731fa40 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -5,8 +5,9 @@ KCOV_INSTRUMENT := n obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o -# Avoid recursion lockdep -> KCSAN -> ... -> lockdep. +# Avoid recursion lockdep -> sanitizer -> ... -> lockdep. KCSAN_SANITIZE_lockdep.o := n +KMSAN_SANITIZE_lockdep.o := n ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) diff --git a/lib/Makefile b/lib/Makefile index d7d94102991b..42e185cdecd0 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -270,6 +270,9 @@ obj-$(CONFIG_POLYNOMIAL) += polynomial.o CFLAGS_stackdepot.o += -fno-builtin obj-$(CONFIG_STACKDEPOT) += stackdepot.o KASAN_SANITIZE_stackdepot.o := n +# In particular, instrumenting stackdepot.c with KMSAN will result in infinite +# recursion. +KMSAN_SANITIZE_stackdepot.o := n KCOV_INSTRUMENT_stackdepot.o := n obj-$(CONFIG_REF_TRACKER) += ref_tracker.o -- cgit v1.2.3 From 38317724f6a85572af373229a27e214d5282ddf8 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:54 +0200 Subject: input: libps2: mark data received in __ps2_command() as initialized KMSAN does not know that the device initializes certain bytes in ps2dev->cmdbuf. Call kmsan_unpoison_memory() to explicitly mark them as initialized. Link: https://lkml.kernel.org/r/20220915150417.722975-21-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- drivers/input/serio/libps2.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/input/serio/libps2.c b/drivers/input/serio/libps2.c index 250e213cc80c..3e19344eda93 100644 --- a/drivers/input/serio/libps2.c +++ b/drivers/input/serio/libps2.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -294,9 +295,11 @@ int __ps2_command(struct ps2dev *ps2dev, u8 *param, unsigned int command) serio_pause_rx(ps2dev->serio); - if (param) + if (param) { for (i = 0; i < receive; i++) param[i] = ps2dev->cmdbuf[(receive - 1) - i]; + kmsan_unpoison_memory(param, receive); + } if (ps2dev->cmdcnt && (command != PS2_CMD_RESET_BAT || ps2dev->cmdcnt != 1)) { -- cgit v1.2.3 From 88938359e2dfe1f5f5840268b98935948db8fbd9 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:56 +0200 Subject: virtio: kmsan: check/unpoison scatterlist in vring_map_one_sg() If vring doesn't use the DMA API, KMSAN is unable to tell whether the memory is initialized by hardware. Explicitly call kmsan_handle_dma() from vring_map_one_sg() in this case to prevent false positives. Link: https://lkml.kernel.org/r/20220915150417.722975-23-glider@google.com Signed-off-by: Alexander Potapenko Acked-by: Michael S. Tsirkin Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- drivers/virtio/virtio_ring.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 4620e9d79dde..8974c34b40fd 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -352,8 +353,15 @@ static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist *sg, enum dma_data_direction direction) { - if (!vq->use_dma_api) + if (!vq->use_dma_api) { + /* + * If DMA is not used, KMSAN doesn't know that the scatterlist + * is initialized by the hardware. Explicitly check/unpoison it + * depending on the direction. + */ + kmsan_handle_dma(sg_page(sg), sg->offset, sg->length, direction); return (dma_addr_t)sg_phys(sg); + } /* * We can't use dma_map_sg, because we don't use scatterlists in -- cgit v1.2.3 From 553a80188a5d7164d2b0688b06bf3fe297023bfe Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:57 +0200 Subject: kmsan: handle memory sent to/from USB Depending on the value of is_out kmsan_handle_urb() KMSAN either marks the data copied to the kernel from a USB device as initialized, or checks the data sent to the device for being initialized. Link: https://lkml.kernel.org/r/20220915150417.722975-24-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- drivers/usb/core/urb.c | 2 ++ include/linux/kmsan.h | 15 +++++++++++++++ mm/kmsan/hooks.c | 16 ++++++++++++++++ 3 files changed, 33 insertions(+) (limited to 'drivers') diff --git a/drivers/usb/core/urb.c b/drivers/usb/core/urb.c index 33d62d7e3929..9f3c54032556 100644 --- a/drivers/usb/core/urb.c +++ b/drivers/usb/core/urb.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -426,6 +427,7 @@ int usb_submit_urb(struct urb *urb, gfp_t mem_flags) URB_SETUP_MAP_SINGLE | URB_SETUP_MAP_LOCAL | URB_DMA_SG_COMBINED); urb->transfer_flags |= (is_out ? URB_DIR_OUT : URB_DIR_IN); + kmsan_handle_urb(urb, is_out); if (xfertype != USB_ENDPOINT_XFER_CONTROL && dev->state < USB_STATE_CONFIGURED) diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index dac296da45c5..c473e0e21683 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -18,6 +18,7 @@ struct page; struct kmem_cache; struct task_struct; struct scatterlist; +struct urb; #ifdef CONFIG_KMSAN @@ -203,6 +204,16 @@ void kmsan_handle_dma(struct page *page, size_t offset, size_t size, void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, enum dma_data_direction dir); +/** + * kmsan_handle_urb() - Handle a USB data transfer. + * @urb: struct urb pointer. + * @is_out: data transfer direction (true means output to hardware). + * + * If @is_out is true, KMSAN checks the transfer buffer of @urb. Otherwise, + * KMSAN initializes the transfer buffer. + */ +void kmsan_handle_urb(const struct urb *urb, bool is_out); + #else static inline void kmsan_init_shadow(void) @@ -295,6 +306,10 @@ static inline void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, { } +static inline void kmsan_handle_urb(const struct urb *urb, bool is_out) +{ +} + #endif #endif /* _LINUX_KMSAN_H */ diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 563c09443a37..79d7e73e2cfd 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "../internal.h" #include "../slab.h" @@ -245,6 +246,21 @@ void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy, } EXPORT_SYMBOL(kmsan_copy_to_user); +/* Helper function to check an URB. */ +void kmsan_handle_urb(const struct urb *urb, bool is_out) +{ + if (!urb) + return; + if (is_out) + kmsan_internal_check_memory(urb->transfer_buffer, + urb->transfer_buffer_length, + /*user_addr*/ 0, REASON_SUBMIT_URB); + else + kmsan_internal_unpoison_memory(urb->transfer_buffer, + urb->transfer_buffer_length, + /*checked*/ false); +} + static void kmsan_handle_dma_page(const void *addr, size_t size, enum dma_data_direction dir) { -- cgit v1.2.3 From 440fed95ebc30420d1f7802c6578f95e18523140 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:00 +0200 Subject: crypto: kmsan: disable accelerated configs under KMSAN KMSAN is unable to understand when initialized values come from assembly. Disable accelerated configs in KMSAN builds to prevent false positive reports. Link: https://lkml.kernel.org/r/20220915150417.722975-27-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- crypto/Kconfig | 30 ++++++++++++++++++++++++++++++ drivers/net/Kconfig | 1 + 2 files changed, 31 insertions(+) (limited to 'drivers') diff --git a/crypto/Kconfig b/crypto/Kconfig index bb427a835e44..182fb817ebb5 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -319,6 +319,7 @@ config CRYPTO_CURVE25519 config CRYPTO_CURVE25519_X86 tristate "x86_64 accelerated Curve25519 scalar multiplication library" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_LIB_CURVE25519_GENERIC select CRYPTO_ARCH_HAVE_LIB_CURVE25519 @@ -367,11 +368,13 @@ config CRYPTO_AEGIS128 config CRYPTO_AEGIS128_SIMD bool "Support SIMD acceleration for AEGIS-128" depends on CRYPTO_AEGIS128 && ((ARM || ARM64) && KERNEL_MODE_NEON) + depends on !KMSAN # avoid false positives from assembly default y config CRYPTO_AEGIS128_AESNI_SSE2 tristate "AEGIS-128 AEAD algorithm (x86_64 AESNI+SSE2 implementation)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_AEAD select CRYPTO_SIMD help @@ -517,6 +520,7 @@ config CRYPTO_NHPOLY1305 config CRYPTO_NHPOLY1305_SSE2 tristate "NHPoly1305 hash function (x86_64 SSE2 implementation)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_NHPOLY1305 help SSE2 optimized implementation of the hash function used by the @@ -525,6 +529,7 @@ config CRYPTO_NHPOLY1305_SSE2 config CRYPTO_NHPOLY1305_AVX2 tristate "NHPoly1305 hash function (x86_64 AVX2 implementation)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_NHPOLY1305 help AVX2 optimized implementation of the hash function used by the @@ -649,6 +654,7 @@ config CRYPTO_CRC32C config CRYPTO_CRC32C_INTEL tristate "CRC32c INTEL hardware acceleration" depends on X86 + depends on !KMSAN # avoid false positives from assembly select CRYPTO_HASH help In Intel processor with SSE4.2 supported, the processor will @@ -689,6 +695,7 @@ config CRYPTO_CRC32 config CRYPTO_CRC32_PCLMUL tristate "CRC32 PCLMULQDQ hardware acceleration" depends on X86 + depends on !KMSAN # avoid false positives from assembly select CRYPTO_HASH select CRC32 help @@ -748,6 +755,7 @@ config CRYPTO_BLAKE2B config CRYPTO_BLAKE2S_X86 bool "BLAKE2s digest algorithm (x86 accelerated version)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_LIB_BLAKE2S_GENERIC select CRYPTO_ARCH_HAVE_LIB_BLAKE2S @@ -762,6 +770,7 @@ config CRYPTO_CRCT10DIF config CRYPTO_CRCT10DIF_PCLMUL tristate "CRCT10DIF PCLMULQDQ hardware acceleration" depends on X86 && 64BIT && CRC_T10DIF + depends on !KMSAN # avoid false positives from assembly select CRYPTO_HASH help For x86_64 processors with SSE4.2 and PCLMULQDQ supported, @@ -831,6 +840,7 @@ config CRYPTO_POLY1305 config CRYPTO_POLY1305_X86_64 tristate "Poly1305 authenticator algorithm (x86_64/SSE2/AVX2)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_LIB_POLY1305_GENERIC select CRYPTO_ARCH_HAVE_LIB_POLY1305 help @@ -920,6 +930,7 @@ config CRYPTO_SHA1 config CRYPTO_SHA1_SSSE3 tristate "SHA1 digest algorithm (SSSE3/AVX/AVX2/SHA-NI)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SHA1 select CRYPTO_HASH help @@ -931,6 +942,7 @@ config CRYPTO_SHA1_SSSE3 config CRYPTO_SHA256_SSSE3 tristate "SHA256 digest algorithm (SSSE3/AVX/AVX2/SHA-NI)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SHA256 select CRYPTO_HASH help @@ -943,6 +955,7 @@ config CRYPTO_SHA256_SSSE3 config CRYPTO_SHA512_SSSE3 tristate "SHA512 digest algorithm (SSSE3/AVX/AVX2)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SHA512 select CRYPTO_HASH help @@ -1168,6 +1181,7 @@ config CRYPTO_WP512 config CRYPTO_GHASH_CLMUL_NI_INTEL tristate "GHASH hash function (CLMUL-NI accelerated)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_CRYPTD help This is the x86_64 CLMUL-NI accelerated implementation of @@ -1228,6 +1242,7 @@ config CRYPTO_AES_TI config CRYPTO_AES_NI_INTEL tristate "AES cipher algorithms (AES-NI)" depends on X86 + depends on !KMSAN # avoid false positives from assembly select CRYPTO_AEAD select CRYPTO_LIB_AES select CRYPTO_ALGAPI @@ -1369,6 +1384,7 @@ config CRYPTO_BLOWFISH_COMMON config CRYPTO_BLOWFISH_X86_64 tristate "Blowfish cipher algorithm (x86_64)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_BLOWFISH_COMMON imply CRYPTO_CTR @@ -1399,6 +1415,7 @@ config CRYPTO_CAMELLIA config CRYPTO_CAMELLIA_X86_64 tristate "Camellia cipher algorithm (x86_64)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER imply CRYPTO_CTR help @@ -1415,6 +1432,7 @@ config CRYPTO_CAMELLIA_X86_64 config CRYPTO_CAMELLIA_AESNI_AVX_X86_64 tristate "Camellia cipher algorithm (x86_64/AES-NI/AVX)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_CAMELLIA_X86_64 select CRYPTO_SIMD @@ -1433,6 +1451,7 @@ config CRYPTO_CAMELLIA_AESNI_AVX_X86_64 config CRYPTO_CAMELLIA_AESNI_AVX2_X86_64 tristate "Camellia cipher algorithm (x86_64/AES-NI/AVX2)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_CAMELLIA_AESNI_AVX_X86_64 help Camellia cipher algorithm module (x86_64/AES-NI/AVX2). @@ -1478,6 +1497,7 @@ config CRYPTO_CAST5 config CRYPTO_CAST5_AVX_X86_64 tristate "CAST5 (CAST-128) cipher algorithm (x86_64/AVX)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_CAST5 select CRYPTO_CAST_COMMON @@ -1501,6 +1521,7 @@ config CRYPTO_CAST6 config CRYPTO_CAST6_AVX_X86_64 tristate "CAST6 (CAST-256) cipher algorithm (x86_64/AVX)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_CAST6 select CRYPTO_CAST_COMMON @@ -1534,6 +1555,7 @@ config CRYPTO_DES_SPARC64 config CRYPTO_DES3_EDE_X86_64 tristate "Triple DES EDE cipher algorithm (x86-64)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_LIB_DES imply CRYPTO_CTR @@ -1604,6 +1626,7 @@ config CRYPTO_CHACHA20 config CRYPTO_CHACHA20_X86_64 tristate "ChaCha stream cipher algorithms (x86_64/SSSE3/AVX2/AVX-512VL)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_LIB_CHACHA_GENERIC select CRYPTO_ARCH_HAVE_LIB_CHACHA @@ -1674,6 +1697,7 @@ config CRYPTO_SERPENT config CRYPTO_SERPENT_SSE2_X86_64 tristate "Serpent cipher algorithm (x86_64/SSE2)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_SERPENT select CRYPTO_SIMD @@ -1693,6 +1717,7 @@ config CRYPTO_SERPENT_SSE2_X86_64 config CRYPTO_SERPENT_SSE2_586 tristate "Serpent cipher algorithm (i586/SSE2)" depends on X86 && !64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_SERPENT select CRYPTO_SIMD @@ -1712,6 +1737,7 @@ config CRYPTO_SERPENT_SSE2_586 config CRYPTO_SERPENT_AVX_X86_64 tristate "Serpent cipher algorithm (x86_64/AVX)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_SERPENT select CRYPTO_SIMD @@ -1732,6 +1758,7 @@ config CRYPTO_SERPENT_AVX_X86_64 config CRYPTO_SERPENT_AVX2_X86_64 tristate "Serpent cipher algorithm (x86_64/AVX2)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SERPENT_AVX_X86_64 help Serpent cipher algorithm, by Anderson, Biham & Knudsen. @@ -1876,6 +1903,7 @@ config CRYPTO_TWOFISH_586 config CRYPTO_TWOFISH_X86_64 tristate "Twofish cipher algorithm (x86_64)" depends on (X86 || UML_X86) && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_ALGAPI select CRYPTO_TWOFISH_COMMON imply CRYPTO_CTR @@ -1893,6 +1921,7 @@ config CRYPTO_TWOFISH_X86_64 config CRYPTO_TWOFISH_X86_64_3WAY tristate "Twofish cipher algorithm (x86_64, 3-way parallel)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_TWOFISH_COMMON select CRYPTO_TWOFISH_X86_64 @@ -1913,6 +1942,7 @@ config CRYPTO_TWOFISH_X86_64_3WAY config CRYPTO_TWOFISH_AVX_X86_64 tristate "Twofish cipher algorithm (x86_64/AVX)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_SIMD select CRYPTO_TWOFISH_COMMON diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 94c889802566..2aaf02bfe6f7 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -76,6 +76,7 @@ config WIREGUARD tristate "WireGuard secure network tunnel" depends on NET && INET depends on IPV6 || !IPV6 + depends on !KMSAN # KMSAN doesn't support the crypto configs below select NET_UDP_TUNNEL select DST_CACHE select CRYPTO -- cgit v1.2.3