From fc99547a8bda22a6a489284641385d8dcfb3ecd8 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 25 May 2026 15:39:46 -0700 Subject: bpf: Factor out stack_map build ID helpers Factor out helpers from stack_map_get_build_id_offset() in preparation for adding a sleepable build ID resolution path: stack_map_build_id_set_ip(), stack_map_build_id_offset(), and stack_map_build_id_set_valid(). While here, refactor stack_map_get_build_id_offset(): * use continue-driven control flow in the main loop and remove build_id_valid label * update prev_vma and prev_build_id on the fall-back-to-IP branch so the cache reflects the actual VMA seen on the previous IP [1] * guard fetch_build_id() with vma_is_anonymous() [2] to skip parse attempts that would otherwise fail the ELF magic check [1] https://lore.kernel.org/bpf/CAEf4Bzac9uWWqBvzH0iFzKvJcq3vxscZ3pKm0sUHmN-F-z9wVQ@mail.gmail.com/ [2] https://lore.kernel.org/bpf/226398c1ff3f2b686c0aeb010408d85fb15df13f9ff60a045bee31e79b9e41e9@mail.kernel.org/ Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Acked-by: Mykyta Yatsenko Link: https://lore.kernel.org/bpf/20260525223948.1920986-2-ihor.solodrai@linux.dev --- kernel/bpf/stackmap.c | 57 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 17 deletions(-) (limited to 'kernel/bpf/stackmap.c') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index da3d328f5c15..e23be7d44503 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -152,6 +152,28 @@ static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, b : build_id_parse_nofault(vma, build_id, NULL); } +static inline void stack_map_build_id_set_ip(struct bpf_stack_build_id *id) +{ + id->status = BPF_STACK_BUILD_ID_IP; + memset(id->build_id, 0, BUILD_ID_SIZE_MAX); +} + +static inline u64 stack_map_build_id_offset(unsigned long vm_pgoff, + unsigned long vm_start, u64 ip) +{ + return (vm_pgoff << PAGE_SHIFT) + ip - vm_start; +} + +static inline void stack_map_build_id_set_valid(struct bpf_stack_build_id *id, + u64 offset, + const unsigned char *build_id) +{ + id->status = BPF_STACK_BUILD_ID_VALID; + id->offset = offset; + if (id->build_id != build_id) + memcpy(id->build_id, build_id, BUILD_ID_SIZE_MAX); +} + /* * Expects all id_offs[i].ip values to be set to correct initial IPs. * They will be subsequently: @@ -165,44 +187,45 @@ static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, b static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, u32 trace_nr, bool user, bool may_fault) { - int i; struct mmap_unlock_irq_work *work = NULL; bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work); + bool has_user_ctx = user && current && current->mm; struct vm_area_struct *vma, *prev_vma = NULL; - const char *prev_build_id; + const unsigned char *prev_build_id = NULL; + int i; /* If the irq_work is in use, fall back to report ips. Same * fallback is used for kernel stack (!user) on a stackmap with * build_id. */ - if (!user || !current || !current->mm || irq_work_busy || - !mmap_read_trylock(current->mm)) { + if (!has_user_ctx || irq_work_busy || !mmap_read_trylock(current->mm)) { /* cannot access current->mm, fall back to ips */ - for (i = 0; i < trace_nr; i++) { - id_offs[i].status = BPF_STACK_BUILD_ID_IP; - memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); - } + for (i = 0; i < trace_nr; i++) + stack_map_build_id_set_ip(&id_offs[i]); return; } for (i = 0; i < trace_nr; i++) { u64 ip = READ_ONCE(id_offs[i].ip); + u64 offset; - if (range_in_vma(prev_vma, ip, ip)) { + if (prev_build_id && range_in_vma(prev_vma, ip, ip)) { vma = prev_vma; - memcpy(id_offs[i].build_id, prev_build_id, BUILD_ID_SIZE_MAX); - goto build_id_valid; + offset = stack_map_build_id_offset(vma->vm_pgoff, vma->vm_start, ip); + stack_map_build_id_set_valid(&id_offs[i], offset, prev_build_id); + continue; } vma = find_vma(current->mm, ip); - if (!vma || fetch_build_id(vma, id_offs[i].build_id, may_fault)) { + if (!vma || vma_is_anonymous(vma) || + fetch_build_id(vma, id_offs[i].build_id, may_fault)) { /* per entry fall back to ips */ - id_offs[i].status = BPF_STACK_BUILD_ID_IP; - memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); + stack_map_build_id_set_ip(&id_offs[i]); + prev_vma = vma; + prev_build_id = NULL; continue; } -build_id_valid: - id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ip - vma->vm_start; - id_offs[i].status = BPF_STACK_BUILD_ID_VALID; + offset = stack_map_build_id_offset(vma->vm_pgoff, vma->vm_start, ip); + stack_map_build_id_set_valid(&id_offs[i], offset, id_offs[i].build_id); prev_vma = vma; prev_build_id = id_offs[i].build_id; } -- cgit v1.2.3 From fad3021faf7b0b64e9daea41c5662b65c8ad7379 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 25 May 2026 15:39:47 -0700 Subject: bpf: Avoid faultable build ID reads under mm locks Sleepable build ID parsing can block in __kernel_read() [1], so the stackmap sleepable path must not call it while holding mmap_lock or a per-VMA read lock. The issue and the fix are conceptually similar to a recent procfs patch [2]. A similar VMA locking pattern has already been used in PROCMAP_QUERY [3]. Resolve each covered VMA with a stable read-side reference, preferring lock_vma_under_rcu() and falling back to mmap_read_trylock() only long enough to acquire the VMA read lock. Take a reference to the backing file, drop the VMA lock, and then parse the build ID through (sleepable) build_id_parse_file(). We have to use mmap_read_trylock() (and give up on failure) in this context because taking mmap_read_lock() is generally unsafe on code paths reachable from BPF programs [4], and may lead to deadlocks. [1] https://lore.kernel.org/all/20251218005818.614819-1-shakeel.butt@linux.dev/ [2] https://lore.kernel.org/all/20260128183232.2854138-1-andrii@kernel.org/ [3] https://lore.kernel.org/all/20250808152850.2580887-1-surenb@google.com/ [4] https://lore.kernel.org/bpf/2895ecd8-df1e-4cc0-b9f9-aef893dc2360@linux.dev/ Fixes: d4dd9775ec24 ("bpf: wire up sleepable bpf_get_stack() and bpf_get_task_stack() helpers") Suggested-by: Puranjay Mohan Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260525223948.1920986-3-ihor.solodrai@linux.dev --- kernel/bpf/stackmap.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) (limited to 'kernel/bpf/stackmap.c') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index e23be7d44503..c53cfd9a67cf 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "percpu_freelist.h" #include "mmap_unlock_work.h" @@ -174,6 +175,109 @@ static inline void stack_map_build_id_set_valid(struct bpf_stack_build_id *id, memcpy(id->build_id, build_id, BUILD_ID_SIZE_MAX); } +struct stack_map_vma_lock { + struct vm_area_struct *vma; + struct mm_struct *mm; +}; + +/* + * Acquire a stable read-side reference on the VMA covering @ip. + * + * With CONFIG_PER_VMA_LOCK=y this returns a VMA with its per-VMA read + * lock held and mmap_lock dropped, so the caller may sleep. + * + * With CONFIG_PER_VMA_LOCK=n it returns a VMA with mmap_lock still + * held; the caller must snapshot any fields it needs and pin vm_file + * with get_file() before stack_map_unlock_vma() drops mmap_lock, as + * the VMA may be split, merged, or freed after that. + * + * Returns NULL on failure, in which case no lock is held. + */ +static struct vm_area_struct * +stack_map_lock_vma(struct stack_map_vma_lock *lock, unsigned long ip) +{ + struct mm_struct *mm = lock->mm; + struct vm_area_struct *vma; + + /* noop under !CONFIG_PER_VMA_LOCK */ + vma = lock_vma_under_rcu(mm, ip); + if (vma) { + lock->vma = vma; + return vma; + } + + /* + * Taking mmap_read_lock() is unsafe here, because the caller BPF + * program might already hold it, causing a deadlock. + */ + if (!mmap_read_trylock(mm)) + return NULL; + + vma = vma_lookup(mm, ip); + if (!vma) { + mmap_read_unlock(mm); + return NULL; + } + +#ifdef CONFIG_PER_VMA_LOCK + if (!vma_start_read_locked(vma)) { + mmap_read_unlock(mm); + return NULL; + } + mmap_read_unlock(mm); +#endif + + lock->vma = vma; + return vma; +} + +static void stack_map_unlock_vma(struct stack_map_vma_lock *lock) +{ +#ifdef CONFIG_PER_VMA_LOCK + vma_end_read(lock->vma); +#else + mmap_read_unlock(lock->mm); +#endif + lock->vma = NULL; +} + +static void stack_map_get_build_id_offset_sleepable(struct bpf_stack_build_id *id_offs, + u32 trace_nr) +{ + struct mm_struct *mm = current->mm; + struct stack_map_vma_lock lock = { .mm = mm }; + struct vm_area_struct *vma; + struct file *file; + u64 offset; + u64 ip; + + for (u32 i = 0; i < trace_nr; i++) { + ip = READ_ONCE(id_offs[i].ip); + + vma = stack_map_lock_vma(&lock, ip); + if (!vma) { + stack_map_build_id_set_ip(&id_offs[i]); + continue; + } + if (vma_is_anonymous(vma) || !vma->vm_file) { + stack_map_build_id_set_ip(&id_offs[i]); + stack_map_unlock_vma(&lock); + continue; + } + + file = get_file(vma->vm_file); + offset = stack_map_build_id_offset(vma->vm_pgoff, vma->vm_start, ip); + stack_map_unlock_vma(&lock); + + /* build_id_parse_file() may block on filesystem reads */ + if (build_id_parse_file(file, id_offs[i].build_id, NULL)) + stack_map_build_id_set_ip(&id_offs[i]); + else + stack_map_build_id_set_valid(&id_offs[i], offset, id_offs[i].build_id); + fput(file); + } +} + /* * Expects all id_offs[i].ip values to be set to correct initial IPs. * They will be subsequently: @@ -194,6 +298,11 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, const unsigned char *prev_build_id = NULL; int i; + if (may_fault && has_user_ctx) { + stack_map_get_build_id_offset_sleepable(id_offs, trace_nr); + return; + } + /* If the irq_work is in use, fall back to report ips. Same * fallback is used for kernel stack (!user) on a stackmap with * build_id. -- cgit v1.2.3 From 5e9099d8ff24ec32aa5af872a4d84d086bd70579 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 25 May 2026 15:39:48 -0700 Subject: bpf: Cache build IDs in sleepable stackmap path Stack traces often contain adjacent IPs from the same VMA or from different VMAs backed by the same ELF file. Cache the last successfully parsed build id together with the resolved VMA range and backing file so the sleepable build id path can avoid repeated VMA locking and file parsing in common cases. Suggested-by: Mykyta Yatsenko Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Acked-by: Mykyta Yatsenko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260525223948.1920986-4-ihor.solodrai@linux.dev --- kernel/bpf/stackmap.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 55 insertions(+), 6 deletions(-) (limited to 'kernel/bpf/stackmap.c') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index c53cfd9a67cf..77ba03216c09 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -246,6 +246,14 @@ static void stack_map_get_build_id_offset_sleepable(struct bpf_stack_build_id *i { struct mm_struct *mm = current->mm; struct stack_map_vma_lock lock = { .mm = mm }; + struct { + struct file *file; + const unsigned char *build_id; + unsigned long vm_start; + unsigned long vm_end; + unsigned long vm_pgoff; + } cache = {}; + unsigned long vm_pgoff, vm_start, vm_end; struct vm_area_struct *vma; struct file *file; u64 offset; @@ -254,6 +262,17 @@ static void stack_map_get_build_id_offset_sleepable(struct bpf_stack_build_id *i for (u32 i = 0; i < trace_nr; i++) { ip = READ_ONCE(id_offs[i].ip); + /* + * Range cache fast path: if ip falls within the previously + * resolved VMA range, reuse the cache build_id without + * re-acquiring the VMA lock. + */ + if (cache.build_id && ip >= cache.vm_start && ip < cache.vm_end) { + offset = stack_map_build_id_offset(cache.vm_pgoff, cache.vm_start, ip); + stack_map_build_id_set_valid(&id_offs[i], offset, cache.build_id); + continue; + } + vma = stack_map_lock_vma(&lock, ip); if (!vma) { stack_map_build_id_set_ip(&id_offs[i]); @@ -265,17 +284,47 @@ static void stack_map_get_build_id_offset_sleepable(struct bpf_stack_build_id *i continue; } - file = get_file(vma->vm_file); - offset = stack_map_build_id_offset(vma->vm_pgoff, vma->vm_start, ip); + file = vma->vm_file; + vm_pgoff = vma->vm_pgoff; + vm_start = vma->vm_start; + vm_end = vma->vm_end; + offset = stack_map_build_id_offset(vm_pgoff, vm_start, ip); + + /* + * Same backing file as previous (e.g. different VMAs + * of the same ELF binary). Reuse the cache build_id. + */ + if (file == cache.file) { + stack_map_unlock_vma(&lock); + stack_map_build_id_set_valid(&id_offs[i], offset, cache.build_id); + cache.vm_start = vm_start; + cache.vm_end = vm_end; + cache.vm_pgoff = vm_pgoff; + continue; + } + + file = get_file(file); stack_map_unlock_vma(&lock); /* build_id_parse_file() may block on filesystem reads */ - if (build_id_parse_file(file, id_offs[i].build_id, NULL)) + if (build_id_parse_file(file, id_offs[i].build_id, NULL)) { stack_map_build_id_set_ip(&id_offs[i]); - else - stack_map_build_id_set_valid(&id_offs[i], offset, id_offs[i].build_id); - fput(file); + fput(file); + continue; + } + + stack_map_build_id_set_valid(&id_offs[i], offset, id_offs[i].build_id); + if (cache.file) + fput(cache.file); + cache.file = file; + cache.build_id = id_offs[i].build_id; + cache.vm_start = vm_start; + cache.vm_end = vm_end; + cache.vm_pgoff = vm_pgoff; } + + if (cache.file) + fput(cache.file); } /* -- cgit v1.2.3