diff options
Diffstat (limited to 'kernel/bpf')
| -rw-r--r-- | kernel/bpf/Makefile | 2 | ||||
| -rw-r--r-- | kernel/bpf/arena.c | 315 | ||||
| -rw-r--r-- | kernel/bpf/arraymap.c | 14 | ||||
| -rw-r--r-- | kernel/bpf/backtrack.c | 82 | ||||
| -rw-r--r-- | kernel/bpf/bpf_lru_list.c | 165 | ||||
| -rw-r--r-- | kernel/bpf/bpf_lru_list.h | 25 | ||||
| -rw-r--r-- | kernel/bpf/bpf_lsm.c | 20 | ||||
| -rw-r--r-- | kernel/bpf/bpf_struct_ops.c | 63 | ||||
| -rw-r--r-- | kernel/bpf/btf.c | 318 | ||||
| -rw-r--r-- | kernel/bpf/cgroup.c | 65 | ||||
| -rw-r--r-- | kernel/bpf/cnum.c | 120 | ||||
| -rw-r--r-- | kernel/bpf/cnum_defs.h | 247 | ||||
| -rw-r--r-- | kernel/bpf/const_fold.c | 8 | ||||
| -rw-r--r-- | kernel/bpf/core.c | 27 | ||||
| -rw-r--r-- | kernel/bpf/devmap.c | 19 | ||||
| -rw-r--r-- | kernel/bpf/fixups.c | 27 | ||||
| -rw-r--r-- | kernel/bpf/hashtab.c | 840 | ||||
| -rw-r--r-- | kernel/bpf/helpers.c | 204 | ||||
| -rw-r--r-- | kernel/bpf/inode.c | 260 | ||||
| -rw-r--r-- | kernel/bpf/liveness.c | 183 | ||||
| -rw-r--r-- | kernel/bpf/log.c | 132 | ||||
| -rw-r--r-- | kernel/bpf/lpm_trie.c | 8 | ||||
| -rw-r--r-- | kernel/bpf/map_in_map.c | 5 | ||||
| -rw-r--r-- | kernel/bpf/map_iter.c | 7 | ||||
| -rw-r--r-- | kernel/bpf/stackmap.c | 215 | ||||
| -rw-r--r-- | kernel/bpf/states.c | 67 | ||||
| -rw-r--r-- | kernel/bpf/syscall.c | 312 | ||||
| -rw-r--r-- | kernel/bpf/trampoline.c | 671 | ||||
| -rw-r--r-- | kernel/bpf/verifier.c | 4146 |
29 files changed, 5670 insertions, 2897 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 399007b67a92..4dc41bf5780c 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse endif CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o const_fold.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o cnum.o log.o token.o liveness.o const_fold.o obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o bpf_insn_array.o diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 49a8f7b1beef..af49c154473d 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -53,12 +53,15 @@ struct bpf_arena { u64 user_vm_start; u64 user_vm_end; struct vm_struct *kern_vm; + struct page *scratch_page; struct range_tree rt; /* protects rt */ rqspinlock_t spinlock; struct list_head vma_list; /* protects vma_list */ struct mutex lock; + u64 zap_gen; + struct mutex zap_mutex; struct irq_work free_irq; struct work_struct free_work; struct llist_head free_spans; @@ -83,6 +86,32 @@ u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena) return arena ? arena->user_vm_start : 0; } +/** + * bpf_arena_map_kern_vm_start - kern_vm_start lookup by struct bpf_map * + * @map: a BPF_MAP_TYPE_ARENA map + * + * Return @map's kern_vm_start. + */ +u64 bpf_arena_map_kern_vm_start(struct bpf_map *map) +{ + return bpf_arena_get_kern_vm_start(container_of(map, struct bpf_arena, map)); +} + +/** + * bpf_prog_arena - return the bpf_map of the arena referenced by @prog + * @prog: a loaded BPF program + * + * The verifier enforces at most one arena per program and stores it in + * prog->aux->arena. 
Return that arena's underlying bpf_map, or NULL if + * @prog does not reference an arena. + */ +struct bpf_map *bpf_prog_arena(struct bpf_prog *prog) +{ + struct bpf_arena *arena = prog->aux->arena; + + return arena ? &arena->map : NULL; +} + static long arena_map_peek_elem(struct bpf_map *map, void *value) { return -EOPNOTSUPP; @@ -115,26 +144,57 @@ static long compute_pgoff(struct bpf_arena *arena, long uaddr) struct apply_range_data { struct page **pages; + struct page *scratch_page; int i; }; +struct clear_range_data { + struct llist_head *free_pages; + struct page *scratch_page; +}; + static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data) { struct apply_range_data *d = data; struct page *page; + pte_t pteval; if (!data) return 0; - /* sanity check */ - if (unlikely(!pte_none(ptep_get(pte)))) - return -EBUSY; page = d->pages[d->i]; /* paranoia, similar to vmap_pages_pte_range() */ if (WARN_ON_ONCE(!pfn_valid(page_to_pfn(page)))) return -EINVAL; - set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL)); + pteval = mk_pte(page, PAGE_KERNEL); +#ifdef ptep_try_set + /* + * Kernel-fault recovery may have installed the scratch page here, and + * some architectures (arm64) prohibit valid->valid PTE transitions. + * Install atomically into a none slot. If scratch is present, clear it + * and flush_tlb_before_set() (break-before-make) before retrying. + */ + while (!ptep_try_set(pte, pteval)) { + pte_t old = ptep_get(pte); + + if (pte_none(old)) + continue; + if (WARN_ON_ONCE(pte_page(old) != d->scratch_page)) + return -EBUSY; + ptep_get_and_clear(&init_mm, addr, pte); + flush_tlb_before_set(addr); + } +#else + /* + * Without ptep_try_set() there is no atomic installer, but such arches + * also do not wire up bpf_arena_handle_page_fault(), so no scratch page + * is ever installed and the slot is always none here. 
+ */ + if (unlikely(!pte_none(ptep_get(pte)))) + return -EBUSY; + set_pte_at(&init_mm, addr, pte, pteval); +#endif d->i++; return 0; } @@ -144,33 +204,59 @@ static void flush_vmap_cache(unsigned long start, unsigned long size) flush_cache_vmap(start, start + size); } -static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *free_pages) +static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data) { + struct clear_range_data *d = data; pte_t old_pte; struct page *page; - /* sanity check */ - old_pte = ptep_get(pte); + /* + * Pairs with ptep_try_set() in the kernel-fault scratch installer. + * Both sides must be atomic. + */ + old_pte = ptep_get_and_clear(&init_mm, addr, pte); if (pte_none(old_pte) || !pte_present(old_pte)) - return 0; /* nothing to do */ + return 0; page = pte_page(old_pte); if (WARN_ON_ONCE(!page)) return -EINVAL; - pte_clear(&init_mm, addr, pte); + /* + * Skip the per-arena scratch page. A kernel fault on an unallocated uaddr + * scratches its PTE. A later bpf_arena_free_pages() over that range walks + * here. Without the skip, scratch_page would be freed. + */ + if (page == d->scratch_page) + return 0; + + __llist_add(&page->pcp_llist, d->free_pages); + return 0; +} - /* Add page to the list so it is freed later */ - if (free_pages) - __llist_add(&page->pcp_llist, free_pages); +static int apply_range_set_scratch_cb(pte_t *pte, unsigned long addr, void *data) +{ + struct page *scratch_page = data; + if (!pte_none(ptep_get(pte))) + return 0; + /* + * Best-effort install. ptep_try_set() returns false only if another + * installer (real allocation or concurrent fault) won the cmpxchg. + * Their PTE is already valid, so the access retry succeeds. + * + * No flush_tlb_kernel_range() needed. Stale "not mapped" entries just + * cause one extra re-fault through this same path. 
+ */ + ptep_try_set(pte, mk_pte(scratch_page, PAGE_KERNEL)); return 0; } static int populate_pgtable_except_pte(struct bpf_arena *arena) { + /* Populate intermediates for the recovery range (4 GiB + upper half-guard). */ return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena), - KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL); + SZ_4G + GUARD_SZ / 2, apply_range_set_cb, NULL); } static struct bpf_map *arena_map_alloc(union bpf_attr *attr) @@ -221,22 +307,30 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr) init_irq_work(&arena->free_irq, arena_free_irq); INIT_WORK(&arena->free_work, arena_free_worker); bpf_map_init_from_attr(&arena->map, attr); + + err = bpf_map_alloc_pages(&arena->map, NUMA_NO_NODE, 1, &arena->scratch_page); + if (err) + goto err_free_arena; + range_tree_init(&arena->rt); err = range_tree_set(&arena->rt, 0, attr->max_entries); - if (err) { - bpf_map_area_free(arena); - goto err; - } + if (err) + goto err_free_scratch; mutex_init(&arena->lock); + mutex_init(&arena->zap_mutex); raw_res_spin_lock_init(&arena->spinlock); err = populate_pgtable_except_pte(arena); - if (err) { - range_tree_destroy(&arena->rt); - bpf_map_area_free(arena); - goto err; - } + if (err) + goto err_destroy_rt; return &arena->map; + +err_destroy_rt: + range_tree_destroy(&arena->rt); +err_free_scratch: + __free_page(arena->scratch_page); +err_free_arena: + bpf_map_area_free(arena); err: free_vm_area(kern_vm); return ERR_PTR(err); @@ -244,6 +338,7 @@ err: static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data) { + struct bpf_arena *arena = data; struct page *page; pte_t pte; @@ -252,6 +347,12 @@ static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data) return 0; page = pte_page(pte); /* + * Skip the scratch page. The walk is page-table-driven, not range-tree-driven, + * so it can visit scratch PTEs at uaddrs the BPF program never allocated. 
+ */ + if (page == arena->scratch_page) + return 0; + /* * We do not update pte here: * 1. Nobody should be accessing bpf_arena's range outside of a kernel bug * 2. TLB flushing is batched or deferred. Even if we clear pte, @@ -286,9 +387,10 @@ static void arena_map_free(struct bpf_map *map) * free those pages. */ apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena), - KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL); + SZ_4G + GUARD_SZ / 2, existing_page_cb, arena); free_vm_area(arena->kern_vm); range_tree_destroy(&arena->rt); + __free_page(arena->scratch_page); bpf_map_area_free(arena); } @@ -318,6 +420,7 @@ struct vma_list { struct vm_area_struct *vma; struct list_head head; refcount_t mmap_count; + u64 zap_gen; }; static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma) @@ -330,6 +433,7 @@ static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma) refcount_set(&vml->mmap_count, 1); vma->vm_private_data = vml; vml->vma = vma; + vml->zap_gen = 0; list_add(&vml->head, &arena->vma_list); return 0; } @@ -384,33 +488,38 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) return VM_FAULT_RETRY; page = vmalloc_to_page((void *)kaddr); - if (page) + if (page) { + if (page == arena->scratch_page) + /* BPF triggered scratch here; don't lazy-alloc over it */ + goto out_sigsegv; /* already have a page vmap-ed */ goto out; + } bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT) /* User space requested to segfault when page is not allocated by bpf prog */ - goto out_unlock_sigsegv; + goto out_sigsegv_memcg; ret = range_tree_clear(&arena->rt, vmf->pgoff, 1); if (ret) - goto out_unlock_sigsegv; + goto out_sigsegv_memcg; - struct apply_range_data data = { .pages = &page, .i = 0 }; + struct apply_range_data data = { .pages = &page, .i = 0, + .scratch_page = arena->scratch_page }; /* Account into memcg of the process that created bpf_arena */ ret = 
bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page); if (ret) { range_tree_set(&arena->rt, vmf->pgoff, 1); - goto out_unlock_sigsegv; + goto out_sigsegv_memcg; } ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data); if (ret) { range_tree_set(&arena->rt, vmf->pgoff, 1); free_pages_nolock(page, 0); - goto out_unlock_sigsegv; + goto out_sigsegv_memcg; } flush_vmap_cache(kaddr, PAGE_SIZE); bpf_map_memcg_exit(old_memcg, new_memcg); @@ -419,8 +528,9 @@ out: raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); vmf->page = page; return 0; -out_unlock_sigsegv: +out_sigsegv_memcg: bpf_map_memcg_exit(old_memcg, new_memcg); +out_sigsegv: raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); return VM_FAULT_SIGSEGV; } @@ -587,6 +697,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt return 0; } data.pages = pages; + data.scratch_page = arena->scratch_page; if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) goto out_free_pages; @@ -668,12 +779,60 @@ out_free_pages: */ static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt) { + unsigned long size = (unsigned long)page_cnt << PAGE_SHIFT; + struct vm_area_struct *vma; + struct mm_struct *mm; struct vma_list *vml; + unsigned long vm_start; + u64 my_gen; - guard(mutex)(&arena->lock); - /* iterate link list under lock */ - list_for_each_entry(vml, &arena->vma_list, head) - zap_vma_range(vml->vma, uaddr, PAGE_SIZE * page_cnt); + /* + * Taking mmap_read_lock() under arena->lock would deadlock against + * arena_vm_close(), which runs with mmap_write_lock held and then + * acquires arena->lock. Drop arena->lock for mmap_read_lock(). + * + * Use per-call my_gen, recorded in vml->zap_gen, to remember which + * vmls this invocation has already processed across the lock drop. 
+ * Hold zap_mutex around the whole walk so concurrent zap_pages() + * callers cannot overwrite each other's marks on shared vmls -- + * otherwise call B's mark would make call A skip a vml that A has + * not yet zapped for A's uaddr range. + */ + mutex_lock(&arena->zap_mutex); + mutex_lock(&arena->lock); + my_gen = ++arena->zap_gen; + for (;;) { + mm = NULL; + list_for_each_entry(vml, &arena->vma_list, head) { + if (vml->zap_gen >= my_gen) + continue; + vml->zap_gen = my_gen; + if (!mmget_not_zero(vml->vma->vm_mm)) + continue; + mm = vml->vma->vm_mm; + vm_start = vml->vma->vm_start; + break; + } + if (!mm) + break; + mutex_unlock(&arena->lock); + + mmap_read_lock(mm); + /* + * Re-resolve: while we waited the VMA could have been unmapped + * and a different mapping installed at the same address. + */ + vma = find_vma(mm, vm_start); + if (vma && vma->vm_start == vm_start && + vma->vm_file && vma->vm_file->private_data == &arena->map) + zap_vma_range(vma, uaddr, size); + mmap_read_unlock(mm); + mmput(mm); + + mutex_lock(&arena->lock); + } + mutex_unlock(&arena->lock); + mutex_unlock(&arena->zap_mutex); } static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable) @@ -685,6 +844,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, struct llist_head free_pages; struct llist_node *pos, *t; struct arena_free_span *s; + struct clear_range_data cdata; unsigned long flags; int ret = 0; @@ -713,9 +873,11 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, range_tree_set(&arena->rt, pgoff, page_cnt); init_llist_head(&free_pages); + cdata.free_pages = &free_pages; + cdata.scratch_page = arena->scratch_page; /* clear ptes and collect struct pages */ apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT, - apply_range_clear_cb, &free_pages); + apply_range_clear_cb, &cdata); /* drop the lock to do the tlb flush and zap pages */ 
raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); @@ -805,6 +967,7 @@ static void arena_free_worker(struct work_struct *work) struct arena_free_span *s; u64 arena_vm_start, user_vm_start; struct llist_head free_pages; + struct clear_range_data cdata; struct page *page; unsigned long full_uaddr; long kaddr, page_cnt, pgoff; @@ -818,6 +981,8 @@ static void arena_free_worker(struct work_struct *work) bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); init_llist_head(&free_pages); + cdata.free_pages = &free_pages; + cdata.scratch_page = arena->scratch_page; arena_vm_start = bpf_arena_get_kern_vm_start(arena); user_vm_start = bpf_arena_get_user_vm_start(arena); @@ -830,7 +995,7 @@ static void arena_free_worker(struct work_struct *work) /* clear ptes and collect pages in free_pages llist */ apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT, - apply_range_clear_cb, &free_pages); + apply_range_clear_cb, &cdata); range_tree_set(&arena->rt, pgoff, page_cnt); } @@ -893,6 +1058,19 @@ void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 pag return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false); } + +void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt, + int node_id, u64 flags) +{ + struct bpf_map *map = p__map; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt) + return NULL; + + return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true); +} + __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt) { struct bpf_map *map = p__map; @@ -945,23 +1123,12 @@ static int __init kfunc_init(void) } late_initcall(kfunc_init); -void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip) +static void __bpf_prog_report_arena_violation(struct bpf_prog *prog, bool write, + unsigned long addr, unsigned long 
fault_ip) { struct bpf_stream_stage ss; - struct bpf_prog *prog; u64 user_vm_start; - /* - * The RCU read lock is held to safely traverse the latch tree, but we - * don't need its protection when accessing the prog, since it will not - * disappear while we are handling the fault. - */ - rcu_read_lock(); - prog = bpf_prog_ksym_find(fault_ip); - rcu_read_unlock(); - if (!prog) - return; - /* Use main prog for stream access */ prog = prog->aux->main_prog_aux->prog; @@ -974,3 +1141,53 @@ void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned lo bpf_stream_dump_stack(ss); })); } + +bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip) +{ + struct bpf_arena *arena; + struct bpf_prog *prog; + unsigned long kbase; + unsigned long page_addr = addr & PAGE_MASK; + + prog = bpf_prog_find_from_stack(); + if (!prog) + return false; + + arena = prog->aux->arena; + /* a prog not using arena may be on stack, so arena can be NULL */ + if (!arena) + return false; + + kbase = bpf_arena_get_kern_vm_start(arena); + + /* + * Recovery covers the 4 GiB mappable band plus the upper half-guard. + * Lower guard is unreachable from kfuncs; an address there indicates + * a different bug class - leave it to the regular kernel oops path. + */ + if (page_addr < kbase || page_addr >= kbase + SZ_4G + GUARD_SZ / 2) + return false; + + apply_to_page_range(&init_mm, page_addr, PAGE_SIZE, + apply_range_set_scratch_cb, arena->scratch_page); + flush_vmap_cache(page_addr, PAGE_SIZE); + __bpf_prog_report_arena_violation(prog, is_write, page_addr - kbase, fault_ip); + return true; +} + +void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip) +{ + struct bpf_prog *prog; + + /* + * The RCU read lock is held to safely traverse the latch tree, but we + * don't need its protection when accessing the prog, since it will not + * disappear while we are handling the fault. 
+ */ + rcu_read_lock(); + prog = bpf_prog_ksym_find(fault_ip); + rcu_read_unlock(); + if (!prog) + return; + __bpf_prog_report_arena_violation(prog, write, addr, fault_ip); +} diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index dfb2110ab733..248b4818178c 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -175,14 +175,12 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) return array->value + (u64)array->elem_size * (index & array->index_mask); } -static int array_map_get_hash(struct bpf_map *map, u32 hash_buf_size, - void *hash_buf) +static int array_map_get_hash(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); sha256(array->value, (u64)array->elem_size * array->map.max_entries, - hash_buf); - memcpy(array->map.sha, hash_buf, sizeof(array->map.sha)); + array->map.sha); return 0; } @@ -386,7 +384,7 @@ static long array_map_update_elem(struct bpf_map *map, void *key, void *value, if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { val = this_cpu_ptr(array->pptrs[index & array->index_mask]); copy_map_value(map, val, value); - bpf_obj_free_fields(array->map.record, val); + bpf_obj_cancel_fields(map, val); } else { val = array->value + (u64)array->elem_size * (index & array->index_mask); @@ -394,7 +392,7 @@ static long array_map_update_elem(struct bpf_map *map, void *key, void *value, copy_map_value_locked(map, val, value, false); else copy_map_value(map, val, value); - bpf_obj_free_fields(array->map.record, val); + bpf_obj_cancel_fields(map, val); } return 0; } @@ -434,14 +432,14 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, cpu = map_flags >> 32; ptr = per_cpu_ptr(pptr, cpu); copy_map_value(map, ptr, value); - bpf_obj_free_fields(array->map.record, ptr); + bpf_obj_cancel_fields(map, ptr); goto unlock; } for_each_possible_cpu(cpu) { ptr = per_cpu_ptr(pptr, cpu); val = (map_flags & BPF_F_ALL_CPUS) ? 
value : value + size * cpu; copy_map_value(map, ptr, val); - bpf_obj_free_fields(array->map.record, ptr); + bpf_obj_cancel_fields(map, ptr); } unlock: rcu_read_unlock(); diff --git a/kernel/bpf/backtrack.c b/kernel/bpf/backtrack.c index 854731dc93fe..2e4ae0ef0860 100644 --- a/kernel/bpf/backtrack.c +++ b/kernel/bpf/backtrack.c @@ -9,7 +9,7 @@ /* for any branch, call, exit record the history of jmps in the given state */ int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, - int insn_flags, u64 linked_regs) + int insn_flags, int spi, int frame, u64 linked_regs) { u32 cnt = cur->jmp_history_cnt; struct bpf_jmp_history_entry *p; @@ -25,6 +25,8 @@ int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state env, "insn history: insn_idx %d cur flags %x new flags %x", env->insn_idx, env->cur_hist_ent->flags, insn_flags); env->cur_hist_ent->flags |= insn_flags; + env->cur_hist_ent->spi = spi; + env->cur_hist_ent->frame = frame; verifier_bug_if(env->cur_hist_ent->linked_regs != 0, env, "insn history: insn_idx %d linked_regs: %#llx", env->insn_idx, env->cur_hist_ent->linked_regs); @@ -43,6 +45,8 @@ int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state p->idx = env->insn_idx; p->prev_idx = env->prev_insn_idx; p->flags = insn_flags; + p->spi = spi; + p->frame = frame; p->linked_regs = linked_regs; cur->jmp_history_cnt = cnt; env->cur_hist_ent = p; @@ -64,16 +68,6 @@ static bool is_atomic_fetch_insn(const struct bpf_insn *insn) (insn->imm & BPF_FETCH); } -static int insn_stack_access_spi(int insn_flags) -{ - return (insn_flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK; -} - -static int insn_stack_access_frameno(int insn_flags) -{ - return insn_flags & INSN_F_FRAMENO_MASK; -} - /* Backtrack one insn at a time. If idx is not at the top of recorded * history then previous instruction came from straight line execution. * Return -ENOENT if we exhausted all instructions within given state. 
@@ -135,11 +129,21 @@ static inline u32 bt_empty(struct backtrack_state *bt) int i; for (i = 0; i <= bt->frame; i++) - mask |= bt->reg_masks[i] | bt->stack_masks[i]; + mask |= bt->reg_masks[i] | bt->stack_masks[i] | bt->stack_arg_masks[i]; return mask == 0; } +static inline void bt_clear_frame_stack_arg_slot(struct backtrack_state *bt, u32 frame, u32 slot) +{ + bt->stack_arg_masks[frame] &= ~(1 << slot); +} + +static inline bool bt_is_frame_stack_arg_slot_set(struct backtrack_state *bt, u32 frame, u32 slot) +{ + return bt->stack_arg_masks[frame] & (1 << slot); +} + static inline int bt_subprog_enter(struct backtrack_state *bt) { if (bt->frame == MAX_CALL_FRAMES - 1) { @@ -200,6 +204,11 @@ static inline u64 bt_stack_mask(struct backtrack_state *bt) return bt->stack_masks[bt->frame]; } +static inline u8 bt_stack_arg_mask(struct backtrack_state *bt) +{ + return bt->stack_arg_masks[bt->frame]; +} + static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg) { return bt->reg_masks[bt->frame] & (1 << reg); @@ -341,6 +350,19 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, return 0; bt_clear_reg(bt, load_reg); + if (hist && hist->flags & INSN_F_STACK_ARG_ACCESS) { + spi = hist->spi; + /* + * Stack arg read: callee reads from r11+off, but + * the data lives in the caller's stack_arg_regs. + * Set the mask in the caller frame so precision + * is marked in the caller's slot at the callee + * entry checkpoint. + */ + bt_set_frame_stack_arg_slot(bt, bt->frame - 1, spi); + return 0; + } + /* scalars can only be spilled into stack w/o losing precision. * Load from any other memory can be zero extended. 
* The desire to keep that precision is already indicated @@ -353,8 +375,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * that [fp - off] slot contains scalar that needs to be * tracked with precision */ - spi = insn_stack_access_spi(hist->flags); - fr = insn_stack_access_frameno(hist->flags); + spi = hist->spi; + fr = hist->frame; bpf_bt_set_frame_slot(bt, fr, spi); } else if (class == BPF_STX || class == BPF_ST) { if (bt_is_reg_set(bt, dreg)) @@ -363,11 +385,22 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * encountered a case of pointer subtraction. */ return -ENOTSUPP; + + if (hist && hist->flags & INSN_F_STACK_ARG_ACCESS) { + spi = hist->spi; + if (!bt_is_frame_stack_arg_slot_set(bt, bt->frame, spi)) + return 0; + bt_clear_frame_stack_arg_slot(bt, bt->frame, spi); + if (class == BPF_STX) + bt_set_reg(bt, sreg); + return 0; + } + /* scalars can only be spilled into stack */ if (!hist || !(hist->flags & INSN_F_STACK_ACCESS)) return 0; - spi = insn_stack_access_spi(hist->flags); - fr = insn_stack_access_frameno(hist->flags); + spi = hist->spi; + fr = hist->frame; if (!bt_is_frame_slot_set(bt, fr, spi)) return 0; bt_clear_frame_slot(bt, fr, spi); @@ -431,6 +464,12 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, bpf_bt_set_frame_reg(bt, bt->frame - 1, i); } } + if (bt_stack_arg_mask(bt)) { + verifier_bug(env, + "static subprog leftover stack arg slots %x", + bt_stack_arg_mask(bt)); + return -EFAULT; + } if (bt_subprog_exit(bt)) return -EFAULT; return 0; @@ -901,6 +940,17 @@ int bpf_mark_chain_precision(struct bpf_verifier_env *env, *changed = true; } } + for (i = 0; i < func->out_stack_arg_cnt; i++) { + if (!bt_is_frame_stack_arg_slot_set(bt, fr, i)) + continue; + reg = &func->stack_arg_regs[i]; + if (reg->type != SCALAR_VALUE || reg->precise) { + bt_clear_frame_stack_arg_slot(bt, fr, i); + } else { + reg->precise = true; + *changed = true; + } + } if 
(env->log.level & BPF_LOG_LEVEL2) { fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_frame_reg_mask(bt, fr)); diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index e7a2fc60523f..5ed7cb4b98c0 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -13,23 +13,8 @@ #define PERCPU_FREE_TARGET (4) #define PERCPU_NR_SCANS PERCPU_FREE_TARGET -/* Helpers to get the local list index */ -#define LOCAL_LIST_IDX(t) ((t) - BPF_LOCAL_LIST_T_OFFSET) -#define LOCAL_FREE_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE) -#define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING) #define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET) -/* Local list helpers */ -static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l) -{ - return &loc_l->lists[LOCAL_FREE_LIST_IDX]; -} - -static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l) -{ - return &loc_l->lists[LOCAL_PENDING_LIST_IDX]; -} - /* bpf_lru_node helpers */ static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node) { @@ -72,6 +57,7 @@ static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l, bpf_lru_list_count_dec(l, node->type); node->type = tgt_free_type; + WRITE_ONCE(node->pending_free, 0); list_move(&node->list, free_list); } @@ -87,6 +73,9 @@ static void __bpf_lru_node_move_in(struct bpf_lru_list *l, bpf_lru_list_count_inc(l, tgt_type); node->type = tgt_type; bpf_lru_node_clear_ref(node); + /* Reset pending_free only when moving to the free list */ + if (tgt_type == BPF_LRU_LIST_T_FREE) + WRITE_ONCE(node->pending_free, 0); list_move(&node->list, &l->lists[tgt_type]); } @@ -212,9 +201,11 @@ __bpf_lru_list_shrink_inactive(struct bpf_lru *lru, unsigned int i = 0; list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) { - if (bpf_lru_node_is_ref(node)) { + if (bpf_lru_node_is_ref(node) && + !READ_ONCE(node->pending_free)) { __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); - } else if 
(lru->del_from_htab(lru->del_arg, node)) { + } else if (READ_ONCE(node->pending_free) || + lru->del_from_htab(lru->del_arg, node)) { __bpf_lru_node_move_to_free(l, node, free_list, tgt_free_type); if (++nshrinked == tgt_nshrink) @@ -273,7 +264,8 @@ static unsigned int __bpf_lru_list_shrink(struct bpf_lru *lru, list_for_each_entry_safe_reverse(node, tmp_node, force_shrink_list, list) { - if (lru->del_from_htab(lru->del_arg, node)) { + if (READ_ONCE(node->pending_free) || + lru->del_from_htab(lru->del_arg, node)) { __bpf_lru_node_move_to_free(l, node, free_list, tgt_free_type); return 1; @@ -290,8 +282,10 @@ static void __local_list_flush(struct bpf_lru_list *l, struct bpf_lru_node *node, *tmp_node; list_for_each_entry_safe_reverse(node, tmp_node, - local_pending_list(loc_l), list) { - if (bpf_lru_node_is_ref(node)) + &loc_l->pending_list, list) { + if (READ_ONCE(node->pending_free)) + __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_FREE); + else if (bpf_lru_node_is_ref(node)) __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_ACTIVE); else __bpf_lru_node_move_in(l, node, @@ -307,9 +301,12 @@ static void bpf_lru_list_push_free(struct bpf_lru_list *l, if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) return; - raw_spin_lock_irqsave(&l->lock, flags); + if (raw_res_spin_lock_irqsave(&l->lock, flags)) { + WRITE_ONCE(node->pending_free, 1); + return; + } __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); - raw_spin_unlock_irqrestore(&l->lock, flags); + raw_res_spin_unlock_irqrestore(&l->lock, flags); } static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, @@ -318,8 +315,10 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, struct bpf_lru_list *l = &lru->common_lru.lru_list; struct bpf_lru_node *node, *tmp_node; unsigned int nfree = 0; + LIST_HEAD(tmp_free); - raw_spin_lock(&l->lock); + if (raw_res_spin_lock(&l->lock)) + return; __local_list_flush(l, loc_l); @@ -327,7 +326,7 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, 
list_for_each_entry_safe(node, tmp_node, &l->lists[BPF_LRU_LIST_T_FREE], list) { - __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l), + __bpf_lru_node_move_to_free(l, node, &tmp_free, BPF_LRU_LOCAL_LIST_T_FREE); if (++nfree == lru->target_free) break; @@ -335,10 +334,19 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, if (nfree < lru->target_free) __bpf_lru_list_shrink(lru, l, lru->target_free - nfree, - local_free_list(loc_l), + &tmp_free, BPF_LRU_LOCAL_LIST_T_FREE); - raw_spin_unlock(&l->lock); + raw_res_spin_unlock(&l->lock); + + /* + * Transfer the harvested nodes from the temporary list_head into + * the lockless per-CPU free llist. + */ + list_for_each_entry_safe(node, tmp_node, &tmp_free, list) { + list_del(&node->list); + llist_add(&node->llist, &loc_l->free_llist); + } } static void __local_list_add_pending(struct bpf_lru *lru, @@ -350,22 +358,21 @@ static void __local_list_add_pending(struct bpf_lru *lru, *(u32 *)((void *)node + lru->hash_offset) = hash; node->cpu = cpu; node->type = BPF_LRU_LOCAL_LIST_T_PENDING; + WRITE_ONCE(node->pending_free, 0); bpf_lru_node_clear_ref(node); - list_add(&node->list, local_pending_list(loc_l)); + list_add(&node->list, &loc_l->pending_list); } static struct bpf_lru_node * __local_list_pop_free(struct bpf_lru_locallist *loc_l) { - struct bpf_lru_node *node; + struct llist_node *llnode; - node = list_first_entry_or_null(local_free_list(loc_l), - struct bpf_lru_node, - list); - if (node) - list_del(&node->list); + llnode = llist_del_first(&loc_l->free_llist); + if (!llnode) + return NULL; - return node; + return container_of(llnode, struct bpf_lru_node, llist); } static struct bpf_lru_node * @@ -376,10 +383,10 @@ __local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l) ignore_ref: /* Get from the tail (i.e. older element) of the pending list. 
*/ - list_for_each_entry_reverse(node, local_pending_list(loc_l), - list) { + list_for_each_entry_reverse(node, &loc_l->pending_list, list) { if ((!bpf_lru_node_is_ref(node) || force) && - lru->del_from_htab(lru->del_arg, node)) { + (READ_ONCE(node->pending_free) || + lru->del_from_htab(lru->del_arg, node))) { list_del(&node->list); return node; } @@ -404,7 +411,8 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru, l = per_cpu_ptr(lru->percpu_lru, cpu); - raw_spin_lock_irqsave(&l->lock, flags); + if (raw_res_spin_lock_irqsave(&l->lock, flags)) + return NULL; __bpf_lru_list_rotate(lru, l); @@ -420,7 +428,7 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru, __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); } - raw_spin_unlock_irqrestore(&l->lock, flags); + raw_res_spin_unlock_irqrestore(&l->lock, flags); return node; } @@ -437,7 +445,8 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, loc_l = per_cpu_ptr(clru->local_list, cpu); - raw_spin_lock_irqsave(&loc_l->lock, flags); + if (raw_res_spin_lock_irqsave(&loc_l->lock, flags)) + return NULL; node = __local_list_pop_free(loc_l); if (!node) { @@ -448,17 +457,22 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, if (node) __local_list_add_pending(lru, loc_l, cpu, node, hash); - raw_spin_unlock_irqrestore(&loc_l->lock, flags); + raw_res_spin_unlock_irqrestore(&loc_l->lock, flags); if (node) return node; - /* No free nodes found from the local free list and + /* + * No free nodes found from the local free list and * the global LRU list. * * Steal from the local free/pending list of the * current CPU and remote CPU in RR. It starts * with the loc_l->next_steal CPU. + * + * Acquire the victim's lock before touching either list. On + * acquisition failure (rqspinlock AA or timeout) skip the victim + * and try the next CPU. 
*/ first_steal = loc_l->next_steal; @@ -466,24 +480,36 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, do { steal_loc_l = per_cpu_ptr(clru->local_list, steal); - raw_spin_lock_irqsave(&steal_loc_l->lock, flags); - - node = __local_list_pop_free(steal_loc_l); - if (!node) - node = __local_list_pop_pending(lru, steal_loc_l); - - raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags); + if (!raw_res_spin_lock_irqsave(&steal_loc_l->lock, flags)) { + node = __local_list_pop_free(steal_loc_l); + if (!node) + node = __local_list_pop_pending(lru, steal_loc_l); + raw_res_spin_unlock_irqrestore(&steal_loc_l->lock, flags); + } steal = cpumask_next_wrap(steal, cpu_possible_mask); } while (!node && steal != first_steal); loc_l->next_steal = steal; - if (node) { - raw_spin_lock_irqsave(&loc_l->lock, flags); - __local_list_add_pending(lru, loc_l, cpu, node, hash); - raw_spin_unlock_irqrestore(&loc_l->lock, flags); + if (!node) + return NULL; + + if (raw_res_spin_lock_irqsave(&loc_l->lock, flags)) { + /* + * The local pending lock can't be acquired (rqspinlock AA + * or timeout). Return the stolen node to the per-CPU + * free_llist instead of orphaning it; the next pop_free on + * this CPU will pick it up. 
+ */ + node->type = BPF_LRU_LOCAL_LIST_T_FREE; + bpf_lru_node_clear_ref(node); + WRITE_ONCE(node->pending_free, 0); + llist_add(&node->llist, &loc_l->free_llist); + return NULL; } + __local_list_add_pending(lru, loc_l, cpu, node, hash); + raw_res_spin_unlock_irqrestore(&loc_l->lock, flags); return node; } @@ -511,18 +537,24 @@ static void bpf_common_lru_push_free(struct bpf_lru *lru, loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu); - raw_spin_lock_irqsave(&loc_l->lock, flags); + if (raw_res_spin_lock_irqsave(&loc_l->lock, flags)) { + WRITE_ONCE(node->pending_free, 1); + return; + } if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) { - raw_spin_unlock_irqrestore(&loc_l->lock, flags); + raw_res_spin_unlock_irqrestore(&loc_l->lock, + flags); goto check_lru_list; } node->type = BPF_LRU_LOCAL_LIST_T_FREE; bpf_lru_node_clear_ref(node); - list_move(&node->list, local_free_list(loc_l)); + list_del(&node->list); + + raw_res_spin_unlock_irqrestore(&loc_l->lock, flags); - raw_spin_unlock_irqrestore(&loc_l->lock, flags); + llist_add(&node->llist, &loc_l->free_llist); return; } @@ -538,11 +570,14 @@ static void bpf_percpu_lru_push_free(struct bpf_lru *lru, l = per_cpu_ptr(lru->percpu_lru, node->cpu); - raw_spin_lock_irqsave(&l->lock, flags); + if (raw_res_spin_lock_irqsave(&l->lock, flags)) { + WRITE_ONCE(node->pending_free, 1); + return; + } __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); - raw_spin_unlock_irqrestore(&l->lock, flags); + raw_res_spin_unlock_irqrestore(&l->lock, flags); } void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) @@ -565,6 +600,7 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, node = (struct bpf_lru_node *)(buf + node_offset); node->type = BPF_LRU_LIST_T_FREE; + node->pending_free = 0; bpf_lru_node_clear_ref(node); list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); buf += elem_size; @@ -594,6 +630,7 @@ again: node = (struct bpf_lru_node *)(buf + node_offset); node->cpu = cpu; 
node->type = BPF_LRU_LIST_T_FREE; + node->pending_free = 0; bpf_lru_node_clear_ref(node); list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); i++; @@ -618,14 +655,12 @@ void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu) { - int i; - - for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++) - INIT_LIST_HEAD(&loc_l->lists[i]); + INIT_LIST_HEAD(&loc_l->pending_list); + init_llist_head(&loc_l->free_llist); loc_l->next_steal = cpu; - raw_spin_lock_init(&loc_l->lock); + raw_res_spin_lock_init(&loc_l->lock); } static void bpf_lru_list_init(struct bpf_lru_list *l) @@ -640,7 +675,7 @@ static void bpf_lru_list_init(struct bpf_lru_list *l) l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE]; - raw_spin_lock_init(&l->lock); + raw_res_spin_lock_init(&l->lock); } int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index fe2661a58ea9..8d0ee61622af 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -6,11 +6,11 @@ #include <linux/cache.h> #include <linux/list.h> -#include <linux/spinlock_types.h> +#include <linux/llist.h> +#include <asm/rqspinlock.h> #define NR_BPF_LRU_LIST_T (3) #define NR_BPF_LRU_LIST_COUNT (2) -#define NR_BPF_LRU_LOCAL_LIST_T (2) #define BPF_LOCAL_LIST_T_OFFSET NR_BPF_LRU_LIST_T enum bpf_lru_list_type { @@ -22,10 +22,22 @@ enum bpf_lru_list_type { }; struct bpf_lru_node { - struct list_head list; + /* + * A node is in at most one list at a time. The free path on the + * per-CPU locallist uses an llist, so share storage via a union. + */ + union { + struct list_head list; + struct llist_node llist; + }; u16 cpu; u8 type; u8 ref; + /* + * Marks nodes whose *_push_free() lock acquire failed; reclaimed + * by flush/shrink which honor the flag instead of del_from_htab(). 
+ */ + u8 pending_free; }; struct bpf_lru_list { @@ -34,13 +46,14 @@ struct bpf_lru_list { /* The next inactive list rotation starts from here */ struct list_head *next_inactive_rotation; - raw_spinlock_t lock ____cacheline_aligned_in_smp; + rqspinlock_t lock ____cacheline_aligned_in_smp; }; struct bpf_lru_locallist { - struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T]; + struct list_head pending_list; + struct llist_head free_llist; u16 next_steal; - raw_spinlock_t lock; + rqspinlock_t lock; }; struct bpf_common_lru { diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index c5c925f00202..564071a92d7d 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -427,6 +427,26 @@ BTF_ID(func, bpf_lsm_audit_rule_known) BTF_ID(func, bpf_lsm_inode_xattr_skipcap) BTF_SET_END(bool_lsm_hooks) +/* hooks returning void */ +#define LSM_HOOK_void(DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME) +#define LSM_HOOK_int(DEFAULT, NAME, ...) /* nothing */ +#define LSM_HOOK(RET, DEFAULT, NAME, ...) 
LSM_HOOK_##RET(DEFAULT, NAME, __VA_ARGS__) +BTF_SET_START(void_lsm_hooks) +#include <linux/lsm_hook_defs.h> +#undef LSM_HOOK +#undef LSM_HOOK_void +#undef LSM_HOOK_int +BTF_SET_END(void_lsm_hooks) + +bool bpf_lsm_hook_returns_errno(u32 btf_id) +{ + if (btf_id_set_contains(&bool_lsm_hooks, btf_id)) + return false; + if (btf_id_set_contains(&void_lsm_hooks, btf_id)) + return false; + return true; +} + int bpf_lsm_get_retval_range(const struct bpf_prog *prog, struct bpf_retval_range *retval_range) { diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 521cb9d7e8c7..51b16e5f5534 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -594,8 +594,8 @@ const struct bpf_link_ops bpf_struct_ops_link_lops = { .dealloc = bpf_struct_ops_link_dealloc, }; -int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, - struct bpf_tramp_link *link, +int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_nodes *tnodes, + struct bpf_tramp_node *node, const struct btf_func_model *model, void *stub_func, void **_image, u32 *_image_off, @@ -605,13 +605,13 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, void *image = *_image; int size; - tlinks[BPF_TRAMP_FENTRY].links[0] = link; - tlinks[BPF_TRAMP_FENTRY].nr_links = 1; + tnodes[BPF_TRAMP_FENTRY].nodes[0] = node; + tnodes[BPF_TRAMP_FENTRY].nr_nodes = 1; if (model->ret_size > 0) flags |= BPF_TRAMP_F_RET_FENTRY_RET; - size = arch_bpf_trampoline_size(model, flags, tlinks, stub_func); + size = arch_bpf_trampoline_size(model, flags, tnodes, stub_func); if (size <= 0) return size ? 
: -EFAULT; @@ -628,7 +628,7 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, size = arch_prepare_bpf_trampoline(NULL, image + image_off, image + image_off + size, - model, flags, tlinks, stub_func); + model, flags, tnodes, stub_func); if (size <= 0) { if (image != *_image) bpf_struct_ops_image_free(image); @@ -693,7 +693,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, const struct btf_type *module_type; const struct btf_member *member; const struct btf_type *t = st_ops_desc->type; - struct bpf_tramp_links *tlinks; + struct bpf_tramp_nodes *tnodes; void *udata, *kdata; int prog_fd, err; u32 i, trampoline_start, image_off = 0; @@ -720,8 +720,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, if (uvalue->common.state || refcount_read(&uvalue->common.refcnt)) return -EINVAL; - tlinks = kzalloc_objs(*tlinks, BPF_TRAMP_MAX); - if (!tlinks) + tnodes = kzalloc_objs(*tnodes, BPF_TRAMP_MAX); + if (!tnodes) return -ENOMEM; uvalue = (struct bpf_struct_ops_value *)st_map->uvalue; @@ -817,8 +817,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, err = -ENOMEM; goto reset_unlock; } - bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, - &bpf_struct_ops_link_lops, prog, prog->expected_attach_type); + bpf_tramp_link_init(link, BPF_LINK_TYPE_STRUCT_OPS, + &bpf_struct_ops_link_lops, prog, prog->expected_attach_type, 0); + *plink++ = &link->link; /* Poison pointer on error instead of return for backward compatibility */ @@ -832,7 +833,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, *pksym++ = ksym; trampoline_start = image_off; - err = bpf_struct_ops_prepare_trampoline(tlinks, link, + err = bpf_struct_ops_prepare_trampoline(tnodes, &link->node, &st_ops->func_models[i], *(void **)(st_ops->cfi_stubs + moff), &image, &image_off, @@ -911,7 +912,7 @@ reset_unlock: memset(uvalue, 0, map->value_size); memset(kvalue, 0, map->value_size); unlock: - 
kfree(tlinks); + kfree(tnodes); mutex_unlock(&st_map->lock); if (!err) bpf_struct_ops_map_add_ksyms(st_map); @@ -1204,6 +1205,42 @@ u32 bpf_struct_ops_id(const void *kdata) } EXPORT_SYMBOL_GPL(bpf_struct_ops_id); +/** + * bpf_struct_ops_for_each_prog - Invoke @cb for each member prog + * @kdata: kernel-side struct_ops vmtable (the @kdata arg to ->reg/->update/->unreg) + * @cb: callback invoked once per member prog; non-zero return stops iteration + * @data: opaque argument passed to @cb + * + * Walks the struct_ops member progs registered on the map containing @kdata. + * Intended for use from struct_ops ->reg() callbacks (and similar) that need to + * inspect the loaded BPF programs (for example to discover maps they reference + * via @prog->aux->used_maps). + * + * Return 0 if iteration completed, otherwise the first non-zero @cb return. + */ +int bpf_struct_ops_for_each_prog(const void *kdata, + int (*cb)(struct bpf_prog *prog, void *data), + void *data) +{ + struct bpf_struct_ops_value *kvalue; + struct bpf_struct_ops_map *st_map; + u32 i; + int ret; + + kvalue = container_of(kdata, struct bpf_struct_ops_value, data); + st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue); + + for (i = 0; i < st_map->funcs_cnt; i++) { + if (!st_map->links[i]) + continue; + ret = cb(st_map->links[i]->prog, data); + if (ret) + return ret; + } + return 0; +} +EXPORT_SYMBOL_GPL(bpf_struct_ops_for_each_prog); + static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a62d78581207..15ae7c43f594 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -182,7 +182,6 @@ #define BITS_ROUNDUP_BYTES(bits) \ (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits)) -#define BTF_INFO_MASK 0x9f00ffff #define BTF_INT_MASK 0x0fffffff #define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE) #define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= 
BTF_MAX_NAME_OFFSET) @@ -289,7 +288,7 @@ enum verifier_phase { struct resolve_vertex { const struct btf_type *t; u32 type_id; - u16 next_member; + u32 next_member; }; enum visit_state { @@ -2031,7 +2030,7 @@ static int env_stack_push(struct btf_verifier_env *env, } static void env_stack_set_next_member(struct btf_verifier_env *env, - u16 next_member) + u32 next_member) { env->stack[env->top_stack - 1].next_member = next_member; } @@ -3293,7 +3292,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, struct btf *btf = env->btf; u32 struct_size = t->size; u32 offset; - u16 i; + u32 i; meta_needed = btf_type_vlen(t) * sizeof(*member); if (meta_left < meta_needed) { @@ -3369,7 +3368,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env, { const struct btf_member *member; int err; - u16 i; + u32 i; /* Before continue resolving the next_member, * ensure the last member is indeed resolved to a @@ -3668,7 +3667,7 @@ end: static int btf_repeat_fields(struct btf_field_info *info, int info_cnt, u32 field_cnt, u32 repeat_cnt, u32 elem_size) { - u32 i, j; + u32 i, j, total_cnt, total_repeats; u32 cur; /* Ensure not repeating fields that should not be repeated. */ @@ -3686,10 +3685,9 @@ static int btf_repeat_fields(struct btf_field_info *info, int info_cnt, } } - /* The type of struct size or variable size is u32, - * so the multiplication will not overflow. 
- */ - if (field_cnt * (repeat_cnt + 1) > info_cnt) + if (check_add_overflow(repeat_cnt, 1, &total_repeats) || + check_mul_overflow(field_cnt, total_repeats, &total_cnt) || + total_cnt > (u32)info_cnt) return -E2BIG; cur = field_cnt; @@ -4447,7 +4445,7 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, const struct btf_enum *enums = btf_type_enum(t); struct btf *btf = env->btf; const char *fmt_str; - u16 i, nr_enums; + u32 i, nr_enums; u32 meta_needed; nr_enums = btf_type_vlen(t); @@ -4555,7 +4553,7 @@ static s32 btf_enum64_check_meta(struct btf_verifier_env *env, const struct btf_enum64 *enums = btf_type_enum64(t); struct btf *btf = env->btf; const char *fmt_str; - u16 i, nr_enums; + u32 i, nr_enums; u32 meta_needed; nr_enums = btf_type_vlen(t); @@ -4683,7 +4681,7 @@ static void btf_func_proto_log(struct btf_verifier_env *env, const struct btf_type *t) { const struct btf_param *args = (const struct btf_param *)(t + 1); - u16 nr_args = btf_type_vlen(t), i; + u32 nr_args = btf_type_vlen(t), i; btf_verifier_log(env, "return=%u args=(", t->type); if (!nr_args) { @@ -4929,7 +4927,7 @@ static int btf_datasec_resolve(struct btf_verifier_env *env, { const struct btf_var_secinfo *vsi; struct btf *btf = env->btf; - u16 i; + u32 i; env->resolve_mode = RESOLVE_TBD; for_each_vsi_from(i, v->next_member, v->t, vsi) { @@ -5183,7 +5181,7 @@ static int btf_func_proto_check(struct btf_verifier_env *env, const struct btf_type *ret_type; const struct btf_param *args; const struct btf *btf; - u16 nr_args, i; + u32 nr_args, i; int err; btf = env->btf; @@ -5278,7 +5276,7 @@ static int btf_func_check(struct btf_verifier_env *env, const struct btf_type *proto_type; const struct btf_param *args; const struct btf *btf; - u16 nr_args, i; + u32 nr_args, i; btf = env->btf; proto_type = btf_type_by_id(btf, t->type); @@ -5336,12 +5334,6 @@ static s32 btf_check_meta(struct btf_verifier_env *env, } meta_left -= sizeof(*t); - if (t->info & ~BTF_INFO_MASK) { - btf_verifier_log(env, "[%u] 
Invalid btf_info:%x", - env->log_type_id, t->info); - return -EINVAL; - } - if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX || BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) { btf_verifier_log(env, "[%u] Invalid kind:%u", @@ -5914,25 +5906,10 @@ static int btf_check_type_tags(struct btf_verifier_env *env, return 0; } -static int finalize_log(struct bpf_verifier_log *log, bpfptr_t uattr, u32 uattr_size) -{ - u32 log_true_size; - int err; - - err = bpf_vlog_finalize(log, &log_true_size); - - if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) && - copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size), - &log_true_size, sizeof(log_true_size))) - err = -EFAULT; - - return err; -} - -static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) +static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, + struct bpf_log_attr *attr_log) { bpfptr_t btf_data = make_bpfptr(attr->btf, uattr.is_kernel); - char __user *log_ubuf = u64_to_user_ptr(attr->btf_log_buf); struct btf_struct_metas *struct_meta_tab; struct btf_verifier_env *env = NULL; struct btf *btf = NULL; @@ -5949,8 +5926,7 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat /* user could have requested verbose verifier output * and supplied buffer to store the verification trace */ - err = bpf_vlog_init(&env->log, attr->btf_log_level, - log_ubuf, attr->btf_log_size); + err = bpf_vlog_init(&env->log, attr_log->level, attr_log->ubuf, attr_log->size); if (err) goto errout_free; @@ -6015,7 +5991,7 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat } } - err = finalize_log(&env->log, uattr, uattr_size); + err = bpf_log_attr_finalize(attr_log, &env->log); if (err) goto errout_free; @@ -6027,7 +6003,7 @@ errout_meta: btf_free_struct_meta_tab(btf); errout: /* overwrite err with -ENOSPC or -EFAULT */ - ret = finalize_log(&env->log, uattr, uattr_size); + ret = bpf_log_attr_finalize(attr_log, 
&env->log); if (ret) err = ret; errout_free: @@ -6980,7 +6956,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, info->reg_type = ctx_arg_info->reg_type; info->btf = ctx_arg_info->btf ? : btf_vmlinux; info->btf_id = ctx_arg_info->btf_id; - info->ref_obj_id = ctx_arg_info->ref_obj_id; + info->ref_id = ctx_arg_info->ref_id; return true; } } @@ -7825,6 +7801,134 @@ enum btf_arg_tag { ARG_TAG_ARENA = BIT_ULL(5), }; +static int btf_scan_decl_tags(struct bpf_verifier_env *env, + const struct btf *btf, + const struct btf_type *fn_t, + u32 arg_idx, bool is_global, u32 *tags) +{ + int id = btf_named_start_id(btf, false) - 1; + const char tag_key[] = "arg:"; + static const struct { + const char *tag_value; + enum btf_arg_tag arg_tag; + } tag_values[] = { + { "ctx", ARG_TAG_CTX }, + { "trusted", ARG_TAG_TRUSTED }, + { "untrusted", ARG_TAG_UNTRUSTED }, + { "nonnull", ARG_TAG_NONNULL }, + { "nullable", ARG_TAG_NULLABLE }, + { "arena", ARG_TAG_ARENA }, + }; + + /* + * The 'arg:<tag>' decl_tag takes precedence over the derivation + * of the register type from the BTF type itself. 
+ */ + while ((id = btf_find_next_decl_tag(btf, fn_t, arg_idx, tag_key, id)) > 0) { + const struct btf_type *tag_t; + const char *tag; + int i; + bool found; + + /* disallow arg tags in static subprogs */ + if (!is_global) { + bpf_log(&env->log, + "arg#%d type tag is not supported in static functions\n", + arg_idx); + return -EOPNOTSUPP; + } + + tag_t = btf_type_by_id(btf, id); + tag = __btf_name_by_offset(btf, tag_t->name_off) + (sizeof(tag_key) - 1); + + found = false; + for (i = 0; i < ARRAY_SIZE(tag_values); ++i) { + if (!strcmp(tag, tag_values[i].tag_value)) { + *tags |= tag_values[i].arg_tag; + found = true; + break; + } + } + + if (!found) { + bpf_log(&env->log, "arg#%d has unsupported set of tags\n", arg_idx); + return -EOPNOTSUPP; + } + } + if (id != -ENOENT) { + bpf_log(&env->log, "arg#%d type tag fetching failure: %d\n", arg_idx, id); + return id; + } + + return 0; +} + +static int btf_scan_type_tags(struct bpf_verifier_env *env, + const struct btf *btf, u32 type_id, + u32 *tags) +{ + const struct btf_type *t; + + /* Find the first pointer type in the chain. */ + t = btf_type_skip_modifiers(btf, type_id, NULL); + + /* + * We currently reject type tags on non-pointer types, + * which neither LLVM nor GCC support anyway. + */ + if (!t || !btf_type_is_ptr(t)) + return 0; + + /* We got a pointer, get all associated type tags. */ + for (t = btf_type_by_id(btf, t->type); t && btf_type_is_modifier(t); + t = btf_type_by_id(btf, t->type)) { + + /* Skip non-type tag modifiers. */ + if (!btf_type_is_type_tag(t)) + continue; + + const char *tag = __btf_name_by_offset(btf, t->name_off); + + if (strcmp(tag, "arena") == 0) { + *tags |= ARG_TAG_ARENA; + } else { + bpf_log(&env->log, "function signature member has unsupported type tag '%s'\n", + tag); + return -EOPNOTSUPP; + } + } + + return 0; +} + +/* Check whether the type is a valid return type. 
*/ +static int btf_validate_return_type(struct bpf_verifier_env *env, struct btf *btf, + const struct btf_type *t, int subprog) +{ + u32 tags = 0; + int err; + + err = btf_scan_type_tags(env, btf, t->type, &tags); + if (err) + return err; + + t = btf_type_skip_modifiers(btf, t->type, NULL); + + /* + * We allow all subprogs except for the main one to return any kind of arena pointer. + * General arena variables are not allowed, since it makes no sense to return by value + * a variable that's on the heap in the first place. + */ + if (subprog && (tags & ARG_TAG_ARENA) && btf_type_is_ptr(t)) + return 0; + + /* We always accept void or scalars. */ + if (btf_type_is_void(t) || btf_type_is_int(t) || btf_is_any_enum(t)) + return 0; + + return -EOPNOTSUPP; +} + /* Process BTF of a function to produce high-level expectation of function * arguments (like ARG_PTR_TO_CTX, or ARG_PTR_TO_MEM, etc). This information * is cached in subprog info for reuse. @@ -7843,6 +7947,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) struct btf *btf = prog->aux->btf; const struct btf_param *args; const struct btf_type *t, *ref_t, *fn_t; + int err; u32 i, nargs, btf_id; const char *tname; @@ -7887,25 +7992,36 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) } args = (const struct btf_param *)(t + 1); nargs = btf_type_vlen(t); + sub->arg_cnt = nargs; + if (nargs > MAX_BPF_FUNC_ARGS) { + bpf_log(log, "kernel supports at most %d parameters, function %s has %d\n", + MAX_BPF_FUNC_ARGS, tname, nargs); + return -EFAULT; + } if (nargs > MAX_BPF_FUNC_REG_ARGS) { - if (!is_global) - return -EINVAL; - bpf_log(log, "Global function %s() with %d > %d args. 
Buggy compiler.\n", + if (!bpf_jit_supports_stack_args()) { + bpf_log(log, "JIT does not support function %s() with %d args\n", + tname, nargs); + return -EFAULT; + } + sub->stack_arg_cnt = nargs - MAX_BPF_FUNC_REG_ARGS; + } + + if (is_global && nargs > MAX_BPF_FUNC_REG_ARGS) { + bpf_log(log, "global function %s has %d > %d args, stack args not supported\n", tname, nargs, MAX_BPF_FUNC_REG_ARGS); return -EINVAL; } - /* check that function is void or returns int, exception cb also requires this */ - t = btf_type_by_id(btf, t->type); - while (btf_type_is_modifier(t)) - t = btf_type_by_id(btf, t->type); - if (!btf_type_is_void(t) && !btf_type_is_int(t) && !btf_is_any_enum(t)) { - if (!is_global) - return -EINVAL; - bpf_log(log, - "Global function %s() return value not void or scalar. " - "Only those are supported.\n", - tname); - return -EINVAL; + + err = btf_validate_return_type(env, btf, t, subprog); + if (err) { + if (is_global) { + bpf_log(log, + "Global function %s() return value not void or scalar. " + "Only those are supported.\n", + tname); + } + return err; } /* Convert BTF function arguments into verifier types. 
@@ -7913,42 +8029,13 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) */ for (i = 0; i < nargs; i++) { u32 tags = 0; - int id = btf_named_start_id(btf, false) - 1; - - /* 'arg:<tag>' decl_tag takes precedence over derivation of - * register type from BTF type itself - */ - while ((id = btf_find_next_decl_tag(btf, fn_t, i, "arg:", id)) > 0) { - const struct btf_type *tag_t = btf_type_by_id(btf, id); - const char *tag = __btf_name_by_offset(btf, tag_t->name_off) + 4; - - /* disallow arg tags in static subprogs */ - if (!is_global) { - bpf_log(log, "arg#%d type tag is not supported in static functions\n", i); - return -EOPNOTSUPP; - } + err = btf_scan_decl_tags(env, btf, fn_t, i, is_global, &tags); + if (err) + return err; - if (strcmp(tag, "ctx") == 0) { - tags |= ARG_TAG_CTX; - } else if (strcmp(tag, "trusted") == 0) { - tags |= ARG_TAG_TRUSTED; - } else if (strcmp(tag, "untrusted") == 0) { - tags |= ARG_TAG_UNTRUSTED; - } else if (strcmp(tag, "nonnull") == 0) { - tags |= ARG_TAG_NONNULL; - } else if (strcmp(tag, "nullable") == 0) { - tags |= ARG_TAG_NULLABLE; - } else if (strcmp(tag, "arena") == 0) { - tags |= ARG_TAG_ARENA; - } else { - bpf_log(log, "arg#%d has unsupported set of tags\n", i); - return -EOPNOTSUPP; - } - } - if (id != -ENOENT) { - bpf_log(log, "arg#%d type tag fetching failure: %d\n", i, id); - return id; - } + err = btf_scan_type_tags(env, btf, args[i].type, &tags); + if (err) + return err; t = btf_type_by_id(btf, args[i].type); while (btf_type_is_modifier(t)) @@ -7973,7 +8060,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) bpf_log(log, "arg#%d has invalid combination of tags\n", i); return -EINVAL; } - sub->args[i].arg_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY; + sub->args[i].arg_type = ARG_PTR_TO_DYNPTR; continue; } if (tags & ARG_TAG_TRUSTED) { @@ -8074,7 +8161,6 @@ skip_pointer: return -EINVAL; } - sub->arg_cnt = nargs; sub->args_cached = true; return 0; @@ -8196,12 +8282,12 @@ static int 
__btf_new_fd(struct btf *btf) return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC); } -int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) +int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_attr *attr_log) { struct btf *btf; int ret; - btf = btf_parse(attr, uattr, uattr_size); + btf = btf_parse(attr, uattr, attr_log); if (IS_ERR(btf)) return PTR_ERR(btf); @@ -8684,6 +8770,39 @@ static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name, return 0; } +static int btf_check_kfunc_name(struct btf *btf, const char *func_name, u32 kind) +{ +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + struct btf_module *btf_mod, *tmp; +#endif + s32 id; + + if (!btf_is_module(btf)) + return 0; + + id = btf_find_by_name_kind(bpf_get_btf_vmlinux(), func_name, kind); + if (id >= 0) { + pr_err("kfunc %s (id: %d) is already present in vmlinux.\n", + func_name, id); + return -EINVAL; + } + +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + guard(mutex)(&btf_module_mutex); + list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { + if (btf_mod->btf == btf) + continue; + id = btf_find_by_name_kind(btf_mod->btf, func_name, kind); + if (id >= 0) { + pr_err("kfunc %s (id: %d) is already present in module %s.\n", + func_name, id, btf_mod->module->name); + return -EINVAL; + } + } +#endif + return 0; +} + static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags) { const struct btf_type *func; @@ -8697,7 +8816,8 @@ static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags) /* sanity check kfunc name */ func_name = btf_name_by_offset(btf, func->name_off); - if (!func_name || !func_name[0]) + if (!func_name || !func_name[0] || + btf_check_kfunc_name(btf, func_name, BTF_INFO_KIND(func->info))) return -EINVAL; func = btf_type_by_id(btf, func->type); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 876f6a81a9b6..83ce66296ac1 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -55,6 
+55,28 @@ void __init cgroup_bpf_lifetime_notifier_init(void) &cgroup_bpf_lifetime_nb)); } +#ifdef CONFIG_BPF_LSM +struct cgroup_lsm_atype { + u32 attach_btf_id; + int refcnt; + bool returns_errno; +}; + +static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM]; + +static bool cgroup_bpf_hook_returns_errno(enum cgroup_bpf_attach_type atype) +{ + if (atype >= CGROUP_LSM_START && atype <= CGROUP_LSM_END) + return READ_ONCE(cgroup_lsm_atype[atype - CGROUP_LSM_START].returns_errno); + return true; +} +#else +static bool cgroup_bpf_hook_returns_errno(enum cgroup_bpf_attach_type atype) +{ + return true; +} +#endif + /* __always_inline is necessary to prevent indirect call through run_prog * function pointer. */ @@ -83,7 +105,8 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp, *(ret_flags) |= (func_ret >> 1); func_ret &= 1; } - if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval)) + if (!func_ret && cgroup_bpf_hook_returns_errno(atype) && + !IS_ERR_VALUE((long)run_ctx.retval)) run_ctx.retval = -EPERM; item++; } @@ -156,13 +179,6 @@ unsigned int __cgroup_bpf_run_lsm_current(const void *ctx, } #ifdef CONFIG_BPF_LSM -struct cgroup_lsm_atype { - u32 attach_btf_id; - int refcnt; -}; - -static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM]; - static enum cgroup_bpf_attach_type bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) { @@ -191,10 +207,13 @@ void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) lockdep_assert_held(&cgroup_mutex); - WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id && - cgroup_lsm_atype[i].attach_btf_id != attach_btf_id); - - cgroup_lsm_atype[i].attach_btf_id = attach_btf_id; + if (!cgroup_lsm_atype[i].attach_btf_id) { + cgroup_lsm_atype[i].attach_btf_id = attach_btf_id; + WRITE_ONCE(cgroup_lsm_atype[i].returns_errno, + bpf_lsm_hook_returns_errno(attach_btf_id)); + } else { + WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id != attach_btf_id); + } cgroup_lsm_atype[i].refcnt++; } @@ -203,8 +222,10 @@ void 
bpf_cgroup_atype_put(int cgroup_atype) int i = cgroup_atype - CGROUP_LSM_START; cgroup_lock(); - if (--cgroup_lsm_atype[i].refcnt <= 0) + if (--cgroup_lsm_atype[i].refcnt <= 0) { + WRITE_ONCE(cgroup_lsm_atype[i].returns_errno, true); cgroup_lsm_atype[i].attach_btf_id = 0; + } WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0); cgroup_unlock(); } @@ -1208,7 +1229,7 @@ static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, /* Must be called with cgroup_mutex held to avoid races. */ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, - union bpf_attr __user *uattr) + union bpf_attr __user *uattr, u32 uattr_size) { __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags); bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE; @@ -1259,7 +1280,8 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, return -EFAULT; if (!effective_query && from_atype == to_atype) revision = cgrp->bpf.revisions[from_atype]; - if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision))) + if (uattr_size >= offsetofend(union bpf_attr, query.revision) && + copy_to_user(&uattr->query.revision, &revision, sizeof(revision))) return -EFAULT; if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt) /* return early if user requested only program count + flags */ @@ -1312,12 +1334,12 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, } static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, - union bpf_attr __user *uattr) + union bpf_attr __user *uattr, u32 uattr_size) { int ret; cgroup_lock(); - ret = __cgroup_bpf_query(cgrp, attr, uattr); + ret = __cgroup_bpf_query(cgrp, attr, uattr, uattr_size); cgroup_unlock(); return ret; } @@ -1520,7 +1542,7 @@ out_put_cgroup: } int cgroup_bpf_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr) + union bpf_attr __user *uattr, u32 uattr_size) { struct cgroup *cgrp; int ret; 
@@ -1529,7 +1551,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, if (IS_ERR(cgrp)) return PTR_ERR(cgrp); - ret = cgroup_bpf_query(cgrp, attr, uattr); + ret = cgroup_bpf_query(cgrp, attr, uattr, uattr_size); cgroup_put(cgrp); return ret; @@ -1935,8 +1957,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, kfree(ctx.cur_val); - if (ret == 1 && ctx.new_updated) { - kfree(*buf); + if (!ret && ctx.new_updated) { + kvfree(*buf); *buf = ctx.new_val; *pcount = ctx.new_len; } else { @@ -2342,6 +2364,7 @@ BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx, return -E2BIG; memcpy(ctx->new_val, buf, buf_len); + ((char *)ctx->new_val)[buf_len] = '\0'; ctx->new_len = buf_len; ctx->new_updated = 1; diff --git a/kernel/bpf/cnum.c b/kernel/bpf/cnum.c new file mode 100644 index 000000000000..86142cb2aee5 --- /dev/null +++ b/kernel/bpf/cnum.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include <linux/bits.h> + +#define T 32 +#include "cnum_defs.h" +#undef T + +#define T 64 +#include "cnum_defs.h" +#undef T + +struct cnum32 cnum32_from_cnum64(struct cnum64 cnum) +{ + if (cnum64_is_empty(cnum)) + return CNUM32_EMPTY; + + if (cnum.size >= U32_MAX) + return (struct cnum32){ .base = 0, .size = U32_MAX }; + else + return (struct cnum32){ .base = (u32)cnum.base, .size = cnum.size }; +} + +/* + * Suppose 'a' and 'b' are laid out as follows: + * + * 64-bit number axis ---> + * + * N*2^32 (N+1)*2^32 (N+2)*2^32 (N+3)*2^32 + * ||------|---|=====|-------||----------|=====|-------||----------|=====|----|--|| + * | |< b >| |< b >| |< b >| | + * | | | | + * |<--+--------------------------- a ---------------------------+--->| + * | | + * |<-------------------------- t -------------------------->| + * + * In such a case it is possible to infer a more tight representation t + * such that ∀ v ∈ a, (u32)v ∈ b: v ∈ t. 
+ */ +struct cnum64 cnum64_cnum32_intersect(struct cnum64 a, struct cnum32 b) +{ + /* + * To simplify reasoning, rotate the circles so that [virtual] a1 starts + * at u32 boundary, b1 represents b in this new frame of reference. + */ + struct cnum32 b1 = { b.base - (u32)a.base, b.size }; + struct cnum64 t = a; + u64 d, b1_max; + + if (cnum64_is_empty(a) || cnum32_is_empty(b)) + return CNUM64_EMPTY; + + if (cnum32_urange_overflow(b1)) { + b1_max = (u32)b1.base + (u32)b1.size; /* overflow here is fine and necessary */ + if ((u32)a.size > b1_max && (u32)a.size < b1.base) { + /* + * N*2^32 (N+1)*2^32 + * ||=====|------------|=====||=====|---------|---|=====|| + * |b1 ->| |<- b1||b1 ->| | |<- b1| + * |<----------------- a1 ------------------>| + * |<-------------- t ------------>|<-- d -->| (after adjustment) + * ^ + * b1_max + */ + d = (u32)a.size - b1_max; + t.size -= d; + } else { + /* + * No adjustments possible in the following cases: + * + * ||=====|------------|=====||===|=|-------------|=|===|| + * |b1 ->| |<- b1||b1 +>| |<+ b1| + * |<----------------- a1 ------>| | + * |<----------------- (or) a1 ------------------->| + */ + } + } else { + if (t.size < b1.base) + /* + * N*2^32 (N+1)*2^32 + * ||----------|--|=======|--||------> + * |<-- a1 -->| |<- b ->| + */ + return CNUM64_EMPTY; + /* + * N*2^32 (N+1)*2^32 + * ||-------------|========|-||-----| -------|========|-|| + * | |<- b1 ->| | |<- b1 ->| + * |<------------+ a1 ------------>| + * |<------ t ------>| (after adjustment) + */ + t.base += b1.base; + t.size -= b1.base; + b1_max = b1.base + b1.size; + d = 0; + if ((u32)a.size < b1.base) + /* + * N*2^32 (N+1)*2^32 + * ||-------------|========|-||------|-------|========|-|| + * | |<- b1 ->| | |<- b1 ->| + * |<------------+-- a1 --+-------->| + * |<- t ->|<-- d -->| (after adjustment) + */ + d = (u32)a.size + (BIT_ULL(32) - b1_max); + else if ((u32)a.size >= b1_max) + /* + * N*2^32 (N+1)*2^32 + * ||--|========|------------||--|========|-------|-----|| + * | |<- 
b1 ->| |<- b1 ->| | + * |<-+------------------ a1 ------------+------>| + * |<-------------- t --------------->|<- d ->| (after adjustment) + */ + d = (u32)a.size - b1_max; + if (t.size < d) + return CNUM64_EMPTY; + t.size -= d; + } + return t; +} diff --git a/kernel/bpf/cnum_defs.h b/kernel/bpf/cnum_defs.h new file mode 100644 index 000000000000..a90e317e3578 --- /dev/null +++ b/kernel/bpf/cnum_defs.h @@ -0,0 +1,247 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#ifndef T +#error "Define T (bit width: 32, 64) before including cnum_defs.h" +#endif + +#include <linux/cnum.h> +#include <linux/kernel.h> +#include <linux/limits.h> +#include <linux/minmax.h> +#include <linux/compiler_types.h> + +#define cnum_t __PASTE(cnum, T) +#define ut __PASTE(u, T) +#define st __PASTE(s, T) +#define UT_MAX __PASTE(__PASTE(U, T), _MAX) +#define ST_MAX __PASTE(__PASTE(S, T), _MAX) +#define ST_MIN __PASTE(__PASTE(S, T), _MIN) +#define EMPTY __PASTE(__PASTE(CNUM, T), _EMPTY) +#define FN(name) __PASTE(__PASTE(cnum, T), __PASTE(_, name)) + +struct cnum_t FN(from_urange)(ut min, ut max) +{ + return (struct cnum_t){ .base = min, .size = (ut)max - min }; +} + +struct cnum_t FN(from_srange)(st min, st max) +{ + ut size = (ut)max - (ut)min; + ut base = size == UT_MAX ? 0 : (ut)min; + + return (struct cnum_t){ .base = base, .size = size }; +} + +/* True if this cnum represents two unsigned ranges. */ +static inline bool FN(urange_overflow)(struct cnum_t cnum) +{ + /* Same as cnum.base + cnum.size > UT_MAX but avoids overflow */ + return cnum.size > UT_MAX - (ut)cnum.base; +} + +/* + * cnum{T}_umin / cnum{T}_umax query an unsigned range represented by this cnum. + * If cnum represents a range crossing the UT_MAX/0 boundary, the unbound range + * [0..UT_MAX] is returned. + */ +ut FN(umin)(struct cnum_t cnum) +{ + return FN(urange_overflow)(cnum) ? 
0 : cnum.base; +} +EXPORT_SYMBOL_GPL(FN(umin)); + +ut FN(umax)(struct cnum_t cnum) +{ + return FN(urange_overflow)(cnum) ? UT_MAX : cnum.base + cnum.size; +} +EXPORT_SYMBOL_GPL(FN(umax)); + +/* True if this cnum represents two signed ranges. */ +static inline bool FN(srange_overflow)(struct cnum_t cnum) +{ + return FN(contains)(cnum, (ut)ST_MAX) && FN(contains)(cnum, (ut)ST_MIN); +} + +/* + * cnum{T}_smin / cnum{T}_smax query a signed range represented by this cnum. + * If cnum represents a range crossing the ST_MAX/ST_MIN boundary, the unbound range + * [ST_MIN..ST_MAX] is returned. + */ +st FN(smin)(struct cnum_t cnum) +{ + return FN(srange_overflow)(cnum) + ? ST_MIN + : min((st)cnum.base, (st)(cnum.base + cnum.size)); +} + +st FN(smax)(struct cnum_t cnum) +{ + return FN(srange_overflow)(cnum) + ? ST_MAX + : max((st)cnum.base, (st)(cnum.base + cnum.size)); +} + +/* + * Returns a possibly empty intersection of cnums 'a' and 'b'. + * If 'a' and 'b' intersect in two sub-arcs, the function over-approximates + * and returns either 'a' or 'b', whichever is smaller. + */ +struct cnum_t FN(intersect)(struct cnum_t a, struct cnum_t b) +{ + struct cnum_t b1; + ut dbase; + + if (FN(is_empty)(a) || FN(is_empty)(b)) + return EMPTY; + + if (a.base > b.base) + swap(a, b); + + /* + * Rotate frame of reference such that a.base is 0. + * 'b1' is 'b' in this frame of reference. + */ + dbase = b.base - a.base; + b1 = (struct cnum_t){ dbase, b.size }; + if (FN(urange_overflow)(b1)) { + if (b1.base <= a.size) { + /* + * Rotated frame (a.base at origin): + * + * 0 UT_MAX + * |--------------------------------------------| + * [=== a ==========================] | + * [= b1 tail =] [========= b1 main ==========>] + * ^-- b1.base <= a.size + * + * 'a' and 'b' intersect in two disjoint arcs, + * can't represent as single cnum, over-approximate + * the result. + */ + return a.size <= b.size ? 
a : b; + } else { + /* + * Rotated frame (a.base at origin): + * + * 0 UT_MAX + * |--------------------------------------------| + * [=== a =============] | | + * [= b1 tail =] [======= b1 main ====>] + * ^-- b1.base > a.size + * + * Only 'b' tail intersects 'a'. + */ + return (struct cnum_t) { + .base = a.base, + .size = min(a.size, (ut)(b1.base + b1.size)), + }; + } + } else if (a.size >= b1.base) { + /* + * Rotated frame (a.base at origin): + * + * 0 UT_MAX + * |--------------------------------------------------| + * [=== a ==================================] | + * [== b1 =====================] + * + * 0 UT_MAX + * |--------------------------------------------------| + * [=== a ==================================] | + * [== b1 ====] + * ^-- b1.base <= a.size + * |<-- a.size - dbase -->| + * + * 'a' and 'b' intersect as one cnum. + */ + return (struct cnum_t) { + .base = b.base, + .size = min((ut)(a.size - dbase), b.size), + }; + } else { + return EMPTY; + } +} + +void FN(intersect_with)(struct cnum_t *dst, struct cnum_t src) +{ + *dst = FN(intersect)(*dst, src); +} + +void FN(intersect_with_urange)(struct cnum_t *dst, ut min, ut max) +{ + FN(intersect_with)(dst, FN(from_urange)(min, max)); +} + +void FN(intersect_with_srange)(struct cnum_t *dst, st min, st max) +{ + FN(intersect_with)(dst, FN(from_srange)(min, max)); +} + +static inline struct cnum_t FN(normalize)(struct cnum_t cnum) +{ + if (cnum.size == UT_MAX && cnum.base != 0 && cnum.base != (ut)ST_MAX) + cnum.base = 0; + return cnum; +} + +struct cnum_t FN(add)(struct cnum_t a, struct cnum_t b) +{ + if (FN(is_empty)(a) || FN(is_empty)(b)) + return EMPTY; + if (a.size > UT_MAX - b.size) + return (struct cnum_t){ 0, (ut)UT_MAX }; + else + return FN(normalize)((struct cnum_t){ a.base + b.base, a.size + b.size }); +} + +struct cnum_t FN(negate)(struct cnum_t a) +{ + if (FN(is_empty)(a)) + return EMPTY; + return FN(normalize)((struct cnum_t){ -((ut)a.base + a.size), a.size }); +} + +bool FN(is_empty)(struct 
cnum_t cnum) +{ + return cnum.base == EMPTY.base && cnum.size == EMPTY.size; +} + +bool FN(contains)(struct cnum_t cnum, ut v) +{ + if (FN(is_empty)(cnum)) + return false; + if (FN(urange_overflow)(cnum)) + return v >= cnum.base || v <= (ut)cnum.base + cnum.size; + else + return v >= cnum.base && v <= (ut)cnum.base + cnum.size; +} + +bool FN(is_const)(struct cnum_t cnum) +{ + return cnum.size == 0; +} + +bool FN(is_subset)(struct cnum_t bigger, struct cnum_t smaller) +{ + if (FN(is_empty(smaller))) + return true; + if (FN(is_empty(bigger))) + return false; + /* rotate both arcs such that 'bigger' starts at origin, hence does not overflow */ + smaller.base -= bigger.base; + bigger.base = 0; + if (FN(urange_overflow)(smaller) && bigger.size < UT_MAX) + return false; + return smaller.base + smaller.size <= bigger.size; +} + +#undef EMPTY +#undef cnum_t +#undef ut +#undef st +#undef UT_MAX +#undef ST_MAX +#undef ST_MIN +#undef FN diff --git a/kernel/bpf/const_fold.c b/kernel/bpf/const_fold.c index db73c4740b1e..b2a19acadb91 100644 --- a/kernel/bpf/const_fold.c +++ b/kernel/bpf/const_fold.c @@ -58,6 +58,14 @@ static void const_reg_xfer(struct bpf_verifier_env *env, struct const_arg_info * u8 opcode = BPF_OP(insn->code) | BPF_SRC(insn->code); int r; + /* Stack arg stores (r11-based) are outside the tracked register set. 
*/ + if (is_stack_arg_st(insn) || is_stack_arg_stx(insn)) + return; + if (is_stack_arg_ldx(insn)) { + ci_out[insn->dst_reg] = unknown; + return; + } + switch (class) { case BPF_ALU: case BPF_ALU64: diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6aa2a8b24030..649cce41e13f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1299,8 +1299,8 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, u32 imm_rnd = get_random_u32(); s16 off; - BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); - BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG); + BUILD_BUG_ON(BPF_REG_PARAMS + 2 != MAX_BPF_JIT_REG); + BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); /* Constraints on AX register: * @@ -1582,6 +1582,16 @@ bool bpf_insn_is_indirect_target(const struct bpf_verifier_env *env, const struc insn_idx += prog->aux->subprog_start; return env->insn_aux_data[insn_idx].indirect_target; } + +u16 bpf_out_stack_arg_cnt(const struct bpf_verifier_env *env, const struct bpf_prog *prog) +{ + const struct bpf_subprog_info *sub; + + if (!env) + return 0; + sub = &env->subprog_info[prog->aux->func_idx]; + return sub->stack_arg_cnt - bpf_in_stack_arg_cnt(sub); +} #endif /* CONFIG_BPF_JIT */ /* Base function for offset calculation. Needs to go into .text section, @@ -2471,7 +2481,7 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map, cookie = aux->cgroup_storage[i] ? 
aux->cgroup_storage[i]->cookie : 0; ret = map->owner->storage_cookie[i] == cookie || - !cookie; + (!cookie && !aux->tail_call_reachable); } if (ret && map->owner->attach_func_proto != aux->attach_func_proto) { @@ -3228,6 +3238,11 @@ bool __weak bpf_jit_supports_kfunc_call(void) return false; } +bool __weak bpf_jit_supports_stack_args(void) +{ + return false; +} + bool __weak bpf_jit_supports_far_kfunc_call(void) { return false; @@ -3363,6 +3378,12 @@ __weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena) } #ifdef CONFIG_BPF_SYSCALL +__weak bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, + unsigned long fault_ip) +{ + return false; +} + static int __init bpf_global_ma_init(void) { int ret; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index cc0a43ebab6b..dc7b859e8bbf 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -581,6 +581,10 @@ static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj, { struct xdp_frame *nxdpf; + /* Frags live outside the linear frame and cannot be cloned safely. */ + if (unlikely(xdp_frame_has_frags(xdpf))) + return -EOPNOTSUPP; + nxdpf = xdpf_clone(xdpf); if (!nxdpf) return -ENOMEM; @@ -706,6 +710,18 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, if (unlikely(err)) return err; + if (dst->xdp_prog && skb_cloned(skb)) { + struct sk_buff *nskb; + + nskb = skb_copy(skb, GFP_ATOMIC); + if (!nskb) + return -ENOMEM; + + nskb->mac_len = skb->mac_len; + consume_skb(skb); + skb = nskb; + } + /* Redirect has already succeeded semantically at this point, so we just * return 0 even if packet is dropped. Helper below takes care of * freeing skb. 
@@ -726,6 +742,9 @@ static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst, struct sk_buff *nskb; int err; + if (unlikely(skb_is_nonlinear(skb))) + return -EOPNOTSUPP; + nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return -ENOMEM; diff --git a/kernel/bpf/fixups.c b/kernel/bpf/fixups.c index 3692adf62558..3cf2cc6e3ab6 100644 --- a/kernel/bpf/fixups.c +++ b/kernel/bpf/fixups.c @@ -870,7 +870,7 @@ int bpf_convert_ctx_accesses(struct bpf_verifier_env *env) case PTR_TO_BTF_ID: case PTR_TO_BTF_ID | PTR_UNTRUSTED: /* PTR_TO_BTF_ID | MEM_ALLOC always has a valid lifetime, unlike - * PTR_TO_BTF_ID, and an active ref_obj_id, but the same cannot + * PTR_TO_BTF_ID, and an active referenced id, but the same cannot * be said once it is marked PTR_UNTRUSTED, hence we must handle * any faults for loads into such types. BPF_WRITE is disallowed * for this case. @@ -1265,6 +1265,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) prog->aux->real_func_cnt = env->subprog_cnt; prog->aux->bpf_exception_cb = (void *)func[env->exception_callback_subprog]->bpf_func; prog->aux->exception_boundary = func[0]->aux->exception_boundary; + prog->aux->stack_arg_sp_adjust = func[0]->aux->stack_arg_sp_adjust; bpf_prog_jit_attempt_done(prog); return 0; out_free: @@ -1378,9 +1379,21 @@ int bpf_fixup_call_args(struct bpf_verifier_env *env) struct bpf_prog *prog = env->prog; struct bpf_insn *insn = prog->insnsi; bool has_kfunc_call = bpf_prog_has_kfunc_call(prog); - int i, depth; + int depth; #endif - int err = 0; + int i, err = 0; + + for (i = 0; i < env->subprog_cnt; i++) { + struct bpf_subprog_info *subprog = &env->subprog_info[i]; + u16 outgoing = subprog->stack_arg_cnt - bpf_in_stack_arg_cnt(subprog); + + if (subprog->max_out_stack_arg_cnt > outgoing) { + verbose(env, + "func#%d writes %u stack arg slots, but calls only require %u\n", + i, subprog->max_out_stack_arg_cnt, outgoing); + return -EINVAL; + } + } if (env->prog->jit_requested && !bpf_prog_is_offloaded(env->prog->aux)) { @@ 
-1395,6 +1408,12 @@ int bpf_fixup_call_args(struct bpf_verifier_env *env) verbose(env, "calling kernel functions are not allowed in non-JITed programs\n"); return -EINVAL; } + for (i = 0; i < env->subprog_cnt; i++) { + if (bpf_in_stack_arg_cnt(&env->subprog_info[i])) { + verbose(env, "stack args are not supported in non-JITed programs\n"); + return -EINVAL; + } + } if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) { /* When JIT fails the progs with bpf2bpf calls and tail_calls * have to be rejected, since interpreter doesn't support them yet. @@ -2167,6 +2186,8 @@ patch_map_ops_generic: insn->imm == BPF_FUNC_get_func_ret) { if (eatype == BPF_TRACE_FEXIT || eatype == BPF_TRACE_FSESSION || + eatype == BPF_TRACE_FEXIT_MULTI || + eatype == BPF_TRACE_FSESSION_MULTI || eatype == BPF_MODIFY_RETURN) { /* Load nr_args from ctx - 8 */ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3dd9b4924ae4..9f394e1aa2e8 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -9,6 +9,7 @@ #include <linux/rculist_nulls.h> #include <linux/rcupdate_wait.h> #include <linux/random.h> +#include <linux/rhashtable.h> #include <uapi/linux/btf.h> #include <linux/rcupdate_trace.h> #include <linux/btf_ids.h> @@ -242,6 +243,10 @@ static void htab_free_prealloced_fields(struct bpf_htab *htab) if (IS_ERR_OR_NULL(htab->map.record)) return; + /* + * Preallocated maps do not have a bpf_mem_alloc destructor, so fully + * destroy every element, including the extra elements. 
+ */ if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); for (i = 0; i < num_entries; i++) { @@ -496,28 +501,26 @@ static void htab_dtor_ctx_free(void *ctx) kfree(ctx); } -static int htab_set_dtor(struct bpf_htab *htab, void (*dtor)(void *, void *)) +static int bpf_ma_set_dtor(struct bpf_map *map, struct bpf_mem_alloc *ma, + void (*dtor)(void *, void *)) { - u32 key_size = htab->map.key_size; - struct bpf_mem_alloc *ma; struct htab_btf_record *hrec; int err; /* No need for dtors. */ - if (IS_ERR_OR_NULL(htab->map.record)) + if (IS_ERR_OR_NULL(map->record)) return 0; hrec = kzalloc(sizeof(*hrec), GFP_KERNEL); if (!hrec) return -ENOMEM; - hrec->key_size = key_size; - hrec->record = btf_record_dup(htab->map.record); + hrec->key_size = map->key_size; + hrec->record = btf_record_dup(map->record); if (IS_ERR(hrec->record)) { err = PTR_ERR(hrec->record); kfree(hrec); return err; } - ma = htab_is_percpu(htab) ? &htab->pcpu_ma : &htab->ma; bpf_mem_alloc_set_dtor(ma, dtor, htab_dtor_ctx_free, hrec); return 0; } @@ -534,9 +537,9 @@ static int htab_map_check_btf(struct bpf_map *map, const struct btf *btf, * populated in htab_map_alloc(), so it will always appear as NULL. 
*/ if (htab_is_percpu(htab)) - return htab_set_dtor(htab, htab_pcpu_mem_dtor); + return bpf_ma_set_dtor(map, &htab->pcpu_ma, htab_pcpu_mem_dtor); else - return htab_set_dtor(htab, htab_mem_dtor); + return bpf_ma_set_dtor(map, &htab->ma, htab_mem_dtor); } static struct bpf_map *htab_map_alloc(union bpf_attr *attr) @@ -834,8 +837,8 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map, return insn - insn_buf; } -static void check_and_free_fields(struct bpf_htab *htab, - struct htab_elem *elem) +static void check_and_cancel_fields(struct bpf_htab *htab, + struct htab_elem *elem) { if (IS_ERR_OR_NULL(htab->map.record)) return; @@ -845,11 +848,11 @@ static void check_and_free_fields(struct bpf_htab *htab, int cpu; for_each_possible_cpu(cpu) - bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu)); + bpf_obj_cancel_fields(&htab->map, per_cpu_ptr(pptr, cpu)); } else { void *map_value = htab_elem_value(elem, htab->map.key_size); - bpf_obj_free_fields(htab->map.record, map_value); + bpf_obj_cancel_fields(&htab->map, map_value); } } @@ -884,7 +887,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) htab_unlock_bucket(b, flags); if (l == tgt_l) - check_and_free_fields(htab, l); + check_and_cancel_fields(htab, l); return l == tgt_l; } @@ -949,7 +952,7 @@ find_first_elem: static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) { - check_and_free_fields(htab, l); + check_and_cancel_fields(htab, l); if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr); @@ -1002,7 +1005,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) if (htab_is_prealloc(htab)) { bpf_map_dec_elem_count(&htab->map); - check_and_free_fields(htab, l); + check_and_cancel_fields(htab, l); pcpu_freelist_push(&htab->freelist, &l->fnode); } else { dec_elem_count(htab); @@ -1019,7 +1022,7 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, /* copy true value_size bytes 
*/ ptr = this_cpu_ptr(pptr); copy_map_value(&htab->map, ptr, value); - bpf_obj_free_fields(htab->map.record, ptr); + bpf_obj_cancel_fields(&htab->map, ptr); } else { u32 size = round_up(htab->map.value_size, 8); void *val; @@ -1029,7 +1032,7 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, cpu = map_flags >> 32; ptr = per_cpu_ptr(pptr, cpu); copy_map_value(&htab->map, ptr, value); - bpf_obj_free_fields(htab->map.record, ptr); + bpf_obj_cancel_fields(&htab->map, ptr); return; } @@ -1037,7 +1040,7 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, ptr = per_cpu_ptr(pptr, cpu); val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu; copy_map_value(&htab->map, ptr, val); - bpf_obj_free_fields(htab->map.record, ptr); + bpf_obj_cancel_fields(&htab->map, ptr); } } } @@ -1253,11 +1256,11 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, if (l_old) { hlist_nulls_del_rcu(&l_old->hash_node); - /* l_old has already been stashed in htab->extra_elems, free - * its special fields before it is available for reuse. + /* l_old has already been stashed in htab->extra_elems, cancel + * its reusable special fields before it is available for reuse. 
*/ if (htab_is_prealloc(htab)) - check_and_free_fields(htab, l_old); + check_and_cancel_fields(htab, l_old); } htab_unlock_bucket(b, flags); if (l_old && !htab_is_prealloc(htab)) @@ -1270,7 +1273,7 @@ err: static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem) { - check_and_free_fields(htab, elem); + check_and_cancel_fields(htab, elem); bpf_map_dec_elem_count(&htab->map); bpf_lru_push_free(&htab->lru, &elem->lru_node); } @@ -2739,3 +2742,794 @@ const struct bpf_map_ops htab_of_maps_map_ops = { BATCH_OPS(htab), .map_btf_id = &htab_map_btf_ids[0], }; + +struct rhtab_elem { + struct rhash_head node; + /* key bytes, then value bytes follow */ + u8 data[] __aligned(8); +}; + +struct bpf_rhtab { + struct bpf_map map; + struct rhashtable ht; + struct bpf_mem_alloc ma; + u32 elem_size; + bool freeing_internal; +}; + +static const struct rhashtable_params rhtab_params = { + .head_offset = offsetof(struct rhtab_elem, node), + .key_offset = offsetof(struct rhtab_elem, data), +}; + +static inline void *rhtab_elem_value(struct rhtab_elem *l, u32 key_size) +{ + return l->data + round_up(key_size, 8); +} + +/* Specialize hash function and objcmp for long sized key */ +static __always_inline int rhtab_key_cmp_long(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const unsigned long key1 = *(const unsigned long *)arg->key; + const struct rhtab_elem *key2 = ptr; + + return key1 != *(const unsigned long *)key2->data; +} + +static __always_inline u32 rhtab_hashfn_long(const void *data, u32 len, u32 seed) +{ + u64 k = *(const unsigned long *)data; + + return (u32)(k ^ (k >> 32)) ^ seed; +} + +static const struct rhashtable_params rhtab_params_long = { + .head_offset = offsetof(struct rhtab_elem, node), + .key_offset = offsetof(struct rhtab_elem, data), + .key_len = sizeof(long), + .hashfn = rhtab_hashfn_long, + .obj_cmpfn = rhtab_key_cmp_long, +}; + +static struct bpf_map *rhtab_map_alloc(union bpf_attr *attr) +{ + struct rhashtable_params params; + 
struct bpf_rhtab *rhtab; + int err = 0; + + rhtab = bpf_map_area_alloc(sizeof(*rhtab), NUMA_NO_NODE); + if (!rhtab) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&rhtab->map, attr); + + if (rhtab->map.max_entries > 1UL << 31) { + err = -E2BIG; + goto free_rhtab; + } + + rhtab->elem_size = sizeof(struct rhtab_elem) + round_up(rhtab->map.key_size, 8) + + round_up(rhtab->map.value_size, 8); + + params = rhtab_params; + params.key_len = rhtab->map.key_size; + params.nelem_hint = (u32)attr->map_extra; + params.automatic_shrinking = true; + + if (rhtab->map.key_size == sizeof(long)) { + params.hashfn = rhtab_hashfn_long; + params.obj_cmpfn = rhtab_key_cmp_long; + } + + err = rhashtable_init(&rhtab->ht, &params); + if (err) + goto free_rhtab; + + /* Set max_elems after rhashtable_init() since init zeroes the struct */ + rhtab->ht.max_elems = rhtab->map.max_entries; + + err = bpf_mem_alloc_init(&rhtab->ma, rhtab->elem_size, false); + if (err) + goto destroy_rhtab; + + return &rhtab->map; + +destroy_rhtab: + rhashtable_destroy(&rhtab->ht); +free_rhtab: + bpf_map_area_free(rhtab); + return ERR_PTR(err); +} + +static int rhtab_map_alloc_check(union bpf_attr *attr) +{ + if (!(attr->map_flags & BPF_F_NO_PREALLOC)) + return -EINVAL; + + if (attr->map_flags & BPF_F_ZERO_SEED) + return -EINVAL; + + if (attr->key_size > U16_MAX) + return -E2BIG; + + if (attr->map_extra >> 32) + return -EINVAL; + + if ((u32)attr->map_extra > U16_MAX) + return -E2BIG; + + if ((u32)attr->map_extra > attr->max_entries) + return -EINVAL; + + return htab_map_alloc_check(attr); +} + +static void rhtab_check_and_free_fields(struct bpf_rhtab *rhtab, + struct rhtab_elem *elem) +{ + if (IS_ERR_OR_NULL(rhtab->map.record)) + return; + + bpf_obj_free_fields(rhtab->map.record, + rhtab_elem_value(elem, rhtab->map.key_size)); +} + +static void rhtab_mem_dtor(void *obj, void *ctx) +{ + struct htab_btf_record *hrec = ctx; + struct rhtab_elem *elem = obj; + + if (IS_ERR_OR_NULL(hrec->record)) + return; + + 
bpf_obj_free_fields(hrec->record, + rhtab_elem_value(elem, hrec->key_size)); +} + +static void rhtab_free_elem(void *ptr, void *arg) +{ + struct bpf_rhtab *rhtab = arg; + struct rhtab_elem *elem = ptr; + + bpf_map_free_internal_structs(&rhtab->map, rhtab_elem_value(elem, rhtab->map.key_size)); + bpf_mem_cache_free_rcu(&rhtab->ma, elem); +} + +static void rhtab_map_free(struct bpf_map *map) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + + rhashtable_free_and_destroy(&rhtab->ht, rhtab_free_elem, rhtab); + bpf_mem_alloc_destroy(&rhtab->ma); + bpf_map_area_free(rhtab); +} + +static void *rhtab_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + + /* Hold RCU lock in case sleepable program calls via gen_lookup */ + guard(rcu)(); + + if (map->key_size == sizeof(long)) + return rhashtable_lookup_likely(&rhtab->ht, key, rhtab_params_long); + + return rhashtable_lookup_likely(&rhtab->ht, key, rhtab_params); +} + +static void *rhtab_map_lookup_elem(struct bpf_map *map, void *key) __must_hold(RCU) +{ + struct rhtab_elem *l; + + l = rhtab_lookup_elem(map, key); + return l ? rhtab_elem_value(l, map->key_size) : NULL; +} + +static void rhtab_read_elem_value(struct bpf_map *map, void *dst, struct rhtab_elem *elem, + u64 flags) +{ + void *src = rhtab_elem_value(elem, map->key_size); + + if (flags & BPF_F_LOCK) + copy_map_value_locked(map, dst, src, true); + else + copy_map_value(map, dst, src); +} + +static int rhtab_delete_elem(struct bpf_rhtab *rhtab, struct rhtab_elem *elem, void *copy, + u64 flags) +{ + int err; + + /* + * disable_instrumentation() mitigates the deadlock for programs running in NMI context. + * rhashtable locks bucket with local_irq_save(). Only NMI programs may reenter + * rhashtable code, bpf_disable_instrumentation() disables programs running in NMI, except + * raw tracepoints, which we don't have in rhashtable. 
+ */ + bpf_disable_instrumentation(); + + if (rhtab->map.key_size == sizeof(long)) + err = rhashtable_remove_fast(&rhtab->ht, &elem->node, rhtab_params_long); + else + err = rhashtable_remove_fast(&rhtab->ht, &elem->node, rhtab_params); + + bpf_enable_instrumentation(); + + if (err) + return err; + + if (copy) { + rhtab_read_elem_value(&rhtab->map, copy, elem, flags); + check_and_init_map_value(&rhtab->map, copy); + } + /* Release internal structs: kptr, bpf_timer, task_work, wq */ + rhtab_check_and_free_fields(rhtab, elem); + bpf_mem_cache_free_rcu(&rhtab->ma, elem); + return 0; +} + + +static long rhtab_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + struct rhtab_elem *elem; + + guard(rcu)(); + + elem = rhtab_lookup_elem(map, key); + if (!elem) + return -ENOENT; + + return rhtab_delete_elem(rhtab, elem, NULL, 0); +} + +static int rhtab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, void *value, u64 flags) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + struct rhtab_elem *elem; + int err; + + err = bpf_map_check_op_flags(map, flags, BPF_F_LOCK); + if (err) + return err; + + guard(rcu)(); + + elem = rhtab_lookup_elem(map, key); + if (!elem) + return -ENOENT; + + return rhtab_delete_elem(rhtab, elem, value, flags); +} + +static long rhtab_map_update_existing(struct bpf_map *map, struct rhtab_elem *elem, void *value, + u64 map_flags) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + void *old_val = rhtab_elem_value(elem, map->key_size); + + if (map_flags & BPF_NOEXIST) + return -EEXIST; + + if (map_flags & BPF_F_LOCK) + copy_map_value_locked(map, old_val, value, false); + else + copy_map_value(map, old_val, value); + + /* + * Torn reads: a concurrent reader without BPF_F_LOCK may observe + * the value mid-copy. Callers requiring consistent reads must use + * BPF_F_LOCK, matching arraymap semantics. 
+ * + * copy_map_value() skips special-field offsets, so old timers/ + * kptrs/etc. still sit in the slot. Cancel them after the copy + * to match arraymap's update semantics. + */ + rhtab_check_and_free_fields(rhtab, elem); + return 0; +} + +static long rhtab_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + struct rhtab_elem *elem, *tmp; + + if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) + return -EINVAL; + + if ((map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)) + return -EINVAL; + + guard(rcu)(); + elem = rhtab_lookup_elem(map, key); + if (elem) + return rhtab_map_update_existing(map, elem, value, map_flags); + + if (map_flags & BPF_EXIST) + return -ENOENT; + + /* + * Reject new insertions while map_release_uref cleanup walks the + * table. Without this, new elements could keep triggering rehash + * and prevent the walk from terminating. + */ + if (READ_ONCE(rhtab->freeing_internal)) + return -EBUSY; + + /* Check max_entries limit before inserting new element */ + if (atomic_read(&rhtab->ht.nelems) >= map->max_entries) + return -E2BIG; + + elem = bpf_mem_cache_alloc(&rhtab->ma); + if (!elem) + return -ENOMEM; + + memcpy(elem->data, key, map->key_size); + copy_map_value(map, rhtab_elem_value(elem, map->key_size), value); + check_and_init_map_value(map, rhtab_elem_value(elem, map->key_size)); + + /* Prevent deadlock for NMI programs attempting to take bucket lock */ + bpf_disable_instrumentation(); + + if (map->key_size == sizeof(long)) + tmp = rhashtable_lookup_get_insert_fast(&rhtab->ht, &elem->node, rhtab_params_long); + else + tmp = rhashtable_lookup_get_insert_fast(&rhtab->ht, &elem->node, rhtab_params); + + bpf_enable_instrumentation(); + + if (tmp) { + bpf_mem_cache_free(&rhtab->ma, elem); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + + return rhtab_map_update_existing(map, tmp, value, map_flags); + } + + return 0; +} + 
+static int rhtab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + const int ret = BPF_REG_0; + + BUILD_BUG_ON(!__same_type(&rhtab_lookup_elem, + (void *(*)(struct bpf_map *map, void *key)) NULL)); + *insn++ = BPF_EMIT_CALL(rhtab_lookup_elem); + *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); + *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, + offsetof(struct rhtab_elem, data) + round_up(map->key_size, 8)); + + return insn - insn_buf; +} + +static int rhtab_map_check_btf(struct bpf_map *map, const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + + return bpf_ma_set_dtor(map, &rhtab->ma, rhtab_mem_dtor); +} + +static void rhtab_map_free_internal_structs(struct bpf_map *map) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + struct rhashtable_iter iter; + struct rhtab_elem *elem; + + if (!bpf_map_has_internal_structs(map)) + return; + + /* + * Block new insertions. Once observed, no new growth is triggered, + * so any in-flight rehash will drain and the walker is guaranteed + * to stop returning -EAGAIN. Treat -EAGAIN as "rehash in progress, + * retry"; do not wait for the worker. 
+ */ + WRITE_ONCE(rhtab->freeing_internal, true); + + rhashtable_walk_enter(&rhtab->ht, &iter); + rhashtable_walk_start(&iter); + + while ((elem = rhashtable_walk_next(&iter))) { + if (IS_ERR(elem)) { + if (PTR_ERR(elem) == -EAGAIN) + continue; + break; + } + + bpf_map_free_internal_structs(map, rhtab_elem_value(elem, map->key_size)); + + if (need_resched()) { /* Avoid stalls on large maps */ + rhashtable_walk_stop(&iter); + cond_resched(); + rhashtable_walk_start(&iter); + } + } + + rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); + WRITE_ONCE(rhtab->freeing_internal, false); +} + +static int rhtab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) + __must_hold_shared(RCU) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + struct rhtab_elem *elem; + + elem = rhashtable_next_key(&rhtab->ht, key); + + /* if not found, return the first key */ + if (PTR_ERR(elem) == -ENOENT) + elem = rhashtable_next_key(&rhtab->ht, NULL); + + if (IS_ERR(elem)) + return PTR_ERR(elem); + if (!elem) + return -ENOENT; + + memcpy(next_key, elem->data, map->key_size); + return 0; +} + +static void rhtab_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) +{ + void *value; + + /* Guarantee that hashtab value is not freed */ + guard(rcu)(); + + value = rhtab_map_lookup_elem(map, key); + if (!value) + return; + + btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); + seq_puts(m, ": "); + btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); + seq_putc(m, '\n'); +} + +static long bpf_each_rhash_elem(struct bpf_map *map, bpf_callback_t callback_fn, + void *callback_ctx, u64 flags) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + void *prev_key = NULL; + struct rhtab_elem *elem; + int num_elems = 0; + u64 ret = 0; + + cant_migrate(); + + if (flags != 0) + return -EINVAL; + + rcu_read_lock(); + /* + * Best-effort iteration: if rhashtable is concurrently resized or + * elements are 
deleted/inserted, there may be missed or duplicate + * elements visited. + */ + while ((elem = rhashtable_next_key(&rhtab->ht, prev_key))) { + if (IS_ERR(elem)) + break; + num_elems++; + ret = callback_fn((u64)(long)map, + (u64)(long)elem->data, + (u64)(long)rhtab_elem_value(elem, map->key_size), + (u64)(long)callback_ctx, 0); + if (ret) + break; + + prev_key = elem->data; /* valid while RCU held */ + } + rcu_read_unlock(); + + return num_elems; +} + +static u64 rhtab_map_mem_usage(const struct bpf_map *map) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + u64 num_entries; + + /* Excludes rhashtable bucket overhead (~ nelems * sizeof(void *) at 75% load). */ + num_entries = atomic_read(&rhtab->ht.nelems); + return sizeof(struct bpf_rhtab) + rhtab->elem_size * num_entries; +} + +static int __rhtab_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr, + bool do_delete) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + void __user *uvalues = u64_to_user_ptr(attr->batch.values); + void __user *ukeys = u64_to_user_ptr(attr->batch.keys); + void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); + void *cursor = NULL, *keys = NULL, *values = NULL, *dst_key, *dst_val; + struct rhtab_elem **del_elems = NULL; + u32 max_count, total, key_size, value_size, i; + bool has_next_cursor = false; + struct rhtab_elem *elem; + u64 elem_map_flags, map_flags; + int ret = 0; + + elem_map_flags = attr->batch.elem_flags; + ret = bpf_map_check_op_flags(map, elem_map_flags, BPF_F_LOCK); + if (ret) + return ret; + + map_flags = attr->batch.flags; + if (map_flags) + return -EINVAL; + + max_count = attr->batch.count; + if (!max_count) + return 0; + + if (put_user(0, &uattr->batch.count)) + return -EFAULT; + + key_size = map->key_size; + value_size = map->value_size; + + keys = kvmalloc_array(max_count, key_size, GFP_USER | __GFP_NOWARN); + values = kvmalloc_array(max_count, 
value_size, GFP_USER | __GFP_NOWARN); + if (do_delete) + del_elems = kvmalloc_array(max_count, sizeof(void *), + GFP_USER | __GFP_NOWARN); + cursor = kmalloc(key_size, GFP_USER | __GFP_NOWARN); + + if (!keys || !values || !cursor || (do_delete && !del_elems)) { + ret = -ENOMEM; + goto free; + } + + if (ubatch && copy_from_user(cursor, ubatch, key_size)) { + ret = -EFAULT; + goto free; + } + + dst_key = keys; + dst_val = values; + total = 0; + + rcu_read_lock(); + + /* + * Cursor stores the key of the next-to-process element (stashed by + * the previous batch). Look it up directly so the element is included + * here rather than skipped by next_key(). If the cursor was deleted + * concurrently (or by the previous do_delete batch), return -EAGAIN + * so userspace can distinguish a lost cursor from end-of-iteration + * (-ENOENT) and restart from a NULL cursor. + */ + if (ubatch) { + elem = rhtab_lookup_elem(map, cursor); + if (!elem) { + rcu_read_unlock(); + ret = -EAGAIN; + goto free; + } + } else { + elem = rhashtable_next_key(&rhtab->ht, NULL); + } + + while (elem && !IS_ERR(elem) && total < max_count) { + memcpy(dst_key, elem->data, key_size); + rhtab_read_elem_value(map, dst_val, elem, elem_map_flags); + check_and_init_map_value(map, dst_val); + + if (do_delete) + del_elems[total] = elem; + + elem = rhashtable_next_key(&rhtab->ht, dst_key); + dst_key += key_size; + dst_val += value_size; + total++; + + /* Bail to userspace to avoid stalls. */ + if (need_resched()) + break; + } + + if (elem && !IS_ERR(elem)) { + /* Stash next-to-process key as cursor for the next batch. */ + memcpy(cursor, elem->data, key_size); + has_next_cursor = true; + } + + if (do_delete) { + for (i = 0; i < total; i++) + rhtab_delete_elem(rhtab, del_elems[i], NULL, 0); + } + + rcu_read_unlock(); + + if (total == 0) { + ret = -ENOENT; + goto free; + } + + /* No more elements after this batch. 
*/ + if (!has_next_cursor) + ret = -ENOENT; + + if (copy_to_user(ukeys, keys, (size_t)total * key_size) || + copy_to_user(uvalues, values, (size_t)total * value_size) || + put_user(total, &uattr->batch.count) || + (has_next_cursor && + copy_to_user(u64_to_user_ptr(attr->batch.out_batch), + cursor, key_size))) { + ret = -EFAULT; + goto free; + } + +free: + kfree(cursor); + kvfree(keys); + kvfree(values); + kvfree(del_elems); + return ret; +} + +static int rhtab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __rhtab_map_lookup_and_delete_batch(map, attr, uattr, false); +} + +static int rhtab_map_lookup_and_delete_batch(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __rhtab_map_lookup_and_delete_batch(map, attr, uattr, true); +} + +struct bpf_iter_seq_rhash_map_info { + struct bpf_map *map; + struct bpf_rhtab *rhtab; + struct rhashtable_iter iter; +}; + +static void *bpf_rhash_map_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + struct bpf_iter_seq_rhash_map_info *info = seq->private; + struct rhtab_elem *elem; + + rhashtable_walk_start(&info->iter); + /* + * Re-deliver the element returned by walk_next() at the end of the + * previous read() — bpf_seq_read may have stopped before show() + * consumed it. Rehash rewinds the walker; retry on -EAGAIN. + */ + do { + elem = rhashtable_walk_peek(&info->iter); + } while (PTR_ERR(elem) == -EAGAIN); + + if (IS_ERR(elem)) + return NULL; + + if (elem && *pos == 0) + ++*pos; + return elem; +} + +static void *bpf_rhash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_rhash_map_info *info = seq->private; + struct rhtab_elem *elem; + + ++*pos; + + /* Rehash rewinds the walker; retry until it stops returning -EAGAIN. 
*/ + do { + elem = rhashtable_walk_next(&info->iter); + } while (PTR_ERR(elem) == -EAGAIN); + + if (IS_ERR(elem)) + return NULL; + return elem; +} + +static int __bpf_rhash_map_seq_show(struct seq_file *seq, + struct rhtab_elem *elem) +{ + struct bpf_iter_seq_rhash_map_info *info = seq->private; + struct bpf_iter__bpf_map_elem ctx = {}; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, elem == NULL); + if (prog) { + ctx.meta = &meta; + ctx.map = info->map; + if (elem) { + ctx.key = elem->data; + ctx.value = rhtab_elem_value(elem, info->map->key_size); + } + ret = bpf_iter_run_prog(prog, &ctx); + } + + return ret; +} + +static int bpf_rhash_map_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_rhash_map_seq_show(seq, v); +} + +static void bpf_rhash_map_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + struct bpf_iter_seq_rhash_map_info *info = seq->private; + + if (!v) + (void)__bpf_rhash_map_seq_show(seq, NULL); + + rhashtable_walk_stop(&info->iter); +} + +static int bpf_iter_init_rhash_map(void *priv_data, struct bpf_iter_aux_info *aux) +{ + struct bpf_iter_seq_rhash_map_info *info = priv_data; + struct bpf_map *map = aux->map; + + bpf_map_inc_with_uref(map); + info->map = map; + info->rhtab = container_of(map, struct bpf_rhtab, map); + rhashtable_walk_enter(&info->rhtab->ht, &info->iter); + return 0; +} + +static void bpf_iter_fini_rhash_map(void *priv_data) +{ + struct bpf_iter_seq_rhash_map_info *info = priv_data; + + rhashtable_walk_exit(&info->iter); + bpf_map_put_with_uref(info->map); +} + +static const struct seq_operations bpf_rhash_map_seq_ops = { + .start = bpf_rhash_map_seq_start, + .next = bpf_rhash_map_seq_next, + .stop = bpf_rhash_map_seq_stop, + .show = bpf_rhash_map_seq_show, +}; + +static const struct bpf_iter_seq_info rhash_iter_seq_info = { + .seq_ops = &bpf_rhash_map_seq_ops, + .init_seq_private = bpf_iter_init_rhash_map, + .fini_seq_private = 
bpf_iter_fini_rhash_map, + .seq_priv_size = sizeof(struct bpf_iter_seq_rhash_map_info), +}; + +BTF_ID_LIST_SINGLE(rhtab_map_btf_ids, struct, bpf_rhtab) +const struct bpf_map_ops rhtab_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc_check = rhtab_map_alloc_check, + .map_alloc = rhtab_map_alloc, + .map_free = rhtab_map_free, + .map_get_next_key = rhtab_map_get_next_key, + .map_release_uref = rhtab_map_free_internal_structs, + .map_check_btf = rhtab_map_check_btf, + .map_lookup_elem = rhtab_map_lookup_elem, + .map_lookup_and_delete_elem = rhtab_map_lookup_and_delete_elem, + .map_update_elem = rhtab_map_update_elem, + .map_delete_elem = rhtab_map_delete_elem, + .map_gen_lookup = rhtab_map_gen_lookup, + .map_seq_show_elem = rhtab_map_seq_show_elem, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_each_rhash_elem, + .map_mem_usage = rhtab_map_mem_usage, + BATCH_OPS(rhtab), + .map_btf_id = &rhtab_map_btf_ids[0], + .iter_seq_info = &rhash_iter_seq_info, +}; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b5314c9fed3c..8e196c9b7c50 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1944,7 +1944,7 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, + .arg3_type = ARG_PTR_TO_DYNPTR, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -2001,7 +2001,7 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = { .func = bpf_dynptr_write, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, + .arg1_type = ARG_PTR_TO_DYNPTR, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, @@ -2044,7 +2044,7 @@ static const struct bpf_func_proto bpf_dynptr_data_proto = { .func = bpf_dynptr_data, .gpl_only = false, .ret_type = 
RET_PTR_TO_DYNPTR_MEM_OR_NULL, - .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, + .arg1_type = ARG_PTR_TO_DYNPTR, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, }; @@ -2247,10 +2247,11 @@ EXPORT_SYMBOL_GPL(bpf_base_func_proto); void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock) { - struct list_head *head = list_head, *orig_head = list_head; + struct list_head *head = list_head, drain, *pos, *n; BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head)); BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head)); + INIT_LIST_HEAD(&drain); /* Do the actual list draining outside the lock to not hold the lock for * too long, and also prevent deadlocks if tracing programs end up @@ -2261,20 +2262,30 @@ void bpf_list_head_free(const struct btf_field *field, void *list_head, __bpf_spin_lock_irqsave(spin_lock); if (!head->next || list_empty(head)) goto unlock; - head = head->next; + list_for_each_safe(pos, n, head) { + struct bpf_list_node_kern *node; + + node = container_of(pos, struct bpf_list_node_kern, list_head); + WRITE_ONCE(node->owner, BPF_PTR_POISON); + list_move_tail(pos, &drain); + } unlock: - INIT_LIST_HEAD(orig_head); + INIT_LIST_HEAD(head); __bpf_spin_unlock_irqrestore(spin_lock); - while (head != orig_head) { - void *obj = head; + while (!list_empty(&drain)) { + struct bpf_list_node_kern *node; - obj -= field->graph_root.node_offset; - head = head->next; + pos = drain.next; + node = container_of(pos, struct bpf_list_node_kern, list_head); + list_del_init(pos); + /* Ensure __bpf_list_add() sees the node as unlinked. */ + smp_store_release(&node->owner, NULL); /* The contained type can also have resources, including a * bpf_list_head which needs to be freed. 
*/ - __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); + __bpf_obj_drop_impl((char *)pos - field->graph_root.node_offset, + field->graph_root.value_rec, false); } } @@ -2295,6 +2306,7 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root, struct bpf_spin_lock *spin_lock) { struct rb_root_cached orig_root, *root = rb_root; + struct bpf_rb_node_kern *node; struct rb_node *pos, *n; void *obj; @@ -2303,14 +2315,20 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root, __bpf_spin_lock_irqsave(spin_lock); orig_root = *root; + bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) { + node = rb_entry(pos, struct bpf_rb_node_kern, rb_node); + WRITE_ONCE(node->owner, BPF_PTR_POISON); + } *root = RB_ROOT_CACHED; __bpf_spin_unlock_irqrestore(spin_lock); bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) { obj = pos; obj -= field->graph_root.node_offset; - - + node = rb_entry(pos, struct bpf_rb_node_kern, rb_node); + RB_CLEAR_NODE(pos); + /* Ensure __bpf_rbtree_add() sees the node as unlinked. */ + smp_store_release(&node->owner, NULL); __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); } } @@ -2467,9 +2485,11 @@ __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta static int __bpf_list_add(struct bpf_list_node_kern *node, struct bpf_list_head *head, - bool tail, struct btf_record *rec, u64 off) + struct list_head **prev_ptr, + struct btf_record *rec, u64 off) { struct list_head *n = &node->list_head, *h = (void *)head; + struct list_head *prev; /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't * called on its fields, so init here @@ -2477,19 +2497,31 @@ static int __bpf_list_add(struct bpf_list_node_kern *node, if (unlikely(!h->next)) INIT_LIST_HEAD(h); + prev = *prev_ptr; + + /* When prev is not the list head, it must be a node in this list. 
*/ + if (prev != h) { + struct bpf_list_node_kern *prev_kn = + container_of(prev, struct bpf_list_node_kern, list_head); + + if (unlikely(READ_ONCE(prev_kn->owner) != head)) + goto fail; + } + /* node->owner != NULL implies !list_empty(n), no need to separately * check the latter */ - if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) { - /* Only called from BPF prog, no need to migrate_disable */ - __bpf_obj_drop_impl((void *)n - off, rec, false); - return -EINVAL; - } + if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) + goto fail; - tail ? list_add_tail(n, h) : list_add(n, h); + list_add(n, prev); WRITE_ONCE(node->owner, head); - return 0; + +fail: + /* Only called from BPF prog, no need to migrate_disable */ + __bpf_obj_drop_impl((void *)n - off, rec, false); + return -EINVAL; } /** @@ -2510,8 +2542,9 @@ __bpf_kfunc int bpf_list_push_front(struct bpf_list_head *head, u64 off) { struct bpf_list_node_kern *n = (void *)node; + struct list_head *h = (void *)head; - return __bpf_list_add(n, head, false, meta ? meta->record : NULL, off); + return __bpf_list_add(n, head, &h, meta ? meta->record : NULL, off); } __bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head, @@ -2539,8 +2572,9 @@ __bpf_kfunc int bpf_list_push_back(struct bpf_list_head *head, u64 off) { struct bpf_list_node_kern *n = (void *)node; + struct list_head *h = (void *)head; - return __bpf_list_add(n, head, true, meta ? meta->record : NULL, off); + return __bpf_list_add(n, head, &h->prev, meta ? 
meta->record : NULL, off); } __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head, @@ -2550,37 +2584,63 @@ __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head, return bpf_list_push_back(head, node, meta__ign, off); } -static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail) +__bpf_kfunc int bpf_list_add(struct bpf_list_head *head, struct bpf_list_node *new, + struct bpf_list_node *prev__nonown_allowed, + struct btf_struct_meta *meta, u64 off) +{ + struct bpf_list_node_kern *n = (void *)new, *p = (void *)prev__nonown_allowed; + struct list_head *prev_ptr = &p->list_head; + + return __bpf_list_add(n, head, &prev_ptr, meta ? meta->record : NULL, off); +} + +static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, + struct list_head *n) { - struct list_head *n, *h = (void *)head; + struct list_head *h = (void *)head; struct bpf_list_node_kern *node; /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't * called on its fields, so init here */ - if (unlikely(!h->next)) + if (unlikely(!h->next)) { INIT_LIST_HEAD(h); + return NULL; + } if (list_empty(h)) return NULL; - n = tail ? h->prev : h->next; node = container_of(n, struct bpf_list_node_kern, list_head); - if (WARN_ON_ONCE(READ_ONCE(node->owner) != head)) + if (unlikely(READ_ONCE(node->owner) != head)) return NULL; list_del_init(n); - WRITE_ONCE(node->owner, NULL); + /* Ensure __bpf_list_add() sees the node as unlinked. 
*/ + smp_store_release(&node->owner, NULL); return (struct bpf_list_node *)n; } __bpf_kfunc struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) { - return __bpf_list_del(head, false); + struct list_head *h = (void *)head; + + return __bpf_list_del(head, h->next); } __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) { - return __bpf_list_del(head, true); + struct list_head *h = (void *)head; + + return __bpf_list_del(head, h->prev); +} + +__bpf_kfunc struct bpf_list_node *bpf_list_del(struct bpf_list_head *head, + struct bpf_list_node *node__nonown_allowed) +{ + struct bpf_list_node_kern *kn = (void *)node__nonown_allowed; + + /* verifier guarantees node is a list node rather than list head */ + return __bpf_list_del(head, &kn->list_head); } __bpf_kfunc struct bpf_list_node *bpf_list_front(struct bpf_list_head *head) @@ -2603,6 +2663,43 @@ __bpf_kfunc struct bpf_list_node *bpf_list_back(struct bpf_list_head *head) return (struct bpf_list_node *)h->prev; } +__bpf_kfunc bool bpf_list_is_first(struct bpf_list_head *head, + struct bpf_list_node *node__nonown_allowed) +{ + struct list_head *h = (struct list_head *)head; + struct bpf_list_node_kern *kn = (struct bpf_list_node_kern *)node__nonown_allowed; + + if (READ_ONCE(kn->owner) != head) + return false; + + return list_is_first(&kn->list_head, h); +} + +__bpf_kfunc bool bpf_list_is_last(struct bpf_list_head *head, + struct bpf_list_node *node__nonown_allowed) +{ + struct list_head *h = (struct list_head *)head; + struct bpf_list_node_kern *kn = (struct bpf_list_node_kern *)node__nonown_allowed; + + if (READ_ONCE(kn->owner) != head) + return false; + + return list_is_last(&kn->list_head, h); +} + +__bpf_kfunc bool bpf_list_empty(struct bpf_list_head *head) +{ + struct list_head *h = (struct list_head *)head; + + /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't + * called on its fields, so init here + */ + if (unlikely(!h->next)) + INIT_LIST_HEAD(h); + + 
return list_empty(h); +} + __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, struct bpf_rb_node *node) { @@ -2912,11 +3009,13 @@ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid) { struct task_struct *p; - rcu_read_lock(); + guard(rcu)(); + if (!task_active_pid_ns(current)) + return NULL; + p = find_task_by_vpid(vpid); if (p) p = bpf_task_acquire(p); - rcu_read_unlock(); return p; } @@ -3072,7 +3171,7 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, return bpf_dynptr_slice(p, offset, buffer__nullable, buffer__szk); } -__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end) +__bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr *p, u64 start, u64 end) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; u64 size; @@ -3093,14 +3192,14 @@ __bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end __bpf_kfunc bool bpf_dynptr_is_null(const struct bpf_dynptr *p) { - struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; + const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; return !ptr->data; } __bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p) { - struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; + const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; if (!ptr->data) return false; @@ -3110,7 +3209,7 @@ __bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p) __bpf_kfunc u64 bpf_dynptr_size(const struct bpf_dynptr *p) { - struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; + const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; if (!ptr->data) return -EINVAL; @@ -3122,7 +3221,7 @@ __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p, struct bpf_dynptr *clone__uninit) { struct bpf_dynptr_kern *clone = (struct bpf_dynptr_kern *)clone__uninit; - struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; + const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern 
*)p; if (!ptr->data) { bpf_dynptr_set_null(clone); @@ -3145,11 +3244,11 @@ __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p, * Copies data from source dynptr to destination dynptr. * Returns 0 on success; negative error, otherwise. */ -__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u64 dst_off, - struct bpf_dynptr *src_ptr, u64 src_off, u64 size) +__bpf_kfunc int bpf_dynptr_copy(const struct bpf_dynptr *dst_ptr, u64 dst_off, + const struct bpf_dynptr *src_ptr, u64 src_off, u64 size) { - struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr; - struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr; + const struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr; + const struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr; void *src_slice, *dst_slice; char buf[256]; u64 off; @@ -3200,9 +3299,9 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u64 dst_off, * at @offset with the constant byte @val. * Returns 0 on success; negative error, otherwise. */ -__bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u64 offset, u64 size, u8 val) +__bpf_kfunc int bpf_dynptr_memset(const struct bpf_dynptr *p, u64 offset, u64 size, u8 val) { - struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; + const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; u64 chunk_sz, write_off; char buf[256]; void* slice; @@ -3301,7 +3400,7 @@ __bpf_kfunc void bpf_throw(u64 cookie) * which skips compiler generated instrumentation to do the same. */ kasan_unpoison_task_stack_below((void *)(long)ctx.sp); - ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp, 0, 0); + ctx.aux->bpf_exception_cb(cookie, ctx.sp + ctx.aux->stack_arg_sp_adjust, ctx.bp, 0, 0); WARN(1, "A call to BPF exception callback should never return\n"); } @@ -4214,13 +4313,13 @@ __bpf_kfunc void bpf_key_put(struct bpf_key *bkey) * * Return: 0 on success, a negative value on error. 
*/ -__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, - struct bpf_dynptr *sig_p, +__bpf_kfunc int bpf_verify_pkcs7_signature(const struct bpf_dynptr *data_p, + const struct bpf_dynptr *sig_p, struct bpf_key *trusted_keyring) { #ifdef CONFIG_SYSTEM_DATA_VERIFICATION - struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p; - struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p; + const struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p; + const struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p; const void *data, *sig; u32 data_len, sig_len; int ret; @@ -4718,10 +4817,15 @@ BTF_ID_FLAGS(func, bpf_list_push_front, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_list_push_front_impl) BTF_ID_FLAGS(func, bpf_list_push_back, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_list_push_back_impl) +BTF_ID_FLAGS(func, bpf_list_add, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_list_del, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_front, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_list_is_first) +BTF_ID_FLAGS(func, bpf_list_is_last) +BTF_ID_FLAGS(func, bpf_list_empty) BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL) @@ -4862,7 +4966,7 @@ BTF_ID_FLAGS(func, bpf_stream_print_stack, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_dynptr_from_file) -BTF_ID_FLAGS(func, bpf_dynptr_file_discard) +BTF_ID_FLAGS(func, bpf_dynptr_file_discard, KF_RELEASE) BTF_ID_FLAGS(func, bpf_timer_cancel_async) BTF_KFUNCS_END(common_btf_ids) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 
25c06a011825..7837968c0842 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -21,6 +21,9 @@ #include <linux/bpf.h> #include <linux/bpf_trace.h> #include <linux/kstrtox.h> +#include <linux/xattr.h> +#include <linux/security.h> + #include "preload/bpf_preload.h" enum bpf_type { @@ -30,6 +33,23 @@ enum bpf_type { BPF_TYPE_LINK, }; +struct bpf_fs_inode { + struct list_head xattrs; + struct simple_xattr_limits xlimits; + struct inode vfs_inode; +}; + +static inline struct bpf_fs_inode *BPF_FS_I(struct inode *inode) +{ + return container_of(inode, struct bpf_fs_inode, vfs_inode); +} + +static struct kmem_cache *bpf_fs_inode_cachep __ro_after_init; + +static int bpf_fs_initxattrs(struct inode *inode, + const struct xattr *xattr_array, void *fs_info); +static ssize_t bpf_fs_listxattr(struct dentry *dentry, char *buf, size_t size); + static void *bpf_any_get(void *raw, enum bpf_type type) { switch (type) { @@ -94,10 +114,17 @@ static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type) } static const struct inode_operations bpf_dir_iops; +static const struct inode_operations bpf_symlink_iops; -static const struct inode_operations bpf_prog_iops = { }; -static const struct inode_operations bpf_map_iops = { }; -static const struct inode_operations bpf_link_iops = { }; +static const struct inode_operations bpf_prog_iops = { + .listxattr = bpf_fs_listxattr, +}; +static const struct inode_operations bpf_map_iops = { + .listxattr = bpf_fs_listxattr, +}; +static const struct inode_operations bpf_link_iops = { + .listxattr = bpf_fs_listxattr, +}; struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir, @@ -153,11 +180,19 @@ static struct dentry *bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; + int ret; inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR); if (IS_ERR(inode)) return ERR_CAST(inode); + ret = security_inode_init_security(inode, dir, &dentry->d_name, + bpf_fs_initxattrs, 
NULL); + if (ret && ret != -EOPNOTSUPP) { + iput(inode); + return ERR_PTR(ret); + } + inode->i_op = &bpf_dir_iops; inode->i_fop = &simple_dir_operations; @@ -330,10 +365,20 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, const struct file_operations *fops) { struct inode *dir = dentry->d_parent->d_inode; - struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode); + struct inode *inode; + int ret; + + inode = bpf_get_inode(dir->i_sb, dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); + ret = security_inode_init_security(inode, dir, &dentry->d_name, + bpf_fs_initxattrs, NULL); + if (ret && ret != -EOPNOTSUPP) { + iput(inode); + return ret; + } + inode->i_op = iops; inode->i_fop = fops; inode->i_private = raw; @@ -382,9 +427,11 @@ bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) static int bpf_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *target) { - char *link = kstrdup(target, GFP_USER | __GFP_NOWARN); struct inode *inode; + char *link; + int ret; + link = kstrdup(target, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!link) return -ENOMEM; @@ -394,13 +441,25 @@ static int bpf_symlink(struct mnt_idmap *idmap, struct inode *dir, return PTR_ERR(inode); } - inode->i_op = &simple_symlink_inode_operations; + inode->i_op = &bpf_symlink_iops; inode->i_link = link; + ret = security_inode_init_security(inode, dir, &dentry->d_name, + bpf_fs_initxattrs, NULL); + if (ret && ret != -EOPNOTSUPP) { + iput(inode); + return ret; + } + bpf_dentry_finalize(dentry, inode, dir); return 0; } +static const struct inode_operations bpf_symlink_iops = { + .get_link = simple_get_link, + .listxattr = bpf_fs_listxattr, +}; + static const struct inode_operations bpf_dir_iops = { .lookup = bpf_lookup, .mkdir = bpf_mkdir, @@ -409,6 +468,7 @@ static const struct inode_operations bpf_dir_iops = { .rename = simple_rename, .link = simple_link, .unlink = simple_unlink, + .listxattr = bpf_fs_listxattr, }; /* pin 
iterator link into bpffs */ @@ -762,22 +822,151 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root) return 0; } +static struct inode *bpf_fs_alloc_inode(struct super_block *sb) +{ + struct bpf_fs_inode *bi; + + bi = alloc_inode_sb(sb, bpf_fs_inode_cachep, GFP_KERNEL); + if (!bi) + return NULL; + INIT_LIST_HEAD_RCU(&bi->xattrs); + simple_xattr_limits_init(&bi->xlimits); + return &bi->vfs_inode; +} + static void bpf_destroy_inode(struct inode *inode) { + struct bpf_mount_opts *opts = inode->i_sb->s_fs_info; + struct bpf_fs_inode *bi = BPF_FS_I(inode); enum bpf_type type; - if (S_ISLNK(inode->i_mode)) - kfree(inode->i_link); if (!bpf_inode_type(inode, &type)) bpf_any_put(inode->i_private, type); - free_inode_nonrcu(inode); + simple_xattrs_free(&opts->xa_cache, &bi->xattrs, NULL); +} + +/* + * Called after RCU grace period - safe to free inode and anything + * that might be accessed by RCU pathwalk (inode fields, i_link). + */ +static void bpf_free_inode(struct inode *inode) +{ + if (S_ISLNK(inode->i_mode)) + kfree(inode->i_link); + kmem_cache_free(bpf_fs_inode_cachep, BPF_FS_I(inode)); +} + +static int bpf_fs_xattr_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *value, size_t size) +{ + struct bpf_mount_opts *opts = inode->i_sb->s_fs_info; + struct bpf_fs_inode *bi = BPF_FS_I(inode); + + name = xattr_full_name(handler, name); + return simple_xattr_get(&opts->xa_cache, &bi->xattrs, name, value, size); +} + +enum { + BPF_FS_XATTR_UNSPEC, + BPF_FS_XATTR_SECURITY, + BPF_FS_XATTR_TRUSTED, +}; + +static int bpf_fs_xattr_set(const struct xattr_handler *handler, + struct mnt_idmap *idmap, struct dentry *unused, + struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + struct bpf_mount_opts *opts = inode->i_sb->s_fs_info; + struct bpf_fs_inode *bi = BPF_FS_I(inode); + struct simple_xattr *old; + int err = -EINVAL; + + name = xattr_full_name(handler, name); + 
switch (handler->flags) { + case BPF_FS_XATTR_SECURITY: + err = simple_xattr_set_limited(&opts->xa_cache, &bi->xattrs, + &bi->xlimits, name, value, size, + flags); + break; + case BPF_FS_XATTR_TRUSTED: + old = simple_xattr_set(&opts->xa_cache, &bi->xattrs, name, + value, size, flags); + err = IS_ERR(old) ? PTR_ERR(old) : 0; + if (!err) + simple_xattr_free_rcu(old); + break; + } + if (err) + return err; + inode_set_ctime_current(inode); + return 0; +} + +static const struct xattr_handler bpf_fs_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .flags = BPF_FS_XATTR_TRUSTED, + .get = bpf_fs_xattr_get, + .set = bpf_fs_xattr_set, +}; + +static const struct xattr_handler bpf_fs_security_xattr_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = BPF_FS_XATTR_SECURITY, + .get = bpf_fs_xattr_get, + .set = bpf_fs_xattr_set, +}; + +static const struct xattr_handler * const bpf_fs_xattr_handlers[] = { + &bpf_fs_trusted_xattr_handler, + &bpf_fs_security_xattr_handler, + NULL, +}; + +static ssize_t bpf_fs_listxattr(struct dentry *dentry, char *buf, size_t size) +{ + struct inode *inode = d_inode(dentry); + + return simple_xattr_list(inode, &BPF_FS_I(inode)->xattrs, buf, size); +} + +static int bpf_fs_initxattrs(struct inode *inode, + const struct xattr *xattr_array, void *fs_info) +{ + struct bpf_mount_opts *opts = inode->i_sb->s_fs_info; + struct bpf_fs_inode *bi = BPF_FS_I(inode); + const struct xattr *xattr; + int err; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + CLASS(simple_xattr, new_xattr)(xattr->value, xattr->value_len); + if (IS_ERR(new_xattr)) + return PTR_ERR(new_xattr); + + new_xattr->name = kasprintf(GFP_KERNEL_ACCOUNT, + XATTR_SECURITY_PREFIX "%s", + xattr->name); + if (!new_xattr->name) + return -ENOMEM; + + err = simple_xattr_add_limited(&opts->xa_cache, &bi->xattrs, + &bi->xlimits, new_xattr); + if (err) + return err; + + retain_and_null_ptr(new_xattr); + } + return 0; } const struct super_operations bpf_super_ops = { .statfs 
= simple_statfs, .drop_inode = inode_just_drop, .show_options = bpf_show_options, + .alloc_inode = bpf_fs_alloc_inode, .destroy_inode = bpf_destroy_inode, + .free_inode = bpf_free_inode, }; enum { @@ -996,25 +1185,38 @@ out: static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) { - static const struct tree_descr bpf_rfiles[] = { { "" } }; struct bpf_mount_opts *opts = sb->s_fs_info; struct inode *inode; - int ret; /* Mounting an instance of BPF FS requires privileges */ if (fc->user_ns != &init_user_ns && !capable(CAP_SYS_ADMIN)) return -EPERM; - ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles); - if (ret) - return ret; - + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; + sb->s_magic = BPF_FS_MAGIC; sb->s_op = &bpf_super_ops; + sb->s_xattr = bpf_fs_xattr_handlers; + sb->s_iflags |= SB_I_NOEXEC; + sb->s_iflags |= SB_I_NODEV; + sb->s_time_gran = 1; - inode = sb->s_root->d_inode; + inode = bpf_get_inode(sb, NULL, S_IFDIR | 0777); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_ino = 1; + inode->i_op = &bpf_dir_iops; + inode->i_fop = &simple_dir_operations; + set_nlink(inode, 2); + + sb->s_root = d_make_root(inode); + if (!sb->s_root) + return -ENOMEM; + + inode = d_inode(sb->s_root); inode->i_uid = opts->uid; inode->i_gid = opts->gid; - inode->i_op = &bpf_dir_iops; inode->i_mode &= ~S_IALLUGO; populate_bpffs(sb->s_root); inode->i_mode |= S_ISVTX | opts->mode; @@ -1068,6 +1270,7 @@ static void bpf_kill_super(struct super_block *sb) struct bpf_mount_opts *opts = sb->s_fs_info; kill_anon_super(sb); + simple_xattr_cache_cleanup(&opts->xa_cache); kfree(opts); } @@ -1080,18 +1283,37 @@ static struct file_system_type bpf_fs_type = { .fs_flags = FS_USERNS_MOUNT, }; +static void bpf_fs_inode_init_once(void *foo) +{ + struct bpf_fs_inode *bi = foo; + + inode_init_once(&bi->vfs_inode); +} + static int __init bpf_init(void) { int ret; + bpf_fs_inode_cachep = kmem_cache_create("bpf_fs_inode_cache", + sizeof(struct 
bpf_fs_inode), + 0, SLAB_ACCOUNT, + bpf_fs_inode_init_once); + if (!bpf_fs_inode_cachep) + return -ENOMEM; + ret = sysfs_create_mount_point(fs_kobj, "bpf"); if (ret) - return ret; + goto out_cache; ret = register_filesystem(&bpf_fs_type); - if (ret) + if (ret) { sysfs_remove_mount_point(fs_kobj, "bpf"); + goto out_cache; + } + return 0; +out_cache: + kmem_cache_destroy(bpf_fs_inode_cachep); return ret; } fs_initcall(bpf_init); diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c index 58197d73b120..0aadfbae0acc 100644 --- a/kernel/bpf/liveness.c +++ b/kernel/bpf/liveness.c @@ -610,6 +610,21 @@ enum arg_track_state { /* Track callee stack slots fp-8 through fp-512 (64 slots of 8 bytes each) */ #define MAX_ARG_SPILL_SLOTS 64 +/* + * Combined register + stack arg tracking: R0-R10 at indices 0-10, + * outgoing stack arg slots at indices MAX_BPF_REG..MAX_BPF_REG+6. + */ +#define MAX_AT_TRACK_REGS (MAX_BPF_REG + MAX_STACK_ARG_SLOTS) + +static int stack_arg_off_to_slot(s16 off) +{ + int aoff = off < 0 ? 
-off : off; + + if (aoff / 8 > MAX_STACK_ARG_SLOTS) + return -1; + return aoff / 8 - 1; +} + static bool arg_is_visited(const struct arg_track *at) { return at->frame != ARG_UNVISITED; @@ -791,7 +806,9 @@ static bool arg_track_join(struct bpf_verifier_env *env, int idx, int target, in return true; verbose(env, "arg JOIN insn %d -> %d ", idx, target); - if (r >= 0) + if (r >= MAX_BPF_REG) + verbose(env, "sa%d: ", r - MAX_BPF_REG); + else if (r >= 0) verbose(env, "r%d: ", r); else verbose(env, "fp%+d: ", r * 8); @@ -1032,6 +1049,21 @@ static void arg_track_log(struct bpf_verifier_env *env, struct bpf_insn *insn, i verbose(env, "\tr%d: ", i); verbose_arg_track(env, &at_in[i]); verbose(env, " -> "); verbose_arg_track(env, &at_out[i]); } + /* Log outgoing stack arg slot transitions at indices MAX_BPF_REG..MAX_AT_TRACK_REGS-1 */ + for (i = 0; i < MAX_STACK_ARG_SLOTS; i++) { + int ai = MAX_BPF_REG + i; + + if (arg_track_eq(&at_out[ai], &at_in[ai])) + continue; + if (!printed) { + verbose(env, "%3d: ", idx); + bpf_verbose_insn(env, insn); + bpf_vlog_reset(&env->log, env->log.end_pos - 1); + printed = true; + } + verbose(env, "\tsa%d: ", i); verbose_arg_track(env, &at_in[ai]); + verbose(env, " -> "); verbose_arg_track(env, &at_out[ai]); + } for (i = 0; i < MAX_ARG_SPILL_SLOTS; i++) { if (arg_track_eq(&at_stack_out[i], &at_stack_in[i])) continue; @@ -1062,6 +1094,7 @@ static bool can_be_local_fp(int depth, int regno, struct arg_track *at) static void arg_track_xfer(struct bpf_verifier_env *env, struct bpf_insn *insn, int insn_idx, struct arg_track *at_out, struct arg_track *at_stack_out, + const struct arg_track *at_stack_arg_entry, struct func_instance *instance, u32 *callsites) { @@ -1071,9 +1104,21 @@ static void arg_track_xfer(struct bpf_verifier_env *env, struct bpf_insn *insn, struct arg_track *dst = &at_out[insn->dst_reg]; struct arg_track *src = &at_out[insn->src_reg]; struct arg_track none = { .frame = ARG_NONE }; - int r; - - if (class == BPF_ALU64 && 
BPF_SRC(insn->code) == BPF_K) { + int r, slot; + + /* Handle stack arg stores and loads. */ + if (is_stack_arg_st(insn) || is_stack_arg_stx(insn)) { + slot = stack_arg_off_to_slot(insn->off); + if (slot >= 0) { + if (is_stack_arg_stx(insn)) + at_out[MAX_BPF_REG + slot] = at_out[insn->src_reg]; + else + at_out[MAX_BPF_REG + slot] = none; + } + } else if (is_stack_arg_ldx(insn)) { + slot = stack_arg_off_to_slot(insn->off); + at_out[insn->dst_reg] = (slot >= 0) ? at_stack_arg_entry[slot] : none; + } else if (class == BPF_ALU64 && BPF_SRC(insn->code) == BPF_K) { if (code == BPF_MOV) { *dst = none; } else if (dst->frame >= 0) { @@ -1297,6 +1342,16 @@ static int record_load_store_access(struct bpf_verifier_env *env, struct arg_track resolved, *ptr; int oi; + /* + * Stack arg insns use dst_reg/src_reg=BPF_REG_PARAMS(11). Since at[] + * is extended to MAX_AT_TRACK_REGS, at[11] holds the arg_track for + * outgoing stack arg slot 0 — not the pointer used for the memory + * access. Skip so the slot's tracked value isn't confused with the + * base register that record_stack_access() expects. 
+ */ + if (is_stack_arg_stx(insn) || is_stack_arg_st(insn) || is_stack_arg_ldx(insn)) + return 0; + switch (class) { case BPF_LDX: ptr = &at[insn->src_reg]; @@ -1343,6 +1398,42 @@ static int record_load_store_access(struct bpf_verifier_env *env, return 0; } +static int record_arg_access(struct bpf_verifier_env *env, + struct func_instance *instance, + struct bpf_insn *insn, + struct arg_track *at, int arg_idx, + int insn_idx) +{ + int depth = instance->depth; + int frame = at->frame; + int err = 0; + s64 bytes; + + if (!arg_is_fp(at)) + return 0; + + if (bpf_helper_call(insn)) { + bytes = bpf_helper_stack_access_bytes(env, insn, arg_idx, insn_idx); + } else if (bpf_pseudo_kfunc_call(insn)) { + bytes = bpf_kfunc_stack_access_bytes(env, insn, arg_idx, insn_idx); + } else { + for (int f = 0; f <= depth; f++) { + err = mark_stack_read(instance, f, insn_idx, SPIS_ALL); + if (err) + return err; + } + return 0; + } + if (bytes == 0) + return 0; + + if (frame >= 0 && frame <= depth) + err = record_stack_access(instance, at, bytes, frame, insn_idx); + else if (frame == ARG_IMPRECISE) + err = record_imprecise(instance, at->mask, insn_idx); + return err; +} + /* Record stack access for a given 'at' state of helper/kfunc 'insn' */ static int record_call_access(struct bpf_verifier_env *env, struct func_instance *instance, @@ -1350,9 +1441,8 @@ static int record_call_access(struct bpf_verifier_env *env, int insn_idx) { struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; - int depth = instance->depth; struct bpf_call_summary cs; - int r, err = 0, num_params = 5; + int r, err, num_params = 5; if (bpf_pseudo_call(insn)) return 0; @@ -1360,32 +1450,15 @@ static int record_call_access(struct bpf_verifier_env *env, if (bpf_get_call_summary(env, insn, &cs)) num_params = cs.num_params; - for (r = BPF_REG_1; r < BPF_REG_1 + num_params; r++) { - int frame = at[r].frame; - s64 bytes; - - if (!arg_is_fp(&at[r])) - continue; - - if (bpf_helper_call(insn)) { - bytes = 
bpf_helper_stack_access_bytes(env, insn, r - 1, insn_idx); - } else if (bpf_pseudo_kfunc_call(insn)) { - bytes = bpf_kfunc_stack_access_bytes(env, insn, r - 1, insn_idx); - } else { - for (int f = 0; f <= depth; f++) { - err = mark_stack_read(instance, f, insn_idx, SPIS_ALL); - if (err) - return err; - } - return 0; - } - if (bytes == 0) - continue; + for (r = BPF_REG_1; r < BPF_REG_1 + min(num_params, MAX_BPF_FUNC_REG_ARGS); r++) { + err = record_arg_access(env, instance, insn, &at[r], r - 1, insn_idx); + if (err) + return err; + } - if (frame >= 0 && frame <= depth) - err = record_stack_access(instance, &at[r], bytes, frame, insn_idx); - else if (frame == ARG_IMPRECISE) - err = record_imprecise(instance, at[r].mask, insn_idx); + for (r = 0; r < MAX_STACK_ARG_SLOTS && r < num_params - MAX_BPF_FUNC_REG_ARGS; r++) { + err = record_arg_access(env, instance, insn, &at[MAX_BPF_REG + r], + r + MAX_BPF_FUNC_REG_ARGS, insn_idx); if (err) return err; } @@ -1445,7 +1518,7 @@ static int find_callback_subprog(struct bpf_verifier_env *env, /* Per-subprog intermediate state kept alive across analysis phases */ struct subprog_at_info { - struct arg_track (*at_in)[MAX_BPF_REG]; + struct arg_track (*at_in)[MAX_AT_TRACK_REGS]; int len; }; @@ -1479,6 +1552,9 @@ static void print_subprog_arg_access(struct bpf_verifier_env *env, for (r = 0; r < MAX_BPF_REG - 1; r++) if (arg_is_fp(&info->at_in[i][r])) has_extra = true; + for (r = 0; r < MAX_STACK_ARG_SLOTS; r++) + if (arg_is_fp(&info->at_in[i][MAX_BPF_REG + r])) + has_extra = true; } if (is_ldx_stx_call) { for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++) @@ -1503,6 +1579,12 @@ static void print_subprog_arg_access(struct bpf_verifier_env *env, verbose(env, " r%d=", r); verbose_arg_track(env, &info->at_in[i][r]); } + for (r = 0; r < MAX_STACK_ARG_SLOTS; r++) { + if (!arg_is_fp(&info->at_in[i][MAX_BPF_REG + r])) + continue; + verbose(env, " sa%d=", r); + verbose_arg_track(env, &info->at_in[i][MAX_BPF_REG + r]); + } } if (is_ldx_stx_call) { @@ 
-1525,7 +1607,7 @@ static void print_subprog_arg_access(struct bpf_verifier_env *env, * Runs forward fixed-point with arg_track_xfer(), then records * memory accesses in a single linear pass over converged state. * - * @callee_entry: pre-populated entry state for R1-R5 + * @callee_entry: pre-populated entry state for R1-R5 and stack args * NULL for main (subprog 0). * @info: stores at_in, len for debug printing. */ @@ -1543,10 +1625,11 @@ static int compute_subprog_args(struct bpf_verifier_env *env, int end = env->subprog_info[subprog + 1].start; int po_end = env->subprog_info[subprog + 1].postorder_start; int len = end - start; - struct arg_track (*at_in)[MAX_BPF_REG] = NULL; - struct arg_track at_out[MAX_BPF_REG]; + struct arg_track (*at_in)[MAX_AT_TRACK_REGS] = NULL; + struct arg_track at_out[MAX_AT_TRACK_REGS]; struct arg_track (*at_stack_in)[MAX_ARG_SPILL_SLOTS] = NULL; struct arg_track *at_stack_out = NULL; + struct arg_track at_stack_arg_entry[MAX_STACK_ARG_SLOTS]; struct arg_track unvisited = { .frame = ARG_UNVISITED }; struct arg_track none = { .frame = ARG_NONE }; bool changed; @@ -1565,13 +1648,13 @@ static int compute_subprog_args(struct bpf_verifier_env *env, goto err_free; for (i = 0; i < len; i++) { - for (r = 0; r < MAX_BPF_REG; r++) + for (r = 0; r < MAX_AT_TRACK_REGS; r++) at_in[i][r] = unvisited; for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++) at_stack_in[i][r] = unvisited; } - for (r = 0; r < MAX_BPF_REG; r++) + for (r = 0; r < MAX_AT_TRACK_REGS; r++) at_in[0][r] = none; /* Entry: R10 is always precisely the current frame's FP */ @@ -1587,6 +1670,10 @@ static int compute_subprog_args(struct bpf_verifier_env *env, for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++) at_stack_in[0][r] = none; + /* Entry: incoming stack args from caller, or ARG_NONE for main */ + for (r = 0; r < MAX_STACK_ARG_SLOTS; r++) + at_stack_arg_entry[r] = callee_entry ? 
callee_entry[MAX_BPF_REG + r] : none; + if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "subprog#%d: analyzing (depth %d)...\n", subprog, depth); @@ -1605,7 +1692,8 @@ redo: memcpy(at_out, at_in[i], sizeof(at_out)); memcpy(at_stack_out, at_stack_in[i], MAX_ARG_SPILL_SLOTS * sizeof(*at_stack_out)); - arg_track_xfer(env, insn, idx, at_out, at_stack_out, instance, callsites); + arg_track_xfer(env, insn, idx, at_out, at_stack_out, + at_stack_arg_entry, instance, callsites); arg_track_log(env, insn, idx, at_in[i], at_stack_in[i], at_out, at_stack_out); /* Propagate to successors within this subprogram */ @@ -1619,7 +1707,7 @@ redo: continue; ti = target - start; - for (r = 0; r < MAX_BPF_REG; r++) + for (r = 0; r < MAX_AT_TRACK_REGS; r++) changed |= arg_track_join(env, idx, target, r, &at_in[ti][r], at_out[r]); @@ -1674,11 +1762,14 @@ err_free: return err; } -/* Return true if any of R1-R5 is derived from a frame pointer. */ +/* Return true if any of R1-R5 or stack args is derived from a frame pointer. 
*/ static bool has_fp_args(struct arg_track *args) { for (int r = BPF_REG_1; r <= BPF_REG_5; r++) - if (args[r].frame != ARG_NONE) + if (arg_is_fp(&args[r])) + return true; + for (int r = 0; r < MAX_STACK_ARG_SLOTS; r++) + if (arg_is_fp(&args[MAX_BPF_REG + r])) return true; return false; } @@ -1803,7 +1894,7 @@ static int analyze_subprog(struct bpf_verifier_env *env, /* For each reachable call site in the subprog, recurse into callees */ for (int p = po_start; p < po_end; p++) { int idx = env->cfg.insn_postorder[p]; - struct arg_track callee_args[BPF_REG_5 + 1]; + struct arg_track callee_args[MAX_AT_TRACK_REGS] = {}; struct arg_track none = { .frame = ARG_NONE }; struct bpf_insn *insn = &insns[idx]; struct func_instance *callee_instance; @@ -1818,9 +1909,11 @@ static int analyze_subprog(struct bpf_verifier_env *env, if (callee < 0) continue; - /* Build entry args: R1-R5 from at_in at call site */ + /* Build entry args: R1-R5 and stack args from at_in at call site */ for (int r = BPF_REG_1; r <= BPF_REG_5; r++) callee_args[r] = info[subprog].at_in[j][r]; + for (int r = 0; r < MAX_STACK_ARG_SLOTS; r++) + callee_args[MAX_BPF_REG + r] = info[subprog].at_in[j][MAX_BPF_REG + r]; } else if (bpf_calls_callback(env, idx)) { callee = find_callback_subprog(env, insn, idx, &caller_reg, &cb_callee_reg); if (callee == -2) { @@ -1842,6 +1935,8 @@ static int analyze_subprog(struct bpf_verifier_env *env, for (int r = BPF_REG_1; r <= BPF_REG_5; r++) callee_args[r] = none; + for (int r = 0; r < MAX_STACK_ARG_SLOTS; r++) + callee_args[MAX_BPF_REG + r] = none; callee_args[cb_callee_reg] = info[subprog].at_in[j][caller_reg]; } else { continue; @@ -2085,7 +2180,7 @@ static void compute_insn_live_regs(struct bpf_verifier_env *env, def = ALL_CALLER_SAVED_REGS; use = def & ~BIT(BPF_REG_0); if (bpf_get_call_summary(env, insn, &cs)) - use = GENMASK(cs.num_params, 1); + use = GENMASK(min_t(u8, cs.num_params, MAX_BPF_FUNC_REG_ARGS), 1); break; default: def = 0; diff --git a/kernel/bpf/log.c 
b/kernel/bpf/log.c index 011e4ec25acd..b740fa73ee26 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -13,17 +13,17 @@ #define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args) -static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log) +static bool bpf_verifier_log_attr_valid(u32 log_level, char __user *log_buf, u32 log_size) { /* ubuf and len_total should both be specified (or not) together */ - if (!!log->ubuf != !!log->len_total) + if (!!log_buf != !!log_size) return false; /* log buf without log_level is meaningless */ - if (log->ubuf && log->level == 0) + if (log_buf && log_level == 0) return false; - if (log->level & ~BPF_LOG_MASK) + if (log_level & ~BPF_LOG_MASK) return false; - if (log->len_total > UINT_MAX >> 2) + if (log_size > UINT_MAX >> 2) return false; return true; } @@ -36,7 +36,7 @@ int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level, log->len_total = log_size; /* log attributes have to be sane */ - if (!bpf_verifier_log_attr_valid(log)) + if (!bpf_verifier_log_attr_valid(log_level, log_buf, log_size)) return -EINVAL; return 0; @@ -571,20 +571,20 @@ static void print_scalar_ranges(struct bpf_verifier_env *env, u64 val; bool omit; } minmaxs[] = { - {"smin", reg->smin_value, reg->smin_value == S64_MIN}, - {"smax", reg->smax_value, reg->smax_value == S64_MAX}, - {"umin", reg->umin_value, reg->umin_value == 0}, - {"umax", reg->umax_value, reg->umax_value == U64_MAX}, + {"smin", reg_smin(reg), reg_smin(reg) == S64_MIN}, + {"smax", reg_smax(reg), reg_smax(reg) == S64_MAX}, + {"umin", reg_umin(reg), reg_umin(reg) == 0}, + {"umax", reg_umax(reg), reg_umax(reg) == U64_MAX}, {"smin32", - is_snum_decimal((s64)reg->s32_min_value) - ? (s64)reg->s32_min_value - : (u32)reg->s32_min_value, reg->s32_min_value == S32_MIN}, + is_snum_decimal((s64)reg_s32_min(reg)) + ? (s64)reg_s32_min(reg) + : (u32)reg_s32_min(reg), reg_s32_min(reg) == S32_MIN}, {"smax32", - is_snum_decimal((s64)reg->s32_max_value) - ? 
(s64)reg->s32_max_value - : (u32)reg->s32_max_value, reg->s32_max_value == S32_MAX}, - {"umin32", reg->u32_min_value, reg->u32_min_value == 0}, - {"umax32", reg->u32_max_value, reg->u32_max_value == U32_MAX}, + is_snum_decimal((s64)reg_s32_max(reg)) + ? (s64)reg_s32_max(reg) + : (u32)reg_s32_max(reg), reg_s32_max(reg) == S32_MAX}, + {"umin32", reg_u32_min(reg), reg_u32_min(reg) == 0}, + {"umax32", reg_u32_max(reg), reg_u32_max(reg) == U32_MAX}, }, *m1, *m2, *mend = &minmaxs[ARRAY_SIZE(minmaxs)]; bool neg1, neg2; @@ -665,8 +665,8 @@ static void print_reg_state(struct bpf_verifier_env *env, verbose_a("id=%d", reg->id & ~BPF_ADD_CONST); if (reg->id & BPF_ADD_CONST) verbose(env, "%+d", reg->delta); - if (reg->ref_obj_id) - verbose_a("ref_obj_id=%d", reg->ref_obj_id); + if (reg->parent_id) + verbose_a("parent_id=%d", reg->parent_id); if (type_is_non_owning_ref(reg->type)) verbose_a("%s", "non_own_ref"); if (type_is_map_ptr(t)) { @@ -768,21 +768,19 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie verbose(env, "=dynptr_%s(", dynptr_type_str(reg->dynptr.type)); if (reg->id) verbose_a("id=%d", reg->id); - if (reg->ref_obj_id) - verbose_a("ref_id=%d", reg->ref_obj_id); - if (reg->dynptr_id) - verbose_a("dynptr_id=%d", reg->dynptr_id); + if (reg->parent_id) + verbose_a("parent_id=%d", reg->parent_id); verbose(env, ")"); break; case STACK_ITER: - /* only main slot has ref_obj_id set; skip others */ - if (!reg->ref_obj_id) + /* only main slot has id set; skip others */ + if (!reg->id) continue; - verbose(env, " fp%d=iter_%s(ref_id=%d,state=%s,depth=%u)", + verbose(env, " fp%d=iter_%s(id=%d,state=%s,depth=%u)", (-i - 1) * BPF_REG_SIZE, iter_type_str(reg->iter.btf, reg->iter.btf_id), - reg->ref_obj_id, iter_state_str(reg->iter.state), + reg->id, iter_state_str(reg->iter.state), reg->iter.depth); break; case STACK_MISC: @@ -825,3 +823,81 @@ void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_st } 
print_verifier_state(env, vstate, frameno, false); } + +int bpf_log_attr_init(struct bpf_log_attr *log, u64 log_buf, u32 log_size, u32 log_level, + u32 offsetof_log_true_size, bpfptr_t uattr, struct bpf_common_attr *common, + bpfptr_t uattr_common, u32 size_common) +{ + char __user *ubuf_common = u64_to_user_ptr(common->log_buf); + char __user *ubuf = u64_to_user_ptr(log_buf); + + if (!bpf_verifier_log_attr_valid(common->log_level, ubuf_common, common->log_size) || + !bpf_verifier_log_attr_valid(log_level, ubuf, log_size)) + return -EINVAL; + + if (ubuf && ubuf_common && (ubuf != ubuf_common || log_size != common->log_size || + log_level != common->log_level)) + return -EINVAL; + + memset(log, 0, sizeof(*log)); + log->ubuf = ubuf; + log->size = log_size; + log->level = log_level; + log->offsetof_true_size = offsetof_log_true_size; + log->uattr = uattr; + + if (!ubuf && ubuf_common) { + log->ubuf = ubuf_common; + log->size = common->log_size; + log->level = common->log_level; + log->uattr = uattr_common; + log->offsetof_true_size = 0; + if (size_common >= offsetofend(struct bpf_common_attr, log_true_size)) + log->offsetof_true_size = offsetof(struct bpf_common_attr, log_true_size); + } + return 0; +} + +struct bpf_verifier_log *bpf_log_attr_create_vlog(struct bpf_log_attr *attr_log, + struct bpf_common_attr *common, bpfptr_t uattr, + u32 size) +{ + struct bpf_verifier_log *log; + int err; + + memset(attr_log, 0, sizeof(*attr_log)); + attr_log->uattr = uattr; + if (size >= offsetofend(struct bpf_common_attr, log_true_size)) + attr_log->offsetof_true_size = offsetof(struct bpf_common_attr, log_true_size); + + if (!size) + return NULL; + + log = kzalloc_obj(*log, GFP_KERNEL); + if (!log) + return ERR_PTR(-ENOMEM); + + err = bpf_vlog_init(log, common->log_level, u64_to_user_ptr(common->log_buf), + common->log_size); + if (err) { + kfree(log); + return ERR_PTR(err); + } + + return log; +} + +int bpf_log_attr_finalize(struct bpf_log_attr *attr, struct bpf_verifier_log 
*log) +{ + u32 log_true_size; + int err; + + err = bpf_vlog_finalize(log, &log_true_size); + + if (attr->offsetof_true_size && + copy_to_bpfptr_offset(attr->uattr, attr->offsetof_true_size, &log_true_size, + sizeof(log_true_size))) + return -EFAULT; + + return err; +} diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 0f57608b385d..4d6f25db9ba1 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -246,7 +246,7 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key) /* Start walking the trie from the root node ... */ - for (node = rcu_dereference_check(trie->root, rcu_read_lock_bh_held()); + for (node = rcu_dereference_check(trie->root, bpf_rcu_lock_held()); node;) { unsigned int next_bit; size_t matchlen; @@ -280,7 +280,7 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key) */ next_bit = extract_bit(key->data, node->prefixlen); node = rcu_dereference_check(node->child[next_bit], - rcu_read_lock_bh_held()); + bpf_rcu_lock_held()); } if (!found) @@ -359,7 +359,7 @@ static long trie_update_elem(struct bpf_map *map, */ slot = &trie->root; - while ((node = rcu_dereference(*slot))) { + while ((node = rcu_dereference_protected(*slot, 1))) { matchlen = longest_prefix_match(trie, node, key); if (node->prefixlen != matchlen || @@ -482,7 +482,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) trim = &trie->root; trim2 = trim; parent = NULL; - while ((node = rcu_dereference(*trim))) { + while ((node = rcu_dereference_protected(*trim, 1))) { matchlen = longest_prefix_match(trie, node, key); if (node->prefixlen != matchlen || diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 645bd30bc9a9..d2cbab4bdf64 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -20,7 +20,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) /* Does not support >1 level map-in-map */ if (inner_map->inner_map_meta) return ERR_PTR(-EINVAL); - + if (inner_map->excl_prog_sha) + return 
ERR_PTR(-ENOTSUPP); if (!inner_map->ops->map_meta_equal) return ERR_PTR(-ENOTSUPP); @@ -101,6 +102,8 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map, inner_map = __bpf_map_get(f); if (IS_ERR(inner_map)) return inner_map; + if (inner_map->excl_prog_sha) + return ERR_PTR(-ENOTSUPP); inner_map_meta = map->inner_map_meta; if (inner_map_meta->ops->map_meta_equal(inner_map_meta, inner_map)) diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 261a03ea73d3..c19b360bad9e 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -112,6 +112,10 @@ static int bpf_iter_attach_map(struct bpf_prog *prog, map = bpf_map_get_with_uref(linfo->map.map_fd); if (IS_ERR(map)) return PTR_ERR(map); + if (map->excl_prog_sha) { + err = -EPERM; + goto put_map; + } if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || @@ -119,7 +123,8 @@ static int bpf_iter_attach_map(struct bpf_prog *prog, is_percpu = true; else if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && - map->map_type != BPF_MAP_TYPE_ARRAY) + map->map_type != BPF_MAP_TYPE_ARRAY && + map->map_type != BPF_MAP_TYPE_RHASH) goto put_map; key_acc_size = prog->aux->max_rdonly_access; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index da3d328f5c15..77ba03216c09 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -9,6 +9,7 @@ #include <linux/perf_event.h> #include <linux/btf_ids.h> #include <linux/buildid.h> +#include <linux/mmap_lock.h> #include "percpu_freelist.h" #include "mmap_unlock_work.h" @@ -152,6 +153,180 @@ static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, b : build_id_parse_nofault(vma, build_id, NULL); } +static inline void stack_map_build_id_set_ip(struct bpf_stack_build_id *id) +{ + id->status = BPF_STACK_BUILD_ID_IP; + memset(id->build_id, 0, BUILD_ID_SIZE_MAX); +} + +static inline u64 stack_map_build_id_offset(unsigned long vm_pgoff, + unsigned long vm_start, u64 ip) +{ + 
return (vm_pgoff << PAGE_SHIFT) + ip - vm_start; +} + +static inline void stack_map_build_id_set_valid(struct bpf_stack_build_id *id, + u64 offset, + const unsigned char *build_id) +{ + id->status = BPF_STACK_BUILD_ID_VALID; + id->offset = offset; + if (id->build_id != build_id) + memcpy(id->build_id, build_id, BUILD_ID_SIZE_MAX); +} + +struct stack_map_vma_lock { + struct vm_area_struct *vma; + struct mm_struct *mm; +}; + +/* + * Acquire a stable read-side reference on the VMA covering @ip. + * + * With CONFIG_PER_VMA_LOCK=y this returns a VMA with its per-VMA read + * lock held and mmap_lock dropped, so the caller may sleep. + * + * With CONFIG_PER_VMA_LOCK=n it returns a VMA with mmap_lock still + * held; the caller must snapshot any fields it needs and pin vm_file + * with get_file() before stack_map_unlock_vma() drops mmap_lock, as + * the VMA may be split, merged, or freed after that. + * + * Returns NULL on failure, in which case no lock is held. + */ +static struct vm_area_struct * +stack_map_lock_vma(struct stack_map_vma_lock *lock, unsigned long ip) +{ + struct mm_struct *mm = lock->mm; + struct vm_area_struct *vma; + + /* noop under !CONFIG_PER_VMA_LOCK */ + vma = lock_vma_under_rcu(mm, ip); + if (vma) { + lock->vma = vma; + return vma; + } + + /* + * Taking mmap_read_lock() is unsafe here, because the caller BPF + * program might already hold it, causing a deadlock. 
+ */ + if (!mmap_read_trylock(mm)) + return NULL; + + vma = vma_lookup(mm, ip); + if (!vma) { + mmap_read_unlock(mm); + return NULL; + } + +#ifdef CONFIG_PER_VMA_LOCK + if (!vma_start_read_locked(vma)) { + mmap_read_unlock(mm); + return NULL; + } + mmap_read_unlock(mm); +#endif + + lock->vma = vma; + return vma; +} + +static void stack_map_unlock_vma(struct stack_map_vma_lock *lock) +{ +#ifdef CONFIG_PER_VMA_LOCK + vma_end_read(lock->vma); +#else + mmap_read_unlock(lock->mm); +#endif + lock->vma = NULL; +} + +static void stack_map_get_build_id_offset_sleepable(struct bpf_stack_build_id *id_offs, + u32 trace_nr) +{ + struct mm_struct *mm = current->mm; + struct stack_map_vma_lock lock = { .mm = mm }; + struct { + struct file *file; + const unsigned char *build_id; + unsigned long vm_start; + unsigned long vm_end; + unsigned long vm_pgoff; + } cache = {}; + unsigned long vm_pgoff, vm_start, vm_end; + struct vm_area_struct *vma; + struct file *file; + u64 offset; + u64 ip; + + for (u32 i = 0; i < trace_nr; i++) { + ip = READ_ONCE(id_offs[i].ip); + + /* + * Range cache fast path: if ip falls within the previously + * resolved VMA range, reuse the cache build_id without + * re-acquiring the VMA lock. + */ + if (cache.build_id && ip >= cache.vm_start && ip < cache.vm_end) { + offset = stack_map_build_id_offset(cache.vm_pgoff, cache.vm_start, ip); + stack_map_build_id_set_valid(&id_offs[i], offset, cache.build_id); + continue; + } + + vma = stack_map_lock_vma(&lock, ip); + if (!vma) { + stack_map_build_id_set_ip(&id_offs[i]); + continue; + } + if (vma_is_anonymous(vma) || !vma->vm_file) { + stack_map_build_id_set_ip(&id_offs[i]); + stack_map_unlock_vma(&lock); + continue; + } + + file = vma->vm_file; + vm_pgoff = vma->vm_pgoff; + vm_start = vma->vm_start; + vm_end = vma->vm_end; + offset = stack_map_build_id_offset(vm_pgoff, vm_start, ip); + + /* + * Same backing file as previous (e.g. different VMAs + * of the same ELF binary). Reuse the cache build_id. 
+ */ + if (file == cache.file) { + stack_map_unlock_vma(&lock); + stack_map_build_id_set_valid(&id_offs[i], offset, cache.build_id); + cache.vm_start = vm_start; + cache.vm_end = vm_end; + cache.vm_pgoff = vm_pgoff; + continue; + } + + file = get_file(file); + stack_map_unlock_vma(&lock); + + /* build_id_parse_file() may block on filesystem reads */ + if (build_id_parse_file(file, id_offs[i].build_id, NULL)) { + stack_map_build_id_set_ip(&id_offs[i]); + fput(file); + continue; + } + + stack_map_build_id_set_valid(&id_offs[i], offset, id_offs[i].build_id); + if (cache.file) + fput(cache.file); + cache.file = file; + cache.build_id = id_offs[i].build_id; + cache.vm_start = vm_start; + cache.vm_end = vm_end; + cache.vm_pgoff = vm_pgoff; + } + + if (cache.file) + fput(cache.file); +} + /* * Expects all id_offs[i].ip values to be set to correct initial IPs. * They will be subsequently: @@ -165,44 +340,50 @@ static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, b static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, u32 trace_nr, bool user, bool may_fault) { - int i; struct mmap_unlock_irq_work *work = NULL; bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work); + bool has_user_ctx = user && current && current->mm; struct vm_area_struct *vma, *prev_vma = NULL; - const char *prev_build_id; + const unsigned char *prev_build_id = NULL; + int i; + + if (may_fault && has_user_ctx) { + stack_map_get_build_id_offset_sleepable(id_offs, trace_nr); + return; + } /* If the irq_work is in use, fall back to report ips. Same * fallback is used for kernel stack (!user) on a stackmap with * build_id. 
*/ - if (!user || !current || !current->mm || irq_work_busy || - !mmap_read_trylock(current->mm)) { + if (!has_user_ctx || irq_work_busy || !mmap_read_trylock(current->mm)) { /* cannot access current->mm, fall back to ips */ - for (i = 0; i < trace_nr; i++) { - id_offs[i].status = BPF_STACK_BUILD_ID_IP; - memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); - } + for (i = 0; i < trace_nr; i++) + stack_map_build_id_set_ip(&id_offs[i]); return; } for (i = 0; i < trace_nr; i++) { u64 ip = READ_ONCE(id_offs[i].ip); + u64 offset; - if (range_in_vma(prev_vma, ip, ip)) { + if (prev_build_id && range_in_vma(prev_vma, ip, ip)) { vma = prev_vma; - memcpy(id_offs[i].build_id, prev_build_id, BUILD_ID_SIZE_MAX); - goto build_id_valid; + offset = stack_map_build_id_offset(vma->vm_pgoff, vma->vm_start, ip); + stack_map_build_id_set_valid(&id_offs[i], offset, prev_build_id); + continue; } vma = find_vma(current->mm, ip); - if (!vma || fetch_build_id(vma, id_offs[i].build_id, may_fault)) { + if (!vma || vma_is_anonymous(vma) || + fetch_build_id(vma, id_offs[i].build_id, may_fault)) { /* per entry fall back to ips */ - id_offs[i].status = BPF_STACK_BUILD_ID_IP; - memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); + stack_map_build_id_set_ip(&id_offs[i]); + prev_vma = vma; + prev_build_id = NULL; continue; } -build_id_valid: - id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ip - vma->vm_start; - id_offs[i].status = BPF_STACK_BUILD_ID_VALID; + offset = stack_map_build_id_offset(vma->vm_pgoff, vma->vm_start, ip); + stack_map_build_id_set_valid(&id_offs[i], offset, id_offs[i].build_id); prev_vma = vma; prev_build_id = id_offs[i].build_id; } diff --git a/kernel/bpf/states.c b/kernel/bpf/states.c index 8478d2c6ed5b..32f346ce3ffc 100644 --- a/kernel/bpf/states.c +++ b/kernel/bpf/states.c @@ -2,6 +2,7 @@ /* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ #include <linux/bpf.h> #include <linux/bpf_verifier.h> +#include <linux/cnum.h> #include <linux/filter.h> #define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args) @@ -301,14 +302,8 @@ int bpf_update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_s static bool range_within(const struct bpf_reg_state *old, const struct bpf_reg_state *cur) { - return old->umin_value <= cur->umin_value && - old->umax_value >= cur->umax_value && - old->smin_value <= cur->smin_value && - old->smax_value >= cur->smax_value && - old->u32_min_value <= cur->u32_min_value && - old->u32_max_value >= cur->u32_max_value && - old->s32_min_value <= cur->s32_min_value && - old->s32_max_value >= cur->s32_max_value; + return cnum64_is_subset(old->r64, cur->r64) && + cnum32_is_subset(old->r32, cur->r32); } /* If in the old state two registers had the same id, then they need to have @@ -348,8 +343,12 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) return true; } - /* We ran out of idmap slots, which should be impossible */ - WARN_ON_ONCE(1); + /* + * idmap slots are bounded by the number of registers and stack slots. + * Referenced dynptrs acquire intermediate references that do + * not live in either, so the map can be exhausted. Since it is unlikely, + * fail the verification by treating the states as not equivalent. 
+ */ return false; } @@ -494,7 +493,7 @@ static bool regs_exact(const struct bpf_reg_state *rold, { return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && check_ids(rold->id, rcur->id, idmap) && - check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap); + check_ids(rold->parent_id, rcur->parent_id, idmap); } enum exact_level { @@ -619,7 +618,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off) && check_ids(rold->id, rcur->id, idmap) && - check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap); + check_ids(rold->parent_id, rcur->parent_id, idmap); case PTR_TO_PACKET_META: case PTR_TO_PACKET: /* We must have at least as much range as the old ptr @@ -799,7 +798,8 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, cur_reg = &cur->stack[spi].spilled_ptr; if (old_reg->dynptr.type != cur_reg->dynptr.type || old_reg->dynptr.first_slot != cur_reg->dynptr.first_slot || - !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) + !check_ids(old_reg->id, cur_reg->id, idmap) || + !check_ids(old_reg->parent_id, cur_reg->parent_id, idmap)) return false; break; case STACK_ITER: @@ -815,13 +815,13 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, old_reg->iter.btf_id != cur_reg->iter.btf_id || old_reg->iter.state != cur_reg->iter.state || /* ignore {old_reg,cur_reg}->iter.depth, see above */ - !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) + !check_ids(old_reg->id, cur_reg->id, idmap)) return false; break; case STACK_IRQ_FLAG: old_reg = &old->stack[spi].spilled_ptr; cur_reg = &cur->stack[spi].spilled_ptr; - if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap) || + if (!check_ids(old_reg->id, cur_reg->id, idmap) || old_reg->irq.kfunc_class != cur_reg->irq.kfunc_class) return false; break; @@ -838,6 +838,32 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, 
return true; } +/* + * Compare stack arg slots between old and current states. + * Outgoing stack args are path-local state and must agree for pruning. + */ +static bool stack_arg_safe(struct bpf_verifier_env *env, struct bpf_func_state *old, + struct bpf_func_state *cur, struct bpf_idmap *idmap, + enum exact_level exact) +{ + int i, nslots; + + nslots = max(old->out_stack_arg_cnt, cur->out_stack_arg_cnt); + for (i = 0; i < nslots; i++) { + struct bpf_reg_state *old_arg, *cur_arg; + struct bpf_reg_state not_init = { .type = NOT_INIT }; + + old_arg = i < old->out_stack_arg_cnt ? + &old->stack_arg_regs[i] : &not_init; + cur_arg = i < cur->out_stack_arg_cnt ? + &cur->stack_arg_regs[i] : &not_init; + if (!regsafe(env, old_arg, cur_arg, idmap, exact)) + return false; + } + + return true; +} + static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *cur, struct bpf_idmap *idmap) { @@ -868,6 +894,9 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c return false; switch (old->refs[i].type) { case REF_TYPE_PTR: + if (!check_ids(old->refs[i].parent_id, cur->refs[i].parent_id, idmap)) + return false; + break; case REF_TYPE_IRQ: break; case REF_TYPE_LOCK: @@ -920,6 +949,9 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat if (old->callback_depth > cur->callback_depth) return false; + if (!old->no_stack_arg_load && cur->no_stack_arg_load) + return false; + for (i = 0; i < MAX_BPF_REG; i++) if (((1 << i) & live_regs) && !regsafe(env, &old->regs[i], &cur->regs[i], @@ -929,6 +961,9 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat if (!stacksafe(env, old, cur, &env->idmap_scratch, exact)) return false; + if (!stack_arg_safe(env, old, cur, &env->idmap_scratch, exact)) + return false; + return true; } @@ -1376,7 +1411,7 @@ hit: */ err = 0; if (bpf_is_jmp_point(env, env->insn_idx)) - err = bpf_push_jmp_history(env, cur, 0, 0); + err = 
bpf_push_jmp_history(env, cur, 0, 
0, 0, 0); err = err ? : propagate_precision(env, &sl->state, cur, NULL); if (err) return err; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 630d530782fe..b44106c8ea75 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -41,6 +41,7 @@ #include <linux/overflow.h> #include <linux/cookie.h> #include <linux/verification.h> +#include <linux/btf_ids.h> #include <net/netfilter/nf_bpf_link.h> #include <net/netkit.h> @@ -807,6 +808,11 @@ void bpf_obj_free_task_work(const struct btf_record *rec, void *obj) bpf_task_work_cancel_and_free(obj + rec->task_work_off); } +void bpf_obj_cancel_fields(struct bpf_map *map, void *obj) +{ + bpf_map_free_internal_structs(map, obj); +} + void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; @@ -1280,6 +1286,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, case BPF_SPIN_LOCK: case BPF_RES_SPIN_LOCK: if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_RHASH && map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && map->map_type != BPF_MAP_TYPE_SK_STORAGE && @@ -1294,6 +1301,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, case BPF_WORKQUEUE: case BPF_TASK_WORK: if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_RHASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) { ret = -EOPNOTSUPP; @@ -1305,6 +1313,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, case BPF_KPTR_PERCPU: case BPF_REFCOUNT: if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_RHASH && map->map_type != BPF_MAP_TYPE_PERCPU_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH && @@ -1359,7 +1368,8 @@ free_map_tab: #define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size /* called via syscall */ -static int map_create(union bpf_attr *attr, bpfptr_t uattr) 
+static int map_create_alloc(union bpf_attr *attr, bpfptr_t uattr, struct bpf_verifier_log *log, + struct bpf_map **mapp, struct bpf_token **tokenp) { const struct bpf_map_ops *ops; struct bpf_token *token = NULL; @@ -1367,12 +1377,13 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) u32 map_type = attr->map_type; struct bpf_map *map; bool token_flag; - int f_flags; int err; err = CHECK_ATTR(BPF_MAP_CREATE); - if (err) + if (err) { + bpf_log(log, "Invalid attr.\n"); return -EINVAL; + } /* check BPF_F_TOKEN_FD flag, remember if it's set, and then clear it * to avoid per-map type checks tripping on unknown flag @@ -1381,31 +1392,40 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) attr->map_flags &= ~BPF_F_TOKEN_FD; if (attr->btf_vmlinux_value_type_id) { - if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS || - attr->btf_key_type_id || attr->btf_value_type_id) + if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS) { + bpf_log(log, "btf_vmlinux_value_type_id can only be used with struct_ops maps.\n"); return -EINVAL; + } + if (attr->btf_key_type_id || attr->btf_value_type_id) { + bpf_log(log, "btf_vmlinux_value_type_id is mutually exclusive with btf_key_type_id and btf_value_type_id.\n"); + return -EINVAL; + } } else if (attr->btf_key_type_id && !attr->btf_value_type_id) { + bpf_log(log, "Invalid btf_value_type_id.\n"); return -EINVAL; } if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER && attr->map_type != BPF_MAP_TYPE_ARENA && - attr->map_extra != 0) + attr->map_type != BPF_MAP_TYPE_RHASH && + attr->map_extra != 0) { + bpf_log(log, "Invalid map_extra.\n"); return -EINVAL; - - f_flags = bpf_get_file_flag(attr->map_flags); - if (f_flags < 0) - return f_flags; + } if (numa_node != NUMA_NO_NODE && ((unsigned int)numa_node >= nr_node_ids || - !node_online(numa_node))) + !node_online(numa_node))) { + bpf_log(log, "Invalid numa_node.\n"); return -EINVAL; + } /* find map type and init map: hashtable vs rbtree vs bloom vs ... 
*/ map_type = attr->map_type; - if (map_type >= ARRAY_SIZE(bpf_map_types)) + if (map_type >= ARRAY_SIZE(bpf_map_types)) { + bpf_log(log, "Invalid map_type.\n"); return -EINVAL; + } map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types)); ops = bpf_map_types[map_type]; if (!ops) @@ -1423,8 +1443,10 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) if (token_flag) { token = bpf_token_get_from_fd(attr->map_token_fd); - if (IS_ERR(token)) + if (IS_ERR(token)) { + bpf_log(log, "Invalid map_token_fd.\n"); return PTR_ERR(token); + } /* if current token doesn't grant map creation permissions, * then we can't use this token, so ignore it and rely on @@ -1457,6 +1479,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_MAP_TYPE_CGROUP_ARRAY: case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH: + case BPF_MAP_TYPE_RHASH: case BPF_MAP_TYPE_PERCPU_HASH: case BPF_MAP_TYPE_HASH_OF_MAPS: case BPF_MAP_TYPE_RINGBUF: @@ -1507,8 +1530,10 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) err = bpf_obj_name_cpy(map->name, attr->map_name, sizeof(attr->map_name)); - if (err < 0) + if (err < 0) { + bpf_log(log, "Invalid map_name.\n"); goto free_map; + } preempt_disable(); map->cookie = gen_cookie_next(&bpf_map_cookie); @@ -1531,6 +1556,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) btf = btf_get_by_fd(attr->btf_fd); if (IS_ERR(btf)) { + bpf_log(log, "Invalid btf_fd.\n"); err = PTR_ERR(btf); goto free_map; } @@ -1558,6 +1584,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) bpfptr_t uprog_hash = make_bpfptr(attr->excl_prog_hash, uattr.is_kernel); if (attr->excl_prog_hash_size != SHA256_DIGEST_SIZE) { + bpf_log(log, "Invalid excl_prog_hash_size.\n"); err = -EINVAL; goto free_map; } @@ -1572,11 +1599,62 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) err = -EFAULT; goto free_map; } + + /* See libbpf: emit_signature_match() */ + BUILD_BUG_ON(offsetof(struct bpf_map, excl) != 
SHA256_DIGEST_SIZE); + BUILD_BUG_ON(!__same_type(map->excl, u32)); + BUILD_BUG_ON(offsetof(struct bpf_map, sha) != 0); + BUILD_BUG_ON(!__same_type(map->sha, u8[SHA256_DIGEST_SIZE])); + map->excl = 1; } else if (attr->excl_prog_hash_size) { + bpf_log(log, "Invalid excl_prog_hash_size.\n"); err = -EINVAL; goto free_map; } + *mapp = map; + *tokenp = token; + return 0; + +free_map: + bpf_map_free(map); +put_token: + bpf_token_put(token); + return err; +} + +static int map_create(union bpf_attr *attr, bpfptr_t uattr, struct bpf_common_attr *attr_common, + bpfptr_t uattr_common, u32 size_common) +{ + struct bpf_token *token = NULL; + struct bpf_verifier_log *log; + struct bpf_log_attr attr_log; + struct bpf_map *map = NULL; + int err, ret; + int f_flags; + + log = bpf_log_attr_create_vlog(&attr_log, attr_common, uattr_common, size_common); + if (IS_ERR(log)) + return PTR_ERR(log); + + err = map_create_alloc(attr, uattr, log, &map, &token); + + /* preserve original error even if log finalization is successful */ + ret = bpf_log_attr_finalize(&attr_log, log); + if (ret) + err = ret; + + kfree(log); + + if (err) + goto free_map; + + f_flags = bpf_get_file_flag(attr->map_flags); + if (f_flags < 0) { + err = f_flags; + goto free_map; + } + err = security_bpf_map_create(map, attr, token, uattr.is_kernel); if (err) goto free_map_sec; @@ -1605,8 +1683,8 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) free_map_sec: security_bpf_map_free(map); free_map: - bpf_map_free(map); -put_token: + if (map) + bpf_map_free(map); bpf_token_put(token); return err; } @@ -2192,6 +2270,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_RHASH || map->map_type == BPF_MAP_TYPE_STACK_TRACE) { if (!bpf_map_is_offloaded(map)) { bpf_disable_instrumentation(); @@ -2646,7 +2725,8 @@ static int 
bpf_prog_load_check_attach(enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type, struct btf *attach_btf, u32 btf_id, - struct bpf_prog *dst_prog) + struct bpf_prog *dst_prog, + bool multi_func) { if (btf_id) { if (btf_id > BTF_MAX_TYPE) @@ -2666,6 +2746,14 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, } } + if (multi_func) { + if (prog_type != BPF_PROG_TYPE_TRACING) + return -EINVAL; + if (!attach_btf || btf_id) + return -EINVAL; + return 0; + } + if (attach_btf && (!btf_id || dst_prog)) return -EINVAL; @@ -2798,8 +2886,22 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) } } +static enum bpf_sig_keyring bpf_classify_keyring(s32 keyring_id) +{ + switch (keyring_id) { + case 0: + return BPF_SIG_KEYRING_BUILTIN; + case (s32)(unsigned long)VERIFY_USE_SECONDARY_KEYRING: + return BPF_SIG_KEYRING_SECONDARY; + case (s32)(unsigned long)VERIFY_USE_PLATFORM_KEYRING: + return BPF_SIG_KEYRING_PLATFORM; + default: + return BPF_SIG_KEYRING_USER; + } +} + static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr, - bool is_kernel) + bool is_kernel, s32 *keyring_serial) { bpfptr_t usig = make_bpfptr(attr->signature, is_kernel); struct bpf_dynptr_kern sig_ptr, insns_ptr; @@ -2835,7 +2937,8 @@ static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr err = bpf_verify_pkcs7_signature((struct bpf_dynptr *)&insns_ptr, (struct bpf_dynptr *)&sig_ptr, key); - + if (!err) + *keyring_serial = bpf_key_serial(key); bpf_key_put(key); kvfree(sig); return err; @@ -2858,10 +2961,15 @@ static int bpf_prog_mark_insn_arrays_ready(struct bpf_prog *prog) return 0; } +extern int bpf_multi_func(void); +int __init __used bpf_multi_func(void) { return 0; } + +BTF_ID_LIST_GLOBAL_SINGLE(bpf_multi_func_btf_id, func, bpf_multi_func) + /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD keyring_id -static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) 
+static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_attr *attr_log) { enum bpf_prog_type type = attr->prog_type; struct bpf_prog *prog, *dst_prog = NULL; @@ -2870,6 +2978,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) bool bpf_cap; int err; char license[128]; + bool multi_func; if (CHECK_ATTR(BPF_PROG_LOAD)) return -EINVAL; @@ -2936,6 +3045,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON)) goto put_token; + multi_func = is_tracing_multi(attr->expected_attach_type); + /* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog * or btf, we need to check which one it is */ @@ -2957,7 +3068,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) goto put_token; } } - } else if (attr->attach_btf_id) { + } else if (attr->attach_btf_id || multi_func) { /* fall back to vmlinux BTF, if BTF type ID is specified */ attach_btf = bpf_get_btf_vmlinux(); if (IS_ERR(attach_btf)) { @@ -2973,7 +3084,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (bpf_prog_load_check_attach(type, attr->expected_attach_type, attach_btf, attr->attach_btf_id, - dst_prog)) { + dst_prog, multi_func)) { if (dst_prog) bpf_prog_put(dst_prog); if (attach_btf) @@ -2996,7 +3107,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) prog->expected_attach_type = attr->expected_attach_type; prog->sleepable = !!(attr->prog_flags & BPF_F_SLEEPABLE); prog->aux->attach_btf = attach_btf; - prog->aux->attach_btf_id = attr->attach_btf_id; + prog->aux->attach_btf_id = multi_func ? 
bpf_multi_func_btf_id[0] : attr->attach_btf_id; prog->aux->dst_prog = dst_prog; prog->aux->dev_bound = !!attr->prog_ifindex; prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS; @@ -3022,13 +3133,17 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) /* eBPF programs must be GPL compatible to use GPL-ed functions */ prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0; - if (attr->signature) { - err = bpf_prog_verify_signature(prog, attr, uattr.is_kernel); + err = bpf_prog_verify_signature(prog, attr, uattr.is_kernel, + &prog->aux->sig.keyring_serial); if (err) goto free_prog; + prog->aux->sig.keyring_type = bpf_classify_keyring(attr->keyring_id); + prog->aux->sig.verdict = BPF_SIG_VERIFIED; + } else { + prog->aux->sig.keyring_type = BPF_SIG_KEYRING_NONE; + prog->aux->sig.verdict = BPF_SIG_UNSIGNED; } - prog->orig_prog = NULL; prog->jited = 0; @@ -3076,10 +3191,10 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) err = security_bpf_prog_load(prog, attr, token, uattr.is_kernel); if (err) - goto free_prog_sec; + goto free_prog; /* run eBPF verifier */ - err = bpf_check(&prog, attr, uattr, uattr_size); + err = bpf_check(&prog, attr, uattr, attr_log); if (err < 0) goto free_used_maps; @@ -3122,8 +3237,6 @@ free_used_maps: __bpf_prog_put_noref(prog, prog->aux->real_func_cnt); return err; -free_prog_sec: - security_bpf_prog_free(prog); free_prog: free_uid(prog->aux->user); if (prog->aux->attach_btf) @@ -3198,6 +3311,15 @@ void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, bpf_link_init_sleepable(link, type, ops, prog, attach_type, false); } +void bpf_tramp_link_init(struct bpf_tramp_link *link, enum bpf_link_type type, + const struct bpf_link_ops *ops, struct bpf_prog *prog, + enum bpf_attach_type attach_type, u64 cookie) +{ + bpf_link_init(&link->link, type, ops, prog, attach_type); + link->node.link = &link->link; + link->node.cookie = cookie; +} + static void 
bpf_link_free_id(int id) { if (!id) @@ -3358,7 +3480,7 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_KPROBE_MULTI_RETURN ? "kretprobe_multi" : "kprobe_multi"); else if (link->type == BPF_LINK_TYPE_UPROBE_MULTI) - seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_UPROBE_MULTI_RETURN ? + seq_printf(m, "link_type:\t%s\n", link->flags & BPF_F_UPROBE_MULTI_RETURN ? "uretprobe_multi" : "uprobe_multi"); else seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]); @@ -3505,7 +3627,7 @@ static void bpf_tracing_link_release(struct bpf_link *link) struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); - WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link, + WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link.node, tr_link->trampoline, tr_link->tgt_prog)); @@ -3518,8 +3640,7 @@ static void bpf_tracing_link_release(struct bpf_link *link) static void bpf_tracing_link_dealloc(struct bpf_link *link) { - struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link.link); + struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); kfree(tr_link); } @@ -3527,8 +3648,8 @@ static void bpf_tracing_link_dealloc(struct bpf_link *link) static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { - struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link.link); + struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); + u32 target_btf_id, target_obj_id; bpf_trampoline_unpack_key(tr_link->trampoline->key, @@ -3541,17 +3662,16 @@ static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, link->attach_type, target_obj_id, target_btf_id, - tr_link->link.cookie); + tr_link->link.node.cookie); } static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { - 
struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link.link); + struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); info->tracing.attach_type = link->attach_type; - info->tracing.cookie = tr_link->link.cookie; + info->tracing.cookie = tr_link->link.node.cookie; bpf_trampoline_unpack_key(tr_link->trampoline->key, &info->tracing.target_obj_id, &info->tracing.target_btf_id); @@ -3633,29 +3753,18 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id); } - if (prog->expected_attach_type == BPF_TRACE_FSESSION) { - struct bpf_fsession_link *fslink; - - fslink = kzalloc_obj(*fslink, GFP_USER); - if (fslink) { - bpf_link_init(&fslink->fexit.link, BPF_LINK_TYPE_TRACING, - &bpf_tracing_link_lops, prog, attach_type); - fslink->fexit.cookie = bpf_cookie; - link = &fslink->link; - } else { - link = NULL; - } - } else { - link = kzalloc_obj(*link, GFP_USER); - } + link = kzalloc_obj(*link, GFP_USER); if (!link) { err = -ENOMEM; goto out_put_prog; } - bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING, - &bpf_tracing_link_lops, prog, attach_type); + bpf_tramp_link_init(&link->link, BPF_LINK_TYPE_TRACING, + &bpf_tracing_link_lops, prog, attach_type, bpf_cookie); - link->link.cookie = bpf_cookie; + if (prog->expected_attach_type == BPF_TRACE_FSESSION) { + link->fexit.link = &link->link.link; + link->fexit.cookie = bpf_cookie; + } mutex_lock(&prog->aux->dst_mutex); @@ -3758,7 +3867,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, if (err) goto out_unlock; - err = bpf_trampoline_link_prog(&link->link, tr, tgt_prog); + err = bpf_trampoline_link_prog(&link->link.node, tr, tgt_prog); if (err) { bpf_link_cleanup(&link_primer); link = NULL; @@ -4281,6 +4390,11 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, if (!btp) return -ENOENT; + if (prog->sleepable && !tracepoint_is_faultable(btp->tp)) { + bpf_put_raw_tracepoint(btp); + 
return -EINVAL; + } + link = kzalloc_obj(*link, GFP_USER); if (!link) { err = -ENOMEM; @@ -4389,6 +4503,9 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_TRACE_FSESSION: + case BPF_TRACE_FSESSION_MULTI: + case BPF_TRACE_FENTRY_MULTI: + case BPF_TRACE_FEXIT_MULTI: case BPF_MODIFY_RETURN: return BPF_PROG_TYPE_TRACING; case BPF_LSM_MAC: @@ -4654,7 +4771,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) #define BPF_PROG_QUERY_LAST_FIELD query.revision static int bpf_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr) + union bpf_attr __user *uattr, u32 uattr_size) { if (!bpf_net_capable()) return -EPERM; @@ -4693,7 +4810,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_GETSOCKOPT: case BPF_CGROUP_SETSOCKOPT: case BPF_LSM_CGROUP: - return cgroup_bpf_prog_query(attr, uattr); + return cgroup_bpf_prog_query(attr, uattr, uattr_size); case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); case BPF_FLOW_DISSECTOR: @@ -5045,10 +5162,11 @@ static int bpf_prog_get_info_by_fd(struct file *file, u32 info_len = attr->info.info_len; struct bpf_prog_kstats stats; char __user *uinsns; - u32 ulen; + u32 ulen, len; int err; - err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); + len = offsetofend(struct bpf_prog_info, attach_btf_id); + err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), len, info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); @@ -5330,10 +5448,11 @@ static int bpf_map_get_info_by_fd(struct file *file, { struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info); struct bpf_map_info info; - u32 info_len = attr->info.info_len; + u32 info_len = attr->info.info_len, len; int err; - err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); + len = offsetofend(struct bpf_map_info, hash_size); + err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), len, info_len); if (err) return 
err; info_len = min_t(u32, sizeof(info), info_len); @@ -5371,18 +5490,16 @@ static int bpf_map_get_info_by_fd(struct file *file, if (!map->ops->map_get_hash) return -EINVAL; - - if (info.hash_size != SHA256_DIGEST_SIZE) + if (info.hash_size != sizeof(map->sha)) return -EINVAL; - if (!READ_ONCE(map->frozen)) return -EPERM; - err = map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, map->sha); + err = map->ops->map_get_hash(map); if (err != 0) return err; - if (copy_to_user(uhash, map->sha, SHA256_DIGEST_SIZE) != 0) + if (copy_to_user(uhash, map->sha, sizeof(map->sha)) != 0) return -EFAULT; } else if (info.hash_size) { return -EINVAL; @@ -5495,7 +5612,7 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, #define BPF_BTF_LOAD_LAST_FIELD btf_token_fd -static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) +static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_attr *attr_log) { struct bpf_token *token = NULL; @@ -5522,7 +5639,7 @@ static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_ bpf_token_put(token); - return btf_new_fd(attr, uattr, uattr_size); + return btf_new_fd(attr, uattr, attr_log); } #define BPF_BTF_GET_FD_BY_ID_LAST_FIELD fd_by_id_token_fd @@ -5723,7 +5840,7 @@ err_put: return err; } -#define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.pid +#define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.path_fd static int link_create(union bpf_attr *attr, bpfptr_t uattr) { struct bpf_prog *prog; @@ -5774,6 +5891,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = bpf_iter_link_attach(attr, uattr, prog); else if (prog->expected_attach_type == BPF_LSM_CGROUP) ret = cgroup_bpf_link_attach(attr, prog); + else if (is_tracing_multi(prog->expected_attach_type)) + ret = bpf_tracing_multi_attach(prog, attr); else ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, @@ -6232,8 +6351,12 @@ put_prog: return ret; } -static int 
__sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) +static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size, + bpfptr_t uattr_common, unsigned int size_common) { + struct bpf_common_attr attr_common; + u32 offsetof_log_true_size = 0; + struct bpf_log_attr attr_log; union bpf_attr attr; int err; @@ -6247,13 +6370,29 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) if (copy_from_bpfptr(&attr, uattr, size) != 0) return -EFAULT; + memset(&attr_common, 0, sizeof(attr_common)); + if (cmd & BPF_COMMON_ATTRS) { + err = bpf_check_uarg_tail_zero(uattr_common, + offsetofend(struct bpf_common_attr, log_true_size), + size_common); + if (err) + return err; + + cmd &= ~BPF_COMMON_ATTRS; + size_common = min_t(u32, size_common, sizeof(attr_common)); + if (copy_from_bpfptr(&attr_common, uattr_common, size_common) != 0) + return -EFAULT; + } else { + size_common = 0; + } + err = security_bpf(cmd, &attr, size, uattr.is_kernel); if (err < 0) return err; switch (cmd) { case BPF_MAP_CREATE: - err = map_create(&attr, uattr); + err = map_create(&attr, uattr, &attr_common, uattr_common, size_common); break; case BPF_MAP_LOOKUP_ELEM: err = map_lookup_elem(&attr); @@ -6271,7 +6410,12 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) err = map_freeze(&attr); break; case BPF_PROG_LOAD: - err = bpf_prog_load(&attr, uattr, size); + if (size >= offsetofend(union bpf_attr, log_true_size)) + offsetof_log_true_size = offsetof(union bpf_attr, log_true_size); + err = bpf_log_attr_init(&attr_log, attr.log_buf, attr.log_size, attr.log_level, + offsetof_log_true_size, uattr, &attr_common, uattr_common, + size_common); + err = err ?: bpf_prog_load(&attr, uattr, &attr_log); break; case BPF_OBJ_PIN: err = bpf_obj_pin(&attr); @@ -6286,7 +6430,7 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) err = bpf_prog_detach(&attr); break; case BPF_PROG_QUERY: - err = bpf_prog_query(&attr, uattr.user); + err = 
bpf_prog_query(&attr, uattr.user, size); break; case BPF_PROG_TEST_RUN: err = bpf_prog_test_run(&attr, uattr.user); @@ -6316,7 +6460,12 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) err = bpf_raw_tracepoint_open(&attr); break; case BPF_BTF_LOAD: - err = bpf_btf_load(&attr, uattr, size); + if (size >= offsetofend(union bpf_attr, btf_log_true_size)) + offsetof_log_true_size = offsetof(union bpf_attr, btf_log_true_size); + err = bpf_log_attr_init(&attr_log, attr.btf_log_buf, attr.btf_log_size, + attr.btf_log_level, offsetof_log_true_size, uattr, + &attr_common, uattr_common, size_common); + err = err ?: bpf_btf_load(&attr, uattr, &attr_log); break; case BPF_BTF_GET_FD_BY_ID: err = bpf_btf_get_fd_by_id(&attr); @@ -6382,9 +6531,10 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) return err; } -SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) +SYSCALL_DEFINE5(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size, + struct bpf_common_attr __user *, uattr_common, unsigned int, size_common) { - return __sys_bpf(cmd, USER_BPFPTR(uattr), size); + return __sys_bpf(cmd, USER_BPFPTR(uattr), size, USER_BPFPTR(uattr_common), size_common); } static bool syscall_prog_is_valid_access(int off, int size, @@ -6414,7 +6564,7 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) default: return -EINVAL; } - return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size); + return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size, KERNEL_BPFPTR(NULL), 0); } diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index f02254a21585..1a721fc4bef5 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -30,8 +30,46 @@ static struct hlist_head trampoline_ip_table[TRAMPOLINE_TABLE_SIZE]; /* serializes access to trampoline tables */ static DEFINE_MUTEX(trampoline_mutex); +/* + * Keep 32 trampoline locks (5 bits) in the pool so trampoline_lock_all() + * stays 
below MAX_LOCK_DEPTH. Each pool slot has a distinct lockdep + * class because trampoline_lock_all() takes all pool mutexes at once; + * otherwise lockdep would report recursive locking on same-class mutexes. + */ +#define TRAMPOLINE_LOCKS_BITS 5 +#define TRAMPOLINE_LOCKS_TABLE_SIZE (1 << TRAMPOLINE_LOCKS_BITS) + +static struct { + struct mutex mutex; + struct lock_class_key key; +} trampoline_locks[TRAMPOLINE_LOCKS_TABLE_SIZE]; + +static struct mutex *select_trampoline_lock(struct bpf_trampoline *tr) +{ + return &trampoline_locks[hash_ptr(tr, TRAMPOLINE_LOCKS_BITS)].mutex; +} + +static void trampoline_lock(struct bpf_trampoline *tr) +{ + mutex_lock(select_trampoline_lock(tr)); +} + +static void trampoline_unlock(struct bpf_trampoline *tr) +{ + mutex_unlock(select_trampoline_lock(tr)); +} + +struct bpf_trampoline_ops { + int (*register_fentry)(struct bpf_trampoline *tr, struct bpf_tramp_image *im, void *data); + int (*unregister_fentry)(struct bpf_trampoline *tr, u32 orig_flags, void *data); + int (*modify_fentry)(struct bpf_trampoline *tr, u32 orig_flags, struct bpf_tramp_image *im, + bool lock_direct_mutex, void *data); +}; + #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS -static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex); +static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex, + const struct bpf_trampoline_ops *ops, void *data); +static const struct bpf_trampoline_ops trampoline_ops; #ifdef CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS static struct bpf_trampoline *direct_ops_ip_lookup(struct ftrace_ops *ops, unsigned long ip) @@ -69,9 +107,9 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip, if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) { /* This is called inside register_ftrace_direct_multi(), so - * tr->mutex is already locked. + * trampoline's mutex is already locked. 
*/ - lockdep_assert_held_once(&tr->mutex); + lockdep_assert_held_once(select_trampoline_lock(tr)); /* Instead of updating the trampoline here, we propagate * -EAGAIN to register_ftrace_direct(). Then we can @@ -91,7 +129,7 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip, } /* The normal locking order is - * tr->mutex => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c) + * select_trampoline_lock(tr) => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c) * * The following two commands are called from * @@ -99,12 +137,12 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip, * cleanup_direct_functions_after_ipmodify * * In both cases, direct_mutex is already locked. Use - * mutex_trylock(&tr->mutex) to avoid deadlock in race condition - * (something else is making changes to this same trampoline). + * mutex_trylock(select_trampoline_lock(tr)) to avoid deadlock in race condition + * (something else holds the same pool lock). */ - if (!mutex_trylock(&tr->mutex)) { - /* sleep 1 ms to make sure whatever holding tr->mutex makes - * some progress. + if (!mutex_trylock(select_trampoline_lock(tr))) { + /* sleep 1 ms to make sure whatever holding select_trampoline_lock(tr) + * makes some progress. 
*/ msleep(1); return -EAGAIN; @@ -116,20 +154,22 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip, if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) && !(tr->flags & BPF_TRAMP_F_ORIG_STACK)) - ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */); + ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */, + &trampoline_ops, NULL); break; case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER: tr->flags &= ~BPF_TRAMP_F_SHARE_IPMODIFY; if (tr->flags & BPF_TRAMP_F_ORIG_STACK) - ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */); + ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */, + &trampoline_ops, NULL); break; default: ret = -EINVAL; break; } - mutex_unlock(&tr->mutex); + trampoline_unlock(tr); return ret; } #endif @@ -142,7 +182,9 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog) switch (ptype) { case BPF_PROG_TYPE_TRACING: if (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || - eatype == BPF_MODIFY_RETURN || eatype == BPF_TRACE_FSESSION) + eatype == BPF_MODIFY_RETURN || eatype == BPF_TRACE_FSESSION || + eatype == BPF_TRACE_FENTRY_MULTI || eatype == BPF_TRACE_FEXIT_MULTI || + eatype == BPF_TRACE_FSESSION_MULTI) return true; return false; case BPF_PROG_TYPE_LSM: @@ -359,7 +401,6 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key, unsigned long ip) head = &trampoline_ip_table[hash_64(tr->ip, TRAMPOLINE_HASH_BITS)]; hlist_add_head(&tr->hlist_ip, head); refcount_set(&tr->refcnt, 1); - mutex_init(&tr->mutex); for (i = 0; i < BPF_TRAMP_MAX; i++) INIT_HLIST_HEAD(&tr->progs_hlist[i]); out: @@ -386,9 +427,11 @@ static int bpf_trampoline_update_fentry(struct bpf_trampoline *tr, u32 orig_flag return bpf_arch_text_poke(ip, old_t, new_t, old_addr, new_addr); } -static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags, - void *old_addr) +static void bpf_tramp_image_put(struct bpf_tramp_image *im); + +static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags, void 
*data __maybe_unused) { + void *old_addr = tr->cur_image->image; int ret; if (tr->func.ftrace_managed) @@ -396,13 +439,19 @@ static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags, else ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, NULL); - return ret; + if (ret) + return ret; + + bpf_tramp_image_put(tr->cur_image); + tr->cur_image = NULL; + return 0; } -static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags, - void *old_addr, void *new_addr, - bool lock_direct_mutex) +static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags, struct bpf_tramp_image *im, + bool lock_direct_mutex, void *data __maybe_unused) { + void *old_addr = tr->cur_image->image; + void *new_addr = im->image; int ret; if (tr->func.ftrace_managed) { @@ -411,12 +460,20 @@ static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags, ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, new_addr); } - return ret; + + if (ret) + return ret; + + bpf_tramp_image_put(tr->cur_image); + tr->cur_image = im; + return 0; } /* first time registering */ -static int register_fentry(struct bpf_trampoline *tr, void *new_addr) +static int register_fentry(struct bpf_trampoline *tr, struct bpf_tramp_image *im, + void *data __maybe_unused) { + void *new_addr = im->image; void *ip = tr->func.addr; unsigned long faddr; int ret; @@ -434,33 +491,42 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) ret = bpf_trampoline_update_fentry(tr, 0, NULL, new_addr); } - return ret; + if (ret) + return ret; + + tr->cur_image = im; + return 0; } -static struct bpf_tramp_links * +static const struct bpf_trampoline_ops trampoline_ops = { + .register_fentry = register_fentry, + .unregister_fentry = unregister_fentry, + .modify_fentry = modify_fentry, +}; + +static struct bpf_tramp_nodes * bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_arg) { - struct bpf_tramp_link *link; - struct bpf_tramp_links *tlinks; - struct 
bpf_tramp_link **links; + struct bpf_tramp_node *node, **nodes; + struct bpf_tramp_nodes *tnodes; int kind; *total = 0; - tlinks = kzalloc_objs(*tlinks, BPF_TRAMP_MAX); - if (!tlinks) + tnodes = kzalloc_objs(*tnodes, BPF_TRAMP_MAX); + if (!tnodes) return ERR_PTR(-ENOMEM); for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { - tlinks[kind].nr_links = tr->progs_cnt[kind]; + tnodes[kind].nr_nodes = tr->progs_cnt[kind]; *total += tr->progs_cnt[kind]; - links = tlinks[kind].links; + nodes = tnodes[kind].nodes; - hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) { - *ip_arg |= link->link.prog->call_get_func_ip; - *links++ = link; + hlist_for_each_entry(node, &tr->progs_hlist[kind], tramp_hlist) { + *ip_arg |= node->link->prog->call_get_func_ip; + *nodes++ = node; } } - return tlinks; + return tnodes; } static void bpf_tramp_image_free(struct bpf_tramp_image *im) @@ -604,30 +670,29 @@ out: return ERR_PTR(err); } -static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex) +static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex, + const struct bpf_trampoline_ops *ops, void *data) { struct bpf_tramp_image *im; - struct bpf_tramp_links *tlinks; + struct bpf_tramp_nodes *tnodes; u32 orig_flags = tr->flags; bool ip_arg = false; int err, total, size; - tlinks = bpf_trampoline_get_progs(tr, &total, &ip_arg); - if (IS_ERR(tlinks)) - return PTR_ERR(tlinks); + tnodes = bpf_trampoline_get_progs(tr, &total, &ip_arg); + if (IS_ERR(tnodes)) + return PTR_ERR(tnodes); if (total == 0) { - err = unregister_fentry(tr, orig_flags, tr->cur_image->image); - bpf_tramp_image_put(tr->cur_image); - tr->cur_image = NULL; + err = ops->unregister_fentry(tr, orig_flags, data); goto out; } /* clear all bits except SHARE_IPMODIFY and TAIL_CALL_CTX */ tr->flags &= (BPF_TRAMP_F_SHARE_IPMODIFY | BPF_TRAMP_F_TAIL_CALL_CTX); - if (tlinks[BPF_TRAMP_FEXIT].nr_links || - tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) { + if (tnodes[BPF_TRAMP_FEXIT].nr_nodes 
|| + tnodes[BPF_TRAMP_MODIFY_RETURN].nr_nodes) { /* NOTE: BPF_TRAMP_F_RESTORE_REGS and BPF_TRAMP_F_SKIP_FRAME * should not be set together. */ @@ -658,7 +723,7 @@ again: #endif size = arch_bpf_trampoline_size(&tr->func.model, tr->flags, - tlinks, tr->func.addr); + tnodes, tr->func.addr); if (size < 0) { err = size; goto out; @@ -676,7 +741,7 @@ again: } err = arch_prepare_bpf_trampoline(im, im->image, im->image + size, - &tr->func.model, tr->flags, tlinks, + &tr->func.model, tr->flags, tnodes, tr->func.addr); if (err < 0) goto out_free; @@ -685,14 +750,12 @@ again: if (err) goto out_free; - WARN_ON(tr->cur_image && total == 0); if (tr->cur_image) /* progs already running at this address */ - err = modify_fentry(tr, orig_flags, tr->cur_image->image, - im->image, lock_direct_mutex); + err = ops->modify_fentry(tr, orig_flags, im, lock_direct_mutex, data); else /* first time registering */ - err = register_fentry(tr, im->image); + err = ops->register_fentry(tr, im, data); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS if (err == -EAGAIN) { @@ -704,34 +767,31 @@ again: goto again; } #endif - if (err) - goto out_free; - if (tr->cur_image) - bpf_tramp_image_put(tr->cur_image); - tr->cur_image = im; +out_free: + if (err) + bpf_tramp_image_free(im); out: /* If any error happens, restore previous flags */ if (err) tr->flags = orig_flags; - kfree(tlinks); + kfree(tnodes); return err; - -out_free: - bpf_tramp_image_free(im); - goto out; } static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) { switch (prog->expected_attach_type) { case BPF_TRACE_FENTRY: + case BPF_TRACE_FENTRY_MULTI: return BPF_TRAMP_FENTRY; case BPF_MODIFY_RETURN: return BPF_TRAMP_MODIFY_RETURN; case BPF_TRACE_FEXIT: + case BPF_TRACE_FEXIT_MULTI: return BPF_TRAMP_FEXIT; case BPF_TRACE_FSESSION: + case BPF_TRACE_FSESSION_MULTI: return BPF_TRAMP_FSESSION; case BPF_LSM_MAC: if (!prog->aux->attach_func_proto->type) @@ -764,39 +824,33 @@ static int bpf_freplace_check_tgt_prog(struct 
bpf_prog *tgt_prog) return 0; } -static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, - struct bpf_trampoline *tr, - struct bpf_prog *tgt_prog) +static struct bpf_tramp_node *fsession_exit(struct bpf_tramp_node *node) { - struct bpf_fsession_link *fslink = NULL; - enum bpf_tramp_prog_type kind; - struct bpf_tramp_link *link_exiting; - struct hlist_head *prog_list; - int err = 0; - int cnt = 0, i; + if (node->link->type == BPF_LINK_TYPE_TRACING) { + struct bpf_tracing_link *link; - kind = bpf_attach_type_to_tramp(link->link.prog); - if (tr->extension_prog) - /* cannot attach fentry/fexit if extension prog is attached. - * cannot overwrite extension prog either. - */ - return -EBUSY; + link = container_of(node->link, struct bpf_tracing_link, link.link); + return &link->fexit; + } else if (node->link->type == BPF_LINK_TYPE_TRACING_MULTI) { + struct bpf_tracing_multi_link *link; + struct bpf_tracing_multi_node *mnode; - for (i = 0; i < BPF_TRAMP_MAX; i++) - cnt += tr->progs_cnt[i]; - - if (kind == BPF_TRAMP_REPLACE) { - /* Cannot attach extension if fentry/fexit are in use. 
*/ - if (cnt) - return -EBUSY; - err = bpf_freplace_check_tgt_prog(tgt_prog); - if (err) - return err; - tr->extension_prog = link->link.prog; - return bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP, - BPF_MOD_JUMP, NULL, - link->link.prog->bpf_func); + link = container_of(node->link, struct bpf_tracing_multi_link, link); + mnode = container_of(node, struct bpf_tracing_multi_node, node); + return &link->fexits[mnode - link->nodes]; } + return NULL; +} + +static int bpf_trampoline_add_prog(struct bpf_trampoline *tr, + struct bpf_tramp_node *node, + int cnt) +{ + enum bpf_tramp_prog_type kind; + struct bpf_tramp_node *node_existing, *fexit; + struct hlist_head *prog_list; + + kind = bpf_attach_type_to_tramp(node->link->prog); if (kind == BPF_TRAMP_FSESSION) { prog_list = &tr->progs_hlist[BPF_TRAMP_FENTRY]; cnt++; @@ -805,59 +859,112 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, } if (cnt >= BPF_MAX_TRAMP_LINKS) return -E2BIG; - if (!hlist_unhashed(&link->tramp_hlist)) + if (!hlist_unhashed(&node->tramp_hlist)) /* prog already linked */ return -EBUSY; - hlist_for_each_entry(link_exiting, prog_list, tramp_hlist) { - if (link_exiting->link.prog != link->link.prog) + hlist_for_each_entry(node_existing, prog_list, tramp_hlist) { + if (node_existing->link->prog != node->link->prog) continue; /* prog already linked */ return -EBUSY; } - hlist_add_head(&link->tramp_hlist, prog_list); + hlist_add_head(&node->tramp_hlist, prog_list); if (kind == BPF_TRAMP_FSESSION) { tr->progs_cnt[BPF_TRAMP_FENTRY]++; - fslink = container_of(link, struct bpf_fsession_link, link.link); - hlist_add_head(&fslink->fexit.tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]); + fexit = fsession_exit(node); + if (WARN_ON_ONCE(!fexit)) + return -EINVAL; + hlist_add_head(&fexit->tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]); tr->progs_cnt[BPF_TRAMP_FEXIT]++; } else { tr->progs_cnt[kind]++; } - err = bpf_trampoline_update(tr, true /* lock_direct_mutex */); - if (err) { - 
hlist_del_init(&link->tramp_hlist); - if (kind == BPF_TRAMP_FSESSION) { - tr->progs_cnt[BPF_TRAMP_FENTRY]--; - hlist_del_init(&fslink->fexit.tramp_hlist); - tr->progs_cnt[BPF_TRAMP_FEXIT]--; - } else { - tr->progs_cnt[kind]--; - } + return 0; +} + +static void bpf_trampoline_remove_prog(struct bpf_trampoline *tr, + struct bpf_tramp_node *node) +{ + enum bpf_tramp_prog_type kind; + struct bpf_tramp_node *fexit; + + kind = bpf_attach_type_to_tramp(node->link->prog); + if (kind == BPF_TRAMP_FSESSION) { + fexit = fsession_exit(node); + if (WARN_ON_ONCE(!fexit)) + return; + hlist_del_init(&fexit->tramp_hlist); + tr->progs_cnt[BPF_TRAMP_FEXIT]--; + kind = BPF_TRAMP_FENTRY; } + hlist_del_init(&node->tramp_hlist); + tr->progs_cnt[kind]--; +} + +static int __bpf_trampoline_link_prog(struct bpf_tramp_node *node, + struct bpf_trampoline *tr, + struct bpf_prog *tgt_prog, + const struct bpf_trampoline_ops *ops, + void *data) +{ + enum bpf_tramp_prog_type kind; + int err = 0; + int cnt = 0, i; + + kind = bpf_attach_type_to_tramp(node->link->prog); + if (tr->extension_prog) + /* cannot attach fentry/fexit if extension prog is attached. + * cannot overwrite extension prog either. + */ + return -EBUSY; + + for (i = 0; i < BPF_TRAMP_MAX; i++) + cnt += tr->progs_cnt[i]; + + if (kind == BPF_TRAMP_REPLACE) { + /* Cannot attach extension if fentry/fexit are in use. 
*/ + if (cnt) + return -EBUSY; + err = bpf_freplace_check_tgt_prog(tgt_prog); + if (err) + return err; + tr->extension_prog = node->link->prog; + return bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP, + BPF_MOD_JUMP, NULL, + node->link->prog->bpf_func); + } + err = bpf_trampoline_add_prog(tr, node, cnt); + if (err) + return err; + err = bpf_trampoline_update(tr, true /* lock_direct_mutex */, ops, data); + if (err) + bpf_trampoline_remove_prog(tr, node); return err; } -int bpf_trampoline_link_prog(struct bpf_tramp_link *link, +int bpf_trampoline_link_prog(struct bpf_tramp_node *node, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { int err; - mutex_lock(&tr->mutex); - err = __bpf_trampoline_link_prog(link, tr, tgt_prog); - mutex_unlock(&tr->mutex); + trampoline_lock(tr); + err = __bpf_trampoline_link_prog(node, tr, tgt_prog, &trampoline_ops, NULL); + trampoline_unlock(tr); return err; } -static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, +static int __bpf_trampoline_unlink_prog(struct bpf_tramp_node *node, struct bpf_trampoline *tr, - struct bpf_prog *tgt_prog) + struct bpf_prog *tgt_prog, + const struct bpf_trampoline_ops *ops, + void *data) { enum bpf_tramp_prog_type kind; int err; - kind = bpf_attach_type_to_tramp(link->link.prog); + kind = bpf_attach_type_to_tramp(node->link->prog); if (kind == BPF_TRAMP_REPLACE) { WARN_ON_ONCE(!tr->extension_prog); err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, @@ -867,29 +974,21 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, guard(mutex)(&tgt_prog->aux->ext_mutex); tgt_prog->aux->is_extended = false; return err; - } else if (kind == BPF_TRAMP_FSESSION) { - struct bpf_fsession_link *fslink = - container_of(link, struct bpf_fsession_link, link.link); - - hlist_del_init(&fslink->fexit.tramp_hlist); - tr->progs_cnt[BPF_TRAMP_FEXIT]--; - kind = BPF_TRAMP_FENTRY; } - hlist_del_init(&link->tramp_hlist); - tr->progs_cnt[kind]--; - return bpf_trampoline_update(tr, true /* 
lock_direct_mutex */); + bpf_trampoline_remove_prog(tr, node); + return bpf_trampoline_update(tr, true /* lock_direct_mutex */, ops, data); } /* bpf_trampoline_unlink_prog() should never fail. */ -int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, +int bpf_trampoline_unlink_prog(struct bpf_tramp_node *node, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { int err; - mutex_lock(&tr->mutex); - err = __bpf_trampoline_unlink_prog(link, tr, tgt_prog); - mutex_unlock(&tr->mutex); + trampoline_lock(tr); + err = __bpf_trampoline_unlink_prog(node, tr, tgt_prog, &trampoline_ops, NULL); + trampoline_unlock(tr); return err; } @@ -903,7 +1002,7 @@ static void bpf_shim_tramp_link_release(struct bpf_link *link) if (!shim_link->trampoline) return; - WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline, NULL)); + WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link.node, shim_link->trampoline, NULL)); bpf_trampoline_put(shim_link->trampoline); } @@ -949,8 +1048,8 @@ static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog p->type = BPF_PROG_TYPE_LSM; p->expected_attach_type = BPF_LSM_MAC; bpf_prog_inc(p); - bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC, - &bpf_shim_tramp_link_lops, p, attach_type); + bpf_tramp_link_init(&shim_link->link, BPF_LINK_TYPE_UNSPEC, + &bpf_shim_tramp_link_lops, p, attach_type, 0); bpf_cgroup_atype_get(p->aux->attach_btf_id, cgroup_atype); return shim_link; @@ -959,15 +1058,15 @@ static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr, bpf_func_t bpf_func) { - struct bpf_tramp_link *link; + struct bpf_tramp_node *node; int kind; for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { - hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) { - struct bpf_prog *p = link->link.prog; + hlist_for_each_entry(node, &tr->progs_hlist[kind], tramp_hlist) { + struct bpf_prog *p = 
node->link->prog; if (p->bpf_func == bpf_func) - return container_of(link, struct bpf_shim_tramp_link, link); + return container_of(node, struct bpf_shim_tramp_link, link.node); } } @@ -999,12 +1098,12 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, if (!tr) return -ENOMEM; - mutex_lock(&tr->mutex); + trampoline_lock(tr); shim_link = cgroup_shim_find(tr, bpf_func); if (shim_link && !IS_ERR(bpf_link_inc_not_zero(&shim_link->link.link))) { /* Reusing existing shim attached by the other program. */ - mutex_unlock(&tr->mutex); + trampoline_unlock(tr); bpf_trampoline_put(tr); /* bpf_trampoline_get above */ return 0; } @@ -1017,23 +1116,23 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, goto err; } - err = __bpf_trampoline_link_prog(&shim_link->link, tr, NULL); + err = __bpf_trampoline_link_prog(&shim_link->link.node, tr, NULL, &trampoline_ops, NULL); if (err) goto err; shim_link->trampoline = tr; /* note, we're still holding tr refcnt from above */ - mutex_unlock(&tr->mutex); + trampoline_unlock(tr); return 0; err: - mutex_unlock(&tr->mutex); + trampoline_unlock(tr); if (shim_link) bpf_link_put(&shim_link->link.link); - /* have to release tr while _not_ holding its mutex */ + /* have to release tr while _not_ holding pool mutex for trampoline */ bpf_trampoline_put(tr); /* bpf_trampoline_get above */ return err; @@ -1054,9 +1153,9 @@ void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog) if (WARN_ON_ONCE(!tr)) return; - mutex_lock(&tr->mutex); + trampoline_lock(tr); shim_link = cgroup_shim_find(tr, bpf_func); - mutex_unlock(&tr->mutex); + trampoline_unlock(tr); if (shim_link) bpf_link_put(&shim_link->link.link); @@ -1074,14 +1173,14 @@ struct bpf_trampoline *bpf_trampoline_get(u64 key, if (!tr) return NULL; - mutex_lock(&tr->mutex); + trampoline_lock(tr); if (tr->func.addr) goto out; memcpy(&tr->func.model, &tgt_info->fmodel, sizeof(tgt_info->fmodel)); tr->func.addr = (void *)tgt_info->tgt_addr; out: - mutex_unlock(&tr->mutex); + 
trampoline_unlock(tr); return tr; } @@ -1094,7 +1193,6 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) mutex_lock(&trampoline_mutex); if (!refcount_dec_and_test(&tr->refcnt)) goto out; - WARN_ON_ONCE(mutex_is_locked(&tr->mutex)); for (i = 0; i < BPF_TRAMP_MAX; i++) if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[i]))) @@ -1333,7 +1431,7 @@ bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog) int __weak arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, + struct bpf_tramp_nodes *tnodes, void *func_addr) { return -ENOTSUPP; @@ -1367,11 +1465,288 @@ int __weak arch_protect_bpf_trampoline(void *image, unsigned int size) } int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, void *func_addr) + struct bpf_tramp_nodes *tnodes, void *func_addr) { return -ENOTSUPP; } +#if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) && \ + defined(CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS) && \ + defined(CONFIG_BPF_SYSCALL) + +static void trampoline_lock_all(void) +{ + int i; + + for (i = 0; i < TRAMPOLINE_LOCKS_TABLE_SIZE; i++) + mutex_lock(&trampoline_locks[i].mutex); +} + +static void trampoline_unlock_all(void) +{ + int i; + + for (i = 0; i < TRAMPOLINE_LOCKS_TABLE_SIZE; i++) + mutex_unlock(&trampoline_locks[i].mutex); +} + +static void remove_tracing_multi_data(struct bpf_tracing_multi_data *data) +{ + ftrace_hash_remove(data->reg); + ftrace_hash_remove(data->unreg); + ftrace_hash_remove(data->modify); +} + +static void clear_tracing_multi_data(struct bpf_tracing_multi_data *data) +{ + remove_tracing_multi_data(data); + + free_ftrace_hash(data->reg); + free_ftrace_hash(data->unreg); + free_ftrace_hash(data->modify); +} + +static int init_tracing_multi_data(struct bpf_tracing_multi_data *data) +{ + data->reg = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); + data->unreg = 
alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); + data->modify = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); + + if (!data->reg || !data->unreg || !data->modify) { + clear_tracing_multi_data(data); + return -ENOMEM; + } + return 0; +} + +static void ftrace_hash_add(struct ftrace_hash *hash, struct ftrace_func_entry *entry, + unsigned long ip, unsigned long direct) +{ + entry->ip = ip; + entry->direct = direct; + add_ftrace_hash_entry(hash, entry); +} + +static int register_fentry_multi(struct bpf_trampoline *tr, struct bpf_tramp_image *im, void *ptr) +{ + unsigned long addr = (unsigned long) im->image; + unsigned long ip = ftrace_location(tr->ip); + struct bpf_tracing_multi_data *data = ptr; + + if (bpf_trampoline_use_jmp(tr->flags)) + addr = ftrace_jmp_set(addr); + + ftrace_hash_add(data->reg, data->entry, ip, addr); + tr->cur_image = im; + return 0; +} + +static int unregister_fentry_multi(struct bpf_trampoline *tr, u32 orig_flags, void *ptr) +{ + unsigned long addr = (unsigned long) tr->cur_image->image; + unsigned long ip = ftrace_location(tr->ip); + struct bpf_tracing_multi_data *data = ptr; + + if (bpf_trampoline_use_jmp(tr->flags)) + addr = ftrace_jmp_set(addr); + + ftrace_hash_add(data->unreg, data->entry, ip, addr); + tr->cur_image = NULL; + return 0; +} + +static int modify_fentry_multi(struct bpf_trampoline *tr, u32 orig_flags, struct bpf_tramp_image *im, + bool lock_direct_mutex, void *ptr) +{ + unsigned long addr = (unsigned long) im->image; + unsigned long ip = ftrace_location(tr->ip); + struct bpf_tracing_multi_data *data = ptr; + + if (bpf_trampoline_use_jmp(tr->flags)) + addr = ftrace_jmp_set(addr); + + ftrace_hash_add(data->modify, data->entry, ip, addr); + tr->cur_image = im; + return 0; +} + +static const struct bpf_trampoline_ops trampoline_multi_ops = { + .register_fentry = register_fentry_multi, + .unregister_fentry = unregister_fentry_multi, + .modify_fentry = modify_fentry_multi, +}; + +static void bpf_trampoline_multi_attach_init(struct 
bpf_trampoline *tr) +{ + tr->multi_attach.old_image = tr->cur_image; + tr->multi_attach.old_flags = tr->flags; +} + +static void bpf_trampoline_multi_attach_free(struct bpf_trampoline *tr) +{ + if (tr->multi_attach.old_image) + bpf_tramp_image_put(tr->multi_attach.old_image); + + tr->multi_attach.old_image = NULL; + tr->multi_attach.old_flags = 0; +} + +static void bpf_trampoline_multi_attach_rollback(struct bpf_trampoline *tr) +{ + if (tr->cur_image) + bpf_tramp_image_put(tr->cur_image); + tr->cur_image = tr->multi_attach.old_image; + tr->flags = tr->multi_attach.old_flags; + + tr->multi_attach.old_image = NULL; + tr->multi_attach.old_flags = 0; +} + +#define for_each_mnode_cnt(mnode, link, cnt) \ + for (i = 0, mnode = &link->nodes[i]; i < cnt; i++, mnode = &link->nodes[i]) + +#define for_each_mnode(mnode, link) \ + for_each_mnode_cnt(mnode, link, link->nodes_cnt) + +int bpf_trampoline_multi_attach(struct bpf_prog *prog, u32 *ids, + struct bpf_tracing_multi_link *link) +{ + struct bpf_tracing_multi_data *data = &link->data; + struct bpf_attach_target_info tgt_info = {}; + struct btf *btf = prog->aux->attach_btf; + struct bpf_tracing_multi_node *mnode; + struct bpf_trampoline *tr; + int i, err, rollback_cnt; + u64 key; + + for_each_mnode(mnode, link) { + rollback_cnt = i; + + err = bpf_check_attach_btf_id_multi(btf, prog, ids[i], &tgt_info); + if (err) + goto rollback_put; + + key = bpf_trampoline_compute_key(NULL, btf, ids[i]); + + tr = bpf_trampoline_get(key, &tgt_info); + if (!tr) { + err = -ENOMEM; + goto rollback_put; + } + + mnode->trampoline = tr; + mnode->node.link = &link->link; + mnode->node.cookie = link->cookies ? link->cookies[i] : 0; + + if (prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) { + link->fexits[i].link = &link->link; + link->fexits[i].cookie = link->cookies ? 
link->cookies[i] : 0; + } + + cond_resched(); + } + + err = init_tracing_multi_data(data); + if (err) { + rollback_cnt = link->nodes_cnt; + goto rollback_put; + } + + trampoline_lock_all(); + + for_each_mnode(mnode, link) { + bpf_trampoline_multi_attach_init(mnode->trampoline); + + data->entry = &mnode->entry; + err = __bpf_trampoline_link_prog(&mnode->node, mnode->trampoline, NULL, + &trampoline_multi_ops, data); + if (err) { + rollback_cnt = i; + goto rollback_unlink; + } + } + + rollback_cnt = link->nodes_cnt; + if (ftrace_hash_count(data->reg)) { + err = update_ftrace_direct_add(&direct_ops, data->reg); + if (err) + goto rollback_unlink; + } + + if (ftrace_hash_count(data->modify)) { + err = update_ftrace_direct_mod(&direct_ops, data->modify, true); + if (err) { + if (ftrace_hash_count(data->reg)) + WARN_ON_ONCE(update_ftrace_direct_del(&direct_ops, data->reg)); + goto rollback_unlink; + } + } + + for_each_mnode(mnode, link) + bpf_trampoline_multi_attach_free(mnode->trampoline); + + trampoline_unlock_all(); + + remove_tracing_multi_data(data); + return 0; + +rollback_unlink: + for_each_mnode_cnt(mnode, link, rollback_cnt) { + bpf_trampoline_remove_prog(mnode->trampoline, &mnode->node); + bpf_trampoline_multi_attach_rollback(mnode->trampoline); + } + + trampoline_unlock_all(); + + clear_tracing_multi_data(data); + rollback_cnt = link->nodes_cnt; + +rollback_put: + for_each_mnode_cnt(mnode, link, rollback_cnt) + bpf_trampoline_put(mnode->trampoline); + + return err; +} + +int bpf_trampoline_multi_detach(struct bpf_prog *prog, struct bpf_tracing_multi_link *link) +{ + struct bpf_tracing_multi_data *data = &link->data; + struct bpf_tracing_multi_node *mnode; + int i; + + trampoline_lock_all(); + + for_each_mnode(mnode, link) { + data->entry = &mnode->entry; + bpf_trampoline_multi_attach_init(mnode->trampoline); + WARN_ON_ONCE(__bpf_trampoline_unlink_prog(&mnode->node, mnode->trampoline, + NULL, &trampoline_multi_ops, data)); + } + + if 
(ftrace_hash_count(data->unreg)) + WARN_ON_ONCE(update_ftrace_direct_del(&direct_ops, data->unreg)); + if (ftrace_hash_count(data->modify)) + WARN_ON_ONCE(update_ftrace_direct_mod(&direct_ops, data->modify, true)); + + for_each_mnode(mnode, link) + bpf_trampoline_multi_attach_free(mnode->trampoline); + + trampoline_unlock_all(); + + for_each_mnode(mnode, link) + bpf_trampoline_put(mnode->trampoline); + + clear_tracing_multi_data(data); + return 0; +} + +#undef for_each_mnode_cnt +#undef for_each_mnode + +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS && + CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS && + CONFIG_BPF_SYSCALL */ + static int __init init_trampolines(void) { int i; @@ -1380,6 +1755,8 @@ static int __init init_trampolines(void) INIT_HLIST_HEAD(&trampoline_key_table[i]); for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++) INIT_HLIST_HEAD(&trampoline_ip_table[i]); + for (i = 0; i < TRAMPOLINE_LOCKS_TABLE_SIZE; i++) + __mutex_init(&trampoline_locks[i].mutex, "trampoline_lock", &trampoline_locks[i].key); return 0; } late_initcall(init_trampolines); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7fb88e1cd7c4..2abc79dbf281 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -26,6 +26,7 @@ #include <linux/poison.h> #include <linux/module.h> #include <linux/cpumask.h> +#include <linux/cnum.h> #include <linux/bpf_mem_alloc.h> #include <net/xdp.h> #include <linux/trace_events.h> @@ -199,14 +200,15 @@ struct bpf_verifier_stack_elem { #define BPF_PRIV_STACK_MIN_SIZE 64 -static int acquire_reference(struct bpf_verifier_env *env, int insn_idx); -static int release_reference_nomark(struct bpf_verifier_state *state, int ref_obj_id); -static int release_reference(struct bpf_verifier_env *env, int ref_obj_id); +static int acquire_reference(struct bpf_verifier_env *env, int insn_idx, int parent_id); +static int release_reference_nomark(struct bpf_verifier_state *state, int id); +static int release_reference(struct bpf_verifier_env *env, int id); static 
void invalidate_non_owning_refs(struct bpf_verifier_env *env); static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env); +static bool is_tracing_prog_type(enum bpf_prog_type type); static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg); -static bool is_trusted_reg(const struct bpf_reg_state *reg); +static bool is_trusted_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg); static inline bool in_sleepable_context(struct bpf_verifier_env *env); static const char *non_sleepable_context_description(struct bpf_verifier_env *env); static void scalar32_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg); @@ -230,8 +232,28 @@ static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state) (poisoned ? BPF_MAP_KEY_POISON : 0ULL); } +static void update_ref_obj(struct ref_obj_desc *ref_obj, struct bpf_reg_state *reg) +{ + ref_obj->id = reg->id; + ref_obj->parent_id = reg->parent_id; + ref_obj->cnt++; +} + +static int validate_ref_obj(struct bpf_verifier_env *env, struct ref_obj_desc *ref_obj) +{ + if (ref_obj->cnt > 1) { + verifier_bug(env, "function expects only one referenced object but got %d\n", + ref_obj->cnt); + return -EFAULT; + } + + return 0; +} + struct bpf_call_arg_meta { struct bpf_map_desc map; + struct bpf_dynptr_desc dynptr; + struct ref_obj_desc ref_obj; bool raw_mode; bool pkt_access; u8 release_regno; @@ -239,8 +261,6 @@ struct bpf_call_arg_meta { int access_size; int mem_size; u64 msize_max_value; - int ref_obj_id; - int dynptr_id; int func_id; struct btf *btf; u32 btf_id; @@ -261,6 +281,41 @@ struct bpf_kfunc_meta { struct btf *btf_vmlinux; +typedef struct argno { + int argno; +} argno_t; + +static argno_t argno_from_reg(u32 regno) +{ + return (argno_t){ .argno = regno }; +} + +static argno_t argno_from_arg(u32 arg) +{ + return (argno_t){ .argno = -arg }; +} + +static int reg_from_argno(argno_t a) +{ + if (a.argno >= 0) + return a.argno; + if (a.argno >= 
-MAX_BPF_FUNC_REG_ARGS) + return -a.argno; + return -1; +} + +static int arg_from_argno(argno_t a) +{ + if (a.argno < 0) + return -a.argno; + return -1; +} + +static int arg_idx_from_argno(argno_t a) +{ + return arg_from_argno(a) - 1; +} + static const char *btf_type_name(const struct btf *btf, u32 id) { return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); @@ -290,12 +345,12 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env, bool unknown = true; verbose(env, "%s the register %s has", ctx, reg_name); - if (reg->smin_value > S64_MIN) { - verbose(env, " smin=%lld", reg->smin_value); + if (reg_smin(reg) > S64_MIN) { + verbose(env, " smin=%lld", reg_smin(reg)); unknown = false; } - if (reg->smax_value < S64_MAX) { - verbose(env, " smax=%lld", reg->smax_value); + if (reg_smax(reg) < S64_MAX) { + verbose(env, " smax=%lld", reg_smax(reg)); unknown = false; } if (unknown) @@ -303,7 +358,7 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env, verbose(env, " should have been in [%d, %d]\n", range.minval, range.maxval); } -static bool reg_not_null(const struct bpf_reg_state *reg) +static bool reg_not_null(struct bpf_verifier_env *env, const struct bpf_reg_state *reg) { enum bpf_reg_type type; @@ -317,7 +372,7 @@ static bool reg_not_null(const struct bpf_reg_state *reg) type == PTR_TO_MAP_VALUE || type == PTR_TO_MAP_KEY || type == PTR_TO_SOCK_COMMON || - (type == PTR_TO_BTF_ID && is_trusted_reg(reg)) || + (type == PTR_TO_BTF_ID && is_trusted_reg(env, reg)) || (type == PTR_TO_MEM && !(reg->type & PTR_UNTRUSTED)) || type == CONST_PTR_TO_MAP; } @@ -434,11 +489,6 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id) func_id == BPF_FUNC_skc_to_tcp_request_sock; } -static bool is_dynptr_ref_function(enum bpf_func_id func_id) -{ - return func_id == BPF_FUNC_dynptr_data; -} - static bool is_sync_callback_calling_kfunc(u32 btf_id); static bool is_async_callback_calling_kfunc(u32 btf_id); static bool is_callback_calling_kfunc(u32 
btf_id); @@ -497,22 +547,6 @@ bool bpf_is_may_goto_insn(struct bpf_insn *insn) return insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO; } -static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id, - const struct bpf_map *map) -{ - int ref_obj_uses = 0; - - if (is_ptr_cast_function(func_id)) - ref_obj_uses++; - if (is_acquire_function(func_id, map)) - ref_obj_uses++; - if (is_dynptr_ref_function(func_id)) - ref_obj_uses++; - - return ref_obj_uses > 1; -} - - static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots) { int allocated_slots = state->allocated_stack / BPF_REG_SIZE; @@ -609,43 +643,44 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type) } } -static bool dynptr_type_refcounted(enum bpf_dynptr_type type) +static bool dynptr_type_referenced(enum bpf_dynptr_type type) { return type == BPF_DYNPTR_TYPE_RINGBUF || type == BPF_DYNPTR_TYPE_FILE; } static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type, - bool first_slot, int dynptr_id); + bool first_slot, int id, int parent_id); static void mark_dynptr_stack_regs(struct bpf_verifier_env *env, struct bpf_reg_state *sreg1, struct bpf_reg_state *sreg2, - enum bpf_dynptr_type type) + enum bpf_dynptr_type type, int parent_id) { int id = ++env->id_gen; - __mark_dynptr_reg(sreg1, type, true, id); - __mark_dynptr_reg(sreg2, type, false, id); + __mark_dynptr_reg(sreg1, type, true, id, parent_id); + __mark_dynptr_reg(sreg2, type, false, id, parent_id); } static void mark_dynptr_cb_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, enum bpf_dynptr_type type) { - __mark_dynptr_reg(reg, type, true, ++env->id_gen); + __mark_dynptr_reg(reg, type, true, ++env->id_gen, 0); } static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi); static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - enum bpf_arg_type arg_type, int insn_idx, 
int clone_ref_obj_id) + enum bpf_arg_type arg_type, int insn_idx, + struct ref_obj_desc *ref_obj, struct bpf_dynptr_desc *dynptr) { struct bpf_func_state *state = bpf_func(env, reg); + int spi, i, err, parent_id = 0; enum bpf_dynptr_type type; - int spi, i, err; spi = dynptr_get_spi(env, reg); if (spi < 0) @@ -676,94 +711,69 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ if (type == BPF_DYNPTR_TYPE_INVALID) return -EINVAL; - mark_dynptr_stack_regs(env, &state->stack[spi].spilled_ptr, - &state->stack[spi - 1].spilled_ptr, type); + if (dynptr->type == BPF_DYNPTR_TYPE_INVALID) { /* dynptr constructors */ + err = validate_ref_obj(env, ref_obj); + if (err) + return err; - if (dynptr_type_refcounted(type)) { - /* The id is used to track proper releasing */ - int id; + /* Track parent's id if the parent is a referenced object */ + parent_id = ref_obj->id; - if (clone_ref_obj_id) - id = clone_ref_obj_id; - else - id = acquire_reference(env, insn_idx); + if (dynptr_type_referenced(type)) { + int id; - if (id < 0) - return id; + /* + * Create an intermediate reference that tracks the referenced + * object for the referenced dynptr. Freeing a referenced dynptr + * through helpers/kfuncs will invalidate all clones. 
+ */ + id = acquire_reference(env, insn_idx, parent_id); + if (id < 0) + return id; - state->stack[spi].spilled_ptr.ref_obj_id = id; - state->stack[spi - 1].spilled_ptr.ref_obj_id = id; + parent_id = id; + } + } else { /* bpf_dynptr_clone() */ + parent_id = dynptr->parent_id; } + mark_dynptr_stack_regs(env, &state->stack[spi].spilled_ptr, + &state->stack[spi - 1].spilled_ptr, type, parent_id); + return 0; } -static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi) +static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_stack_state *stack) { int i; for (i = 0; i < BPF_REG_SIZE; i++) { - state->stack[spi].slot_type[i] = STACK_INVALID; - state->stack[spi - 1].slot_type[i] = STACK_INVALID; + stack[0].slot_type[i] = STACK_INVALID; + stack[1].slot_type[i] = STACK_INVALID; } - bpf_mark_reg_not_init(env, &state->stack[spi].spilled_ptr); - bpf_mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); + bpf_mark_reg_not_init(env, &stack[0].spilled_ptr); + bpf_mark_reg_not_init(env, &stack[1].spilled_ptr); } static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = bpf_func(env, reg); - int spi, ref_obj_id, i; + int spi; - /* - * This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot - * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr - * is safe to do directly. - */ - if (reg->type == CONST_PTR_TO_DYNPTR) { - verifier_bug(env, "CONST_PTR_TO_DYNPTR cannot be released"); - return -EFAULT; - } spi = dynptr_get_spi(env, reg); if (spi < 0) return spi; - if (!dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { - invalidate_dynptr(env, state, spi); - return 0; - } - - ref_obj_id = state->stack[spi].spilled_ptr.ref_obj_id; - - /* If the dynptr has a ref_obj_id, then we need to invalidate - * two things: - * - * 1) Any dynptrs with a matching ref_obj_id (clones) - * 2) Any slices derived from this dynptr. 
+ /* + * For referenced dynptr, release the parent ref which cascades to + * all clones and derived slices. For non-referenced dynptr, only + * the dynptr and slices derived from it will be invalidated. */ - - /* Invalidate any slices associated with this dynptr */ - WARN_ON_ONCE(release_reference(env, ref_obj_id)); - - /* Invalidate any dynptr clones */ - for (i = 1; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].spilled_ptr.ref_obj_id != ref_obj_id) - continue; - - /* it should always be the case that if the ref obj id - * matches then the stack slot also belongs to a - * dynptr - */ - if (state->stack[i].slot_type[0] != STACK_DYNPTR) { - verifier_bug(env, "misconfigured ref_obj_id"); - return -EFAULT; - } - if (state->stack[i].spilled_ptr.dynptr.first_slot) - invalidate_dynptr(env, state, i); - } - - return 0; + reg = &state->stack[spi].spilled_ptr; + return release_reference(env, dynptr_type_referenced(reg->dynptr.type) + ? reg->parent_id + : reg->id); } static void __mark_reg_unknown(const struct bpf_verifier_env *env, @@ -777,12 +787,29 @@ static void mark_reg_invalid(const struct bpf_verifier_env *env, struct bpf_reg_ __mark_reg_unknown(env, reg); } +static int dynptr_ref_cnt(struct bpf_verifier_env *env, int v_parent_id) +{ + struct bpf_stack_state *stack; + struct bpf_func_state *state; + struct bpf_reg_state *reg; + int ref_cnt = 0; + + bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, stack, 1 << STACK_DYNPTR, ({ + if (!stack || stack->slot_type[0] != STACK_DYNPTR) + continue; + if (!stack->spilled_ptr.dynptr.first_slot) + continue; + if (stack->spilled_ptr.parent_id == v_parent_id) + ref_cnt++; + })); + + return ref_cnt; +} + static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi) { - struct bpf_func_state *fstate; - struct bpf_reg_state *dreg; - int i, dynptr_id; + int err = 0; /* We always ensure that STACK_DYNPTR is never set partially, * hence just checking for 
slot_type[0] is enough. This is @@ -796,56 +823,25 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, if (!state->stack[spi].spilled_ptr.dynptr.first_slot) spi = spi + 1; - if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { - int ref_obj_id = state->stack[spi].spilled_ptr.ref_obj_id; - int ref_cnt = 0; - - /* - * A referenced dynptr can be overwritten only if there is at - * least one other dynptr sharing the same ref_obj_id, - * ensuring the reference can still be properly released. - */ - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_DYNPTR) - continue; - if (!state->stack[i].spilled_ptr.dynptr.first_slot) - continue; - if (state->stack[i].spilled_ptr.ref_obj_id == ref_obj_id) - ref_cnt++; - } - - if (ref_cnt <= 1) { - verbose(env, "cannot overwrite referenced dynptr\n"); - return -EINVAL; - } + /* + * A referenced dynptr can be overwritten only if there is at + * least one other dynptr sharing the same virtual ref parent, + * ensuring the reference can still be properly released. + */ + if (dynptr_type_referenced(state->stack[spi].spilled_ptr.dynptr.type) && + dynptr_ref_cnt(env, state->stack[spi].spilled_ptr.parent_id) <= 1) { + verbose(env, "cannot overwrite referenced dynptr\n"); + return -EINVAL; } - mark_stack_slot_scratched(env, spi); - mark_stack_slot_scratched(env, spi - 1); - - /* Writing partially to one dynptr stack slot destroys both. 
*/ - for (i = 0; i < BPF_REG_SIZE; i++) { - state->stack[spi].slot_type[i] = STACK_INVALID; - state->stack[spi - 1].slot_type[i] = STACK_INVALID; + /* Invalidate the dynptr and any derived slices */ + err = release_reference(env, state->stack[spi].spilled_ptr.id); + if (!err) { + mark_stack_slot_scratched(env, spi); + mark_stack_slot_scratched(env, spi - 1); } - dynptr_id = state->stack[spi].spilled_ptr.id; - /* Invalidate any slices associated with this dynptr */ - bpf_for_each_reg_in_vstate(env->cur_state, fstate, dreg, ({ - /* Dynptr slices are only PTR_TO_MEM_OR_NULL and PTR_TO_MEM */ - if (dreg->type != (PTR_TO_MEM | PTR_MAYBE_NULL) && dreg->type != PTR_TO_MEM) - continue; - if (dreg->dynptr_id == dynptr_id) - mark_reg_invalid(env, dreg); - })); - - /* Do not release reference state, we are destroying dynptr on stack, - * not using some helper to release it. Just reset register. - */ - bpf_mark_reg_not_init(env, &state->stack[spi].spilled_ptr); - bpf_mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); - - return 0; + return err; } static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg) @@ -945,7 +941,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, if (spi < 0) return spi; - id = acquire_reference(env, insn_idx); + id = acquire_reference(env, insn_idx, 0); if (id < 0) return id; @@ -961,7 +957,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, else st->type |= PTR_UNTRUSTED; } - st->ref_obj_id = i == 0 ? id : 0; + st->id = i == 0 ? 
id : 0; st->iter.btf = btf; st->iter.btf_id = btf_id; st->iter.state = BPF_ITER_STATE_ACTIVE; @@ -991,7 +987,7 @@ static int unmark_stack_slots_iter(struct bpf_verifier_env *env, struct bpf_reg_state *st = &slot->spilled_ptr; if (i == 0) - WARN_ON_ONCE(release_reference(env, st->ref_obj_id)); + WARN_ON_ONCE(release_reference(env, st->id)); bpf_mark_reg_not_init(env, st); @@ -1047,10 +1043,10 @@ static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_s if (st->type & PTR_UNTRUSTED) return -EPROTO; - /* only main (first) slot has ref_obj_id set */ - if (i == 0 && !st->ref_obj_id) + /* only main (first) slot has id set */ + if (i == 0 && !st->id) return -EINVAL; - if (i != 0 && st->ref_obj_id) + if (i != 0 && st->id) return -EINVAL; if (st->iter.btf != btf || st->iter.btf_id != btf_id) return -EINVAL; @@ -1089,7 +1085,7 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, __mark_reg_known_zero(st); st->type = PTR_TO_STACK; /* we don't have dedicated reg type */ - st->ref_obj_id = id; + st->id = id; st->irq.kfunc_class = kfunc_class; for (i = 0; i < BPF_REG_SIZE; i++) @@ -1123,7 +1119,7 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r return -EINVAL; } - err = release_irq_state(env->cur_state, st->ref_obj_id); + err = release_irq_state(env->cur_state, st->id); WARN_ON_ONCE(err && err != -EACCES); if (err) { int insn_idx = 0; @@ -1187,7 +1183,7 @@ static int is_irq_flag_reg_valid_init(struct bpf_verifier_env *env, struct bpf_r slot = &state->stack[spi]; st = &slot->spilled_ptr; - if (!st->ref_obj_id) + if (!st->id) return -EINVAL; for (i = 0; i < BPF_REG_SIZE; i++) @@ -1339,6 +1335,18 @@ static int copy_stack_state(struct bpf_func_state *dst, const struct bpf_func_st return -ENOMEM; dst->allocated_stack = src->allocated_stack; + + /* copy stack args state */ + n = src->out_stack_arg_cnt; + if (n) { + dst->stack_arg_regs = copy_array(dst->stack_arg_regs, src->stack_arg_regs, n, + sizeof(struct 
bpf_reg_state), + GFP_KERNEL_ACCOUNT); + if (!dst->stack_arg_regs) + return -ENOMEM; + } + + dst->out_stack_arg_cnt = src->out_stack_arg_cnt; return 0; } @@ -1380,6 +1388,23 @@ static int grow_stack_state(struct bpf_verifier_env *env, struct bpf_func_state return 0; } +static int grow_stack_arg_slots(struct bpf_verifier_env *env, + struct bpf_func_state *state, int cnt) +{ + size_t old_n = state->out_stack_arg_cnt; + + if (old_n >= cnt) + return 0; + + state->stack_arg_regs = realloc_array(state->stack_arg_regs, old_n, cnt, + sizeof(struct bpf_reg_state)); + if (!state->stack_arg_regs) + return -ENOMEM; + + state->out_stack_arg_cnt = cnt; + return 0; +} + /* Acquire a pointer id from the env and update the state->refs to include * this new pointer reference. * On success, returns a valid pointer id to associate with the register @@ -1399,7 +1424,7 @@ static struct bpf_reference_state *acquire_reference_state(struct bpf_verifier_e return &state->refs[new_ofs]; } -static int acquire_reference(struct bpf_verifier_env *env, int insn_idx) +static int acquire_reference(struct bpf_verifier_env *env, int insn_idx, int parent_id) { struct bpf_reference_state *s; @@ -1408,6 +1433,7 @@ static int acquire_reference(struct bpf_verifier_env *env, int insn_idx) return -ENOMEM; s->type = REF_TYPE_PTR; s->id = ++env->id_gen; + s->parent_id = parent_id; return s->id; } @@ -1464,17 +1490,25 @@ static void release_reference_state(struct bpf_verifier_state *state, int idx) return; } -static bool find_reference_state(struct bpf_verifier_state *state, int ptr_id) +static bool find_reference_state(struct bpf_verifier_state *state, int id) { int i; - for (i = 0; i < state->acquired_refs; i++) - if (state->refs[i].id == ptr_id) + for (i = 0; i < state->acquired_refs; i++) { + if (state->refs[i].type != REF_TYPE_PTR) + continue; + if (state->refs[i].id == id) return true; + } return false; } +static bool reg_is_referenced(struct bpf_verifier_env *env, const struct bpf_reg_state *reg) +{ + 
return find_reference_state(env->cur_state, reg->id); +} + static int release_lock_state(struct bpf_verifier_state *state, int type, int id, void *ptr) { void *prev_ptr = NULL; @@ -1542,6 +1576,7 @@ static void free_func_state(struct bpf_func_state *state) { if (!state) return; + kfree(state->stack_arg_regs); kfree(state->stack); kfree(state); } @@ -1750,6 +1785,22 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, return &elem->st; } +static const char *reg_arg_name(struct bpf_verifier_env *env, argno_t argno) +{ + char *buf = env->tmp_arg_name; + int len = sizeof(env->tmp_arg_name); + int arg, regno = reg_from_argno(argno); + + if (regno >= 0) { + snprintf(buf, len, "R%d", regno); + } else { + arg = arg_from_argno(argno); + snprintf(buf, len, "*(R11-%u)", (arg - MAX_BPF_FUNC_REG_ARGS) * BPF_REG_SIZE); + } + + return buf; +} + static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; @@ -1758,15 +1809,8 @@ static const int caller_saved[CALLER_SAVED_REGS] = { static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm) { reg->var_off = tnum_const(imm); - reg->smin_value = (s64)imm; - reg->smax_value = (s64)imm; - reg->umin_value = imm; - reg->umax_value = imm; - - reg->s32_min_value = (s32)imm; - reg->s32_max_value = (s32)imm; - reg->u32_min_value = (u32)imm; - reg->u32_max_value = (u32)imm; + reg->r64 = cnum64_from_urange(imm, imm); + reg->r32 = cnum32_from_urange((u32)imm, (u32)imm); } /* Mark the unknown part of a register (variable offset or scalar value) as @@ -1778,17 +1822,14 @@ static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) memset(((u8 *)reg) + sizeof(reg->type), 0, offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type)); reg->id = 0; - reg->ref_obj_id = 0; + reg->parent_id = 0; ___mark_reg_known(reg, imm); } static void __mark_reg32_known(struct bpf_reg_state *reg, u64 imm) { reg->var_off = tnum_const_subreg(reg->var_off, imm); - 
reg->s32_min_value = (s32)imm; - reg->s32_max_value = (s32)imm; - reg->u32_min_value = (u32)imm; - reg->u32_max_value = (u32)imm; + reg->r32 = cnum32_from_urange((u32)imm, (u32)imm); } /* Mark the 'variable offset' part of a register as zero. This should be @@ -1816,7 +1857,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env, } static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type, - bool first_slot, int dynptr_id) + bool first_slot, int id, int parent_id) { /* reg->type has no meaning for STACK_DYNPTR, but when we set reg for * callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply @@ -1825,7 +1866,8 @@ static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type ty __mark_reg_known_zero(reg); reg->type = CONST_PTR_TO_DYNPTR; /* Give each dynptr a unique id to uniquely associate slices to it. */ - reg->id = dynptr_id; + reg->id = id; + reg->parent_id = parent_id; reg->dynptr.type = type; reg->dynptr.first_slot = first_slot; } @@ -1899,34 +1941,21 @@ static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg, tnum_equals_const(reg->var_off, 0); } -/* Reset the min/max bounds of a register */ -static void __mark_reg_unbounded(struct bpf_reg_state *reg) +static void __mark_reg32_unbounded(struct bpf_reg_state *reg) { - reg->smin_value = S64_MIN; - reg->smax_value = S64_MAX; - reg->umin_value = 0; - reg->umax_value = U64_MAX; - - reg->s32_min_value = S32_MIN; - reg->s32_max_value = S32_MAX; - reg->u32_min_value = 0; - reg->u32_max_value = U32_MAX; + reg->r32 = CNUM32_UNBOUNDED; } static void __mark_reg64_unbounded(struct bpf_reg_state *reg) { - reg->smin_value = S64_MIN; - reg->smax_value = S64_MAX; - reg->umin_value = 0; - reg->umax_value = U64_MAX; + reg->r64 = CNUM64_UNBOUNDED; } -static void __mark_reg32_unbounded(struct bpf_reg_state *reg) +/* Reset the min/max bounds of a register */ +static void __mark_reg_unbounded(struct bpf_reg_state *reg) { - reg->s32_min_value = S32_MIN; - 
reg->s32_max_value = S32_MAX; - reg->u32_min_value = 0; - reg->u32_max_value = U32_MAX; + __mark_reg64_unbounded(reg); + __mark_reg32_unbounded(reg); } static void reset_reg64_and_tnum(struct bpf_reg_state *reg) @@ -1941,19 +1970,32 @@ static void reset_reg32_and_tnum(struct bpf_reg_state *reg) reg->var_off = tnum_unknown; } -static void __update_reg32_bounds(struct bpf_reg_state *reg) +static struct cnum32 cnum32_from_tnum(struct tnum tnum) { - struct tnum var32_off = tnum_subreg(reg->var_off); + tnum = tnum_subreg(tnum); + if ((tnum.mask & S32_MIN) || (tnum.value & S32_MIN)) + /* min signed is max(sign bit) | min(other bits) */ + /* max signed is min(sign bit) | max(other bits) */ + return cnum32_from_srange(tnum.value | (tnum.mask & S32_MIN), + tnum.value | (tnum.mask & S32_MAX)); + else + return cnum32_from_urange(tnum.value, (tnum.value | tnum.mask)); +} - /* min signed is max(sign bit) | min(other bits) */ - reg->s32_min_value = max_t(s32, reg->s32_min_value, - var32_off.value | (var32_off.mask & S32_MIN)); - /* max signed is min(sign bit) | max(other bits) */ - reg->s32_max_value = min_t(s32, reg->s32_max_value, - var32_off.value | (var32_off.mask & S32_MAX)); - reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)var32_off.value); - reg->u32_max_value = min(reg->u32_max_value, - (u32)(var32_off.value | var32_off.mask)); +static struct cnum64 cnum64_from_tnum(struct tnum tnum) +{ + if ((tnum.mask & S64_MIN) || (tnum.value & S64_MIN)) + /* min signed is max(sign bit) | min(other bits) */ + /* max signed is min(sign bit) | max(other bits) */ + return cnum64_from_srange(tnum.value | (tnum.mask & S64_MIN), + tnum.value | (tnum.mask & S64_MAX)); + else + return cnum64_from_urange(tnum.value, (tnum.value | tnum.mask)); +} + +static void __update_reg32_bounds(struct bpf_reg_state *reg) +{ + cnum32_intersect_with(&reg->r32, cnum32_from_tnum(reg->var_off)); } static void __update_reg64_bounds(struct bpf_reg_state *reg) @@ -1961,26 +2003,18 @@ static void
__update_reg64_bounds(struct bpf_reg_state *reg) u64 tnum_next, tmax; bool umin_in_tnum; - /* min signed is max(sign bit) | min(other bits) */ - reg->smin_value = max_t(s64, reg->smin_value, - reg->var_off.value | (reg->var_off.mask & S64_MIN)); - /* max signed is min(sign bit) | max(other bits) */ - reg->smax_value = min_t(s64, reg->smax_value, - reg->var_off.value | (reg->var_off.mask & S64_MAX)); - reg->umin_value = max(reg->umin_value, reg->var_off.value); - reg->umax_value = min(reg->umax_value, - reg->var_off.value | reg->var_off.mask); + cnum64_intersect_with(&reg->r64, cnum64_from_tnum(reg->var_off)); /* Check if u64 and tnum overlap in a single value */ - tnum_next = tnum_step(reg->var_off, reg->umin_value); - umin_in_tnum = (reg->umin_value & ~reg->var_off.mask) == reg->var_off.value; + tnum_next = tnum_step(reg->var_off, reg_umin(reg)); + umin_in_tnum = (reg_umin(reg) & ~reg->var_off.mask) == reg->var_off.value; tmax = reg->var_off.value | reg->var_off.mask; - if (umin_in_tnum && tnum_next > reg->umax_value) { + if (umin_in_tnum && tnum_next > reg_umax(reg)) { /* The u64 range and the tnum only overlap in umin. * u64: ---[xxxxxx]----- * tnum: --xx----------x- */ - ___mark_reg_known(reg, reg->umin_value); + ___mark_reg_known(reg, reg_umin(reg)); } else if (!umin_in_tnum && tnum_next == tmax) { /* The u64 range and the tnum only overlap in the maximum value * represented by the tnum, called tmax. @@ -1988,8 +2022,8 @@ static void __update_reg64_bounds(struct bpf_reg_state *reg) * tnum: xx-----x-------- */ ___mark_reg_known(reg, tmax); - } else if (!umin_in_tnum && tnum_next <= reg->umax_value && - tnum_step(reg->var_off, tnum_next) > reg->umax_value) { + } else if (!umin_in_tnum && tnum_next <= reg_umax(reg) && + tnum_step(reg->var_off, tnum_next) > reg_umax(reg)) { /* The u64 range and the tnum only overlap in between umin * (excluded) and umax.
* u64: ---[xxxxxx]----- @@ -2005,329 +2039,19 @@ static void __update_reg_bounds(struct bpf_reg_state *reg) __update_reg64_bounds(reg); } -/* Uses signed min/max values to inform unsigned, and vice-versa */ static void deduce_bounds_32_from_64(struct bpf_reg_state *reg) { - /* If upper 32 bits of u64/s64 range don't change, we can use lower 32 - * bits to improve our u32/s32 boundaries. - * - * E.g., the case where we have upper 32 bits as zero ([10, 20] in - * u64) is pretty trivial, it's obvious that in u32 we'll also have - * [10, 20] range. But this property holds for any 64-bit range as - * long as upper 32 bits in that entire range of values stay the same. - * - * E.g., u64 range [0x10000000A, 0x10000000F] ([4294967306, 4294967311] - * in decimal) has the same upper 32 bits throughout all the values in - * that range. As such, lower 32 bits form a valid [0xA, 0xF] ([10, 15]) - * range. - * - * Note also, that [0xA, 0xF] is a valid range both in u32 and in s32, - * following the rules outlined below about u64/s64 correspondence - * (which equally applies to u32 vs s32 correspondence). In general it - * depends on actual hexadecimal values of 32-bit range. They can form - * only valid u32, or only valid s32 ranges in some cases. - * - * So we use all these insights to derive bounds for subregisters here. 
- */ - if ((reg->umin_value >> 32) == (reg->umax_value >> 32)) { - /* u64 to u32 casting preserves validity of low 32 bits as - * a range, if upper 32 bits are the same - */ - reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->umin_value); - reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->umax_value); - - if ((s32)reg->umin_value <= (s32)reg->umax_value) { - reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value); - reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value); - } - } - if ((reg->smin_value >> 32) == (reg->smax_value >> 32)) { - /* low 32 bits should form a proper u32 range */ - if ((u32)reg->smin_value <= (u32)reg->smax_value) { - reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->smin_value); - reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->smax_value); - } - /* low 32 bits should form a proper s32 range */ - if ((s32)reg->smin_value <= (s32)reg->smax_value) { - reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value); - reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value); - } - } - /* Special case where upper bits form a small sequence of two - * sequential numbers (in 32-bit unsigned space, so 0xffffffff to - * 0x00000000 is also valid), while lower bits form a proper s32 range - * going from negative numbers to positive numbers. E.g., let's say we - * have s64 range [-1, 1] ([0xffffffffffffffff, 0x0000000000000001]). - * Possible s64 values are {-1, 0, 1} ({0xffffffffffffffff, - * 0x0000000000000000, 0x00000000000001}). Ignoring upper 32 bits, - * we still get a valid s32 range [-1, 1] ([0xffffffff, 0x00000001]). - * Note that it doesn't have to be 0xffffffff going to 0x00000000 in - * upper 32 bits. As a random example, s64 range - * [0xfffffff0fffffff0; 0xfffffff100000010], forms a valid s32 range - * [-16, 16] ([0xfffffff0; 0x00000010]) in its 32 bit subregister. 
- */ - if ((u32)(reg->umin_value >> 32) + 1 == (u32)(reg->umax_value >> 32) && - (s32)reg->umin_value < 0 && (s32)reg->umax_value >= 0) { - reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value); - reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value); - } - if ((u32)(reg->smin_value >> 32) + 1 == (u32)(reg->smax_value >> 32) && - (s32)reg->smin_value < 0 && (s32)reg->smax_value >= 0) { - reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value); - reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value); - } -} - -static void deduce_bounds_32_from_32(struct bpf_reg_state *reg) -{ - /* if u32 range forms a valid s32 range (due to matching sign bit), - * try to learn from that - */ - if ((s32)reg->u32_min_value <= (s32)reg->u32_max_value) { - reg->s32_min_value = max_t(s32, reg->s32_min_value, reg->u32_min_value); - reg->s32_max_value = min_t(s32, reg->s32_max_value, reg->u32_max_value); - } - /* If we cannot cross the sign boundary, then signed and unsigned bounds - * are the same, so combine. This works even in the negative case, e.g. - * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff. - */ - if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) { - reg->u32_min_value = max_t(u32, reg->s32_min_value, reg->u32_min_value); - reg->u32_max_value = min_t(u32, reg->s32_max_value, reg->u32_max_value); - } else { - if (reg->u32_max_value < (u32)reg->s32_min_value) { - /* See __reg64_deduce_bounds() for detailed explanation. 
- * Refine ranges in the following situation: - * - * 0 U32_MAX - * | [xxxxxxxxxxxxxx u32 range xxxxxxxxxxxxxx] | - * |----------------------------|----------------------------| - * |xxxxx s32 range xxxxxxxxx] [xxxxxxx| - * 0 S32_MAX S32_MIN -1 - */ - reg->s32_min_value = (s32)reg->u32_min_value; - reg->u32_max_value = min_t(u32, reg->u32_max_value, reg->s32_max_value); - } else if ((u32)reg->s32_max_value < reg->u32_min_value) { - /* - * 0 U32_MAX - * | [xxxxxxxxxxxxxx u32 range xxxxxxxxxxxxxx] | - * |----------------------------|----------------------------| - * |xxxxxxxxx] [xxxxxxxxxxxx s32 range | - * 0 S32_MAX S32_MIN -1 - */ - reg->s32_max_value = (s32)reg->u32_max_value; - reg->u32_min_value = max_t(u32, reg->u32_min_value, reg->s32_min_value); - } - } -} - -static void deduce_bounds_64_from_64(struct bpf_reg_state *reg) -{ - /* If u64 range forms a valid s64 range (due to matching sign bit), - * try to learn from that. Let's do a bit of ASCII art to see when - * this is happening. Let's take u64 range first: - * - * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX - * |-------------------------------|--------------------------------| - * - * Valid u64 range is formed when umin and umax are anywhere in the - * range [0, U64_MAX], and umin <= umax. u64 case is simple and - * straightforward. Let's see how s64 range maps onto the same range - * of values, annotated below the line for comparison: - * - * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX - * |-------------------------------|--------------------------------| - * 0 S64_MAX S64_MIN -1 - * - * So s64 values basically start in the middle and they are logically - * contiguous to the right of it, wrapping around from -1 to 0, and - * then finishing as S64_MAX (0x7fffffffffffffff) right before - * S64_MIN. We can try drawing the continuity of u64 vs s64 values - * more visually as mapped to sign-agnostic range of hex values. 
- * - * u64 start u64 end - * _______________________________________________________________ - * / \ - * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX - * |-------------------------------|--------------------------------| - * 0 S64_MAX S64_MIN -1 - * / \ - * >------------------------------ -------------------------------> - * s64 continues... s64 end s64 start s64 "midpoint" - * - * What this means is that, in general, we can't always derive - * something new about u64 from any random s64 range, and vice versa. - * - * But we can do that in two particular cases. One is when entire - * u64/s64 range is *entirely* contained within left half of the above - * diagram or when it is *entirely* contained in the right half. I.e.: - * - * |-------------------------------|--------------------------------| - * ^ ^ ^ ^ - * A B C D - * - * [A, B] and [C, D] are contained entirely in their respective halves - * and form valid contiguous ranges as both u64 and s64 values. [A, B] - * will be non-negative both as u64 and s64 (and in fact it will be - * identical ranges no matter the signedness). [C, D] treated as s64 - * will be a range of negative values, while in u64 it will be - * non-negative range of values larger than 0x8000000000000000. - * - * Now, any other range here can't be represented in both u64 and s64 - * simultaneously. E.g., [A, C], [A, D], [B, C], [B, D] are valid - * contiguous u64 ranges, but they are discontinuous in s64. [B, C] - * in s64 would be properly presented as [S64_MIN, C] and [B, S64_MAX], - * for example. Similarly, valid s64 range [D, A] (going from negative - * to positive values), would be two separate [D, U64_MAX] and [0, A] - * ranges as u64. Currently reg_state can't represent two segments per - * numeric domain, so in such situations we can only derive maximal - * possible range ([0, U64_MAX] for u64, and [S64_MIN, S64_MAX] for s64). 
- * - * So we use these facts to derive umin/umax from smin/smax and vice - * versa only if they stay within the same "half". This is equivalent - * to checking sign bit: lower half will have sign bit as zero, upper - * half have sign bit 1. Below in code we simplify this by just - * casting umin/umax as smin/smax and checking if they form valid - * range, and vice versa. Those are equivalent checks. - */ - if ((s64)reg->umin_value <= (s64)reg->umax_value) { - reg->smin_value = max_t(s64, reg->smin_value, reg->umin_value); - reg->smax_value = min_t(s64, reg->smax_value, reg->umax_value); - } - /* If we cannot cross the sign boundary, then signed and unsigned bounds - * are the same, so combine. This works even in the negative case, e.g. - * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff. - */ - if ((u64)reg->smin_value <= (u64)reg->smax_value) { - reg->umin_value = max_t(u64, reg->smin_value, reg->umin_value); - reg->umax_value = min_t(u64, reg->smax_value, reg->umax_value); - } else { - /* If the s64 range crosses the sign boundary, then it's split - * between the beginning and end of the U64 domain. In that - * case, we can derive new bounds if the u64 range overlaps - * with only one end of the s64 range. - * - * In the following example, the u64 range overlaps only with - * positive portion of the s64 range. - * - * 0 U64_MAX - * | [xxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxx] | - * |----------------------------|----------------------------| - * |xxxxx s64 range xxxxxxxxx] [xxxxxxx| - * 0 S64_MAX S64_MIN -1 - * - * We can thus derive the following new s64 and u64 ranges. - * - * 0 U64_MAX - * | [xxxxxx u64 range xxxxx] | - * |----------------------------|----------------------------| - * | [xxxxxx s64 range xxxxx] | - * 0 S64_MAX S64_MIN -1 - * - * If they overlap in two places, we can't derive anything - * because reg_state can't represent two ranges per numeric - * domain. 
- * 0 U64_MAX - * | [xxxxxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxxxxx] | - * |----------------------------|----------------------------| - * |xxxxx s64 range xxxxxxxxx] [xxxxxxxxxx| - * 0 S64_MAX S64_MIN -1 - * - * The first condition below corresponds to the first diagram - * above. - */ - if (reg->umax_value < (u64)reg->smin_value) { - reg->smin_value = (s64)reg->umin_value; - reg->umax_value = min_t(u64, reg->umax_value, reg->smax_value); - } else if ((u64)reg->smax_value < reg->umin_value) { - /* This second condition considers the case where the u64 range - * overlaps with the negative portion of the s64 range: - * - * 0 U64_MAX - * | [xxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxx] | - * |----------------------------|----------------------------| - * |xxxxxxxxx] [xxxxxxxxxxxx s64 range | - * 0 S64_MAX S64_MIN -1 - */ - reg->smax_value = (s64)reg->umax_value; - reg->umin_value = max_t(u64, reg->umin_value, reg->smin_value); - } - } + cnum32_intersect_with(&reg->r32, cnum32_from_cnum64(reg->r64)); } static void deduce_bounds_64_from_32(struct bpf_reg_state *reg) { - /* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit - * values on both sides of 64-bit range in hope to have tighter range. - * E.g., if r1 is [0x1'00000000, 0x3'80000000], and we learn from - * 32-bit signed > 0 operation that s32 bounds are now [1; 0x7fffffff]. - * With this, we can substitute 1 as low 32-bits of _low_ 64-bit bound - * (0x100000000 -> 0x100000001) and 0x7fffffff as low 32-bits of - * _high_ 64-bit bound (0x380000000 -> 0x37fffffff) and arrive at a - * better overall bounds for r1 as [0x1'000000001; 0x3'7fffffff]. - * We just need to make sure that derived bounds we are intersecting - * with are well-formed ranges in respective s64 or u64 domain, just - * like we do with similar kinds of 32-to-64 or 64-to-32 adjustments.
- */ - __u64 new_umin, new_umax; - __s64 new_smin, new_smax; - - /* u32 -> u64 tightening, it's always well-formed */ - new_umin = (reg->umin_value & ~0xffffffffULL) | reg->u32_min_value; - new_umax = (reg->umax_value & ~0xffffffffULL) | reg->u32_max_value; - reg->umin_value = max_t(u64, reg->umin_value, new_umin); - reg->umax_value = min_t(u64, reg->umax_value, new_umax); - /* u32 -> s64 tightening, u32 range embedded into s64 preserves range validity */ - new_smin = (reg->smin_value & ~0xffffffffULL) | reg->u32_min_value; - new_smax = (reg->smax_value & ~0xffffffffULL) | reg->u32_max_value; - reg->smin_value = max_t(s64, reg->smin_value, new_smin); - reg->smax_value = min_t(s64, reg->smax_value, new_smax); - - /* Here we would like to handle a special case after sign extending load, - * when upper bits for a 64-bit range are all 1s or all 0s. - * - * Upper bits are all 1s when register is in a range: - * [0xffff_ffff_0000_0000, 0xffff_ffff_ffff_ffff] - * Upper bits are all 0s when register is in a range: - * [0x0000_0000_0000_0000, 0x0000_0000_ffff_ffff] - * Together this forms are continuous range: - * [0xffff_ffff_0000_0000, 0x0000_0000_ffff_ffff] - * - * Now, suppose that register range is in fact tighter: - * [0xffff_ffff_8000_0000, 0x0000_0000_ffff_ffff] (R) - * Also suppose that it's 32-bit range is positive, - * meaning that lower 32-bits of the full 64-bit register - * are in the range: - * [0x0000_0000, 0x7fff_ffff] (W) - * - * If this happens, then any value in a range: - * [0xffff_ffff_0000_0000, 0xffff_ffff_7fff_ffff] - * is smaller than a lowest bound of the range (R): - * 0xffff_ffff_8000_0000 - * which means that upper bits of the full 64-bit register - * can't be all 1s, when lower bits are in range (W). - * - * Note that: - * - 0xffff_ffff_8000_0000 == (s64)S32_MIN - * - 0x0000_0000_7fff_ffff == (s64)S32_MAX - * These relations are used in the conditions below. 
- */ - if (reg->s32_min_value >= 0 && reg->smin_value >= S32_MIN && reg->smax_value <= S32_MAX) { - reg->smin_value = reg->s32_min_value; - reg->smax_value = reg->s32_max_value; - reg->umin_value = reg->s32_min_value; - reg->umax_value = reg->s32_max_value; - reg->var_off = tnum_intersect(reg->var_off, - tnum_range(reg->smin_value, reg->smax_value)); - } + reg->r64 = cnum64_cnum32_intersect(reg->r64, reg->r32); } static void __reg_deduce_bounds(struct bpf_reg_state *reg) { - deduce_bounds_64_from_64(reg); deduce_bounds_32_from_64(reg); - deduce_bounds_32_from_32(reg); deduce_bounds_64_from_32(reg); } @@ -2335,11 +2059,11 @@ static void __reg_deduce_bounds(struct bpf_reg_state *reg) static void __reg_bound_offset(struct bpf_reg_state *reg) { struct tnum var64_off = tnum_intersect(reg->var_off, - tnum_range(reg->umin_value, - reg->umax_value)); + tnum_range(reg_umin(reg), + reg_umax(reg))); struct tnum var32_off = tnum_intersect(tnum_subreg(var64_off), - tnum_range(reg->u32_min_value, - reg->u32_max_value)); + tnum_range(reg_u32_min(reg), + reg_u32_max(reg))); reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off); } @@ -2365,35 +2089,25 @@ static void reg_bounds_sync(struct bpf_reg_state *reg) __update_reg_bounds(reg); } -static bool range_bounds_violation(struct bpf_reg_state *reg) -{ - return (reg->umin_value > reg->umax_value || reg->smin_value > reg->smax_value || - reg->u32_min_value > reg->u32_max_value || - reg->s32_min_value > reg->s32_max_value); -} - static bool const_tnum_range_mismatch(struct bpf_reg_state *reg) { - u64 uval = reg->var_off.value; - s64 sval = (s64)uval; - if (!tnum_is_const(reg->var_off)) return false; - return reg->umin_value != uval || reg->umax_value != uval || - reg->smin_value != sval || reg->smax_value != sval; + return !cnum64_is_const(reg->r64) || reg->r64.base != reg->var_off.value; } static bool const_tnum_range_mismatch_32(struct bpf_reg_state *reg) { - u32 uval32 = tnum_subreg(reg->var_off).value; - s32 sval32 = 
(s32)uval32; - if (!tnum_subreg_is_const(reg->var_off)) return false; - return reg->u32_min_value != uval32 || reg->u32_max_value != uval32 || - reg->s32_min_value != sval32 || reg->s32_max_value != sval32; + return !cnum32_is_const(reg->r32) || reg->r32.base != tnum_subreg(reg->var_off).value; +} + +static bool range_bounds_violation(struct bpf_reg_state *reg) +{ + return cnum32_is_empty(reg->r32) || cnum64_is_empty(reg->r64); } static int reg_bounds_sanity_check(struct bpf_verifier_env *env, @@ -2418,12 +2132,11 @@ static int reg_bounds_sanity_check(struct bpf_verifier_env *env, return 0; out: - verifier_bug(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] " - "s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)", - ctx, msg, reg->umin_value, reg->umax_value, - reg->smin_value, reg->smax_value, - reg->u32_min_value, reg->u32_max_value, - reg->s32_min_value, reg->s32_max_value, + verifier_bug(env, "REG INVARIANTS VIOLATION (%s): %s r64={.base=%#llx, .size=%#llx} " + "r32={.base=%#x, .size=%#x} var_off=(%#llx, %#llx)", + ctx, msg, + reg->r64.base, reg->r64.size, + reg->r32.base, reg->r32.size, reg->var_off.value, reg->var_off.mask); if (env->test_reg_invariants) return -EFAULT; @@ -2431,44 +2144,15 @@ out: return 0; } -static bool __reg32_bound_s64(s32 a) -{ - return a >= 0 && a <= S32_MAX; -} - -static void __reg_assign_32_into_64(struct bpf_reg_state *reg) -{ - reg->umin_value = reg->u32_min_value; - reg->umax_value = reg->u32_max_value; - - /* Attempt to pull 32-bit signed bounds into 64-bit bounds but must - * be positive otherwise set to worse case bounds and refine later - * from tnum. - */ - if (__reg32_bound_s64(reg->s32_min_value) && - __reg32_bound_s64(reg->s32_max_value)) { - reg->smin_value = reg->s32_min_value; - reg->smax_value = reg->s32_max_value; - } else { - reg->smin_value = 0; - reg->smax_value = U32_MAX; - } -} - /* Mark a register as having a completely unknown (scalar) value. 
*/ void bpf_mark_reg_unknown_imprecise(struct bpf_reg_state *reg) { - /* - * Clear type, off, and union(map_ptr, range) and - * padding between 'type' and union - */ - memset(reg, 0, offsetof(struct bpf_reg_state, var_off)); + s32 subreg_def = reg->subreg_def; + + memset(reg, 0, sizeof(*reg)); reg->type = SCALAR_VALUE; - reg->id = 0; - reg->ref_obj_id = 0; reg->var_off = tnum_unknown; - reg->frameno = 0; - reg->precise = false; + reg->subreg_def = subreg_def; __mark_reg_unbounded(reg); } @@ -2496,11 +2180,12 @@ static int __mark_reg_s32_range(struct bpf_verifier_env *env, { struct bpf_reg_state *reg = regs + regno; - reg->s32_min_value = max_t(s32, reg->s32_min_value, s32_min); - reg->s32_max_value = min_t(s32, reg->s32_max_value, s32_max); - - reg->smin_value = max_t(s64, reg->smin_value, s32_min); - reg->smax_value = min_t(s64, reg->smax_value, s32_max); + reg_set_srange32(reg, + max_t(s32, reg_s32_min(reg), s32_min), + min_t(s32, reg_s32_max(reg), s32_max)); + reg_set_srange64(reg, + max_t(s64, reg_smin(reg), s32_min), + min_t(s64, reg_smax(reg), s32_max)); reg_bounds_sync(reg); @@ -3295,50 +2980,13 @@ out: return ret; } -static int mark_stack_slot_obj_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - int spi, int nr_slots) +static void mark_stack_slots_scratched(struct bpf_verifier_env *env, + int spi, int nr_slots) { int i; for (i = 0; i < nr_slots; i++) mark_stack_slot_scratched(env, spi - i); - return 0; -} - -static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg) -{ - int spi; - - /* For CONST_PTR_TO_DYNPTR, it must have already been done by - * check_reg_arg in check_helper_call and mark_btf_func_reg_size in - * check_kfunc_call. - */ - if (reg->type == CONST_PTR_TO_DYNPTR) - return 0; - spi = dynptr_get_spi(env, reg); - if (spi < 0) - return spi; - /* Caller ensures dynptr is valid and initialized, which means spi is in - * bounds and spi is the first dynptr slot. Simply mark stack slot as - * read. 
- */ - return mark_stack_slot_obj_read(env, reg, spi, BPF_DYNPTR_NR_SLOTS); -} - -static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - int spi, int nr_slots) -{ - return mark_stack_slot_obj_read(env, reg, spi, nr_slots); -} - -static int mark_irq_flag_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg) -{ - int spi; - - spi = irq_flag_get_spi(env, reg); - if (spi < 0) - return spi; - return mark_stack_slot_obj_read(env, reg, spi, 1); } /* This function is supposed to be used by the following 32-bit optimization @@ -3491,17 +3139,12 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, return __check_reg_arg(env, state->regs, regno, t); } -static int insn_stack_access_flags(int frameno, int spi) -{ - return INSN_F_STACK_ACCESS | (spi << INSN_F_SPI_SHIFT) | frameno; -} - static void mark_indirect_target(struct bpf_verifier_env *env, int idx) { env->insn_aux_data[idx].indirect_target = true; } -#define LR_FRAMENO_BITS 3 +#define LR_FRAMENO_BITS 4 #define LR_SPI_BITS 6 #define LR_ENTRY_BITS (LR_SPI_BITS + LR_FRAMENO_BITS + 1) #define LR_SIZE_BITS 4 @@ -3510,7 +3153,11 @@ static void mark_indirect_target(struct bpf_verifier_env *env, int idx) #define LR_SIZE_MASK ((1ull << LR_SIZE_BITS) - 1) #define LR_SPI_OFF LR_FRAMENO_BITS #define LR_IS_REG_OFF (LR_SPI_BITS + LR_FRAMENO_BITS) -#define LINKED_REGS_MAX 6 +#define LINKED_REGS_MAX 5 + +static_assert(MAX_CALL_FRAMES <= (1 << LR_FRAMENO_BITS)); +static_assert(LINKED_REGS_MAX < (1 << LR_SIZE_BITS)); +static_assert(LINKED_REGS_MAX * LR_ENTRY_BITS + LR_SIZE_BITS <= 64); struct linked_reg { u8 frameno; @@ -3534,10 +3181,11 @@ static struct linked_reg *linked_regs_push(struct linked_regs *s) return NULL; } -/* Use u64 as a vector of 6 10-bit values, use first 4-bits to track +/* + * Use u64 as a vector of 5 11-bit values, use first 4-bits to track * number of elements currently in stack. 
- * Pack one history entry for linked registers as 10 bits in the following format: - * - 3-bits frameno + * Pack one history entry for linked registers as 11 bits in the following format: + * - 4-bits frameno * - 6-bits spi_or_reg * - 1-bit is_reg */ @@ -3733,12 +3381,6 @@ static void assign_scalar_id_before_mov(struct bpf_verifier_env *env, src_reg->id = ++env->id_gen; } -/* Copy src state preserving dst->parent and dst->live fields */ -static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src) -{ - *dst = *src; -} - static void save_register_state(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi, struct bpf_reg_state *reg, @@ -3746,7 +3388,7 @@ static void save_register_state(struct bpf_verifier_env *env, { int i; - copy_register_state(&state->stack[spi].spilled_ptr, reg); + state->stack[spi].spilled_ptr = *reg; for (i = BPF_REG_SIZE; i > BPF_REG_SIZE - size; i--) state->stack[spi].slot_type[i - 1] = STACK_SPILL; @@ -3763,7 +3405,7 @@ static bool is_bpf_st_mem(struct bpf_insn *insn) static int get_reg_width(struct bpf_reg_state *reg) { - return fls64(reg->umax_value); + return fls64(reg_umax(reg)); } /* See comment for mark_fastcall_pattern_for_call() */ @@ -3816,7 +3458,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; struct bpf_reg_state *reg = NULL; - int insn_flags = insn_stack_access_flags(state->frameno, spi); + int insn_flags = INSN_F_STACK_ACCESS; + int hist_spi = spi, hist_frame = state->frameno; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, * so it's aligned access and [off, off + size) are within stack limits @@ -3912,11 +3555,12 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, } if (insn_flags) - return bpf_push_jmp_history(env, env->cur_state, insn_flags, 0); + return bpf_push_jmp_history(env, env->cur_state, 
insn_flags, + hist_spi, hist_frame, 0); return 0; } -/* Write the stack: 'stack[ptr_regno + off] = value_regno'. 'ptr_regno' is +/* Write the stack: 'stack[ptr_reg + off] = value_regno'. 'ptr_reg' is * known to contain a variable offset. * This function checks whether the write is permitted and conservatively * tracks the effects of the write, considering that each stack slot in the @@ -3937,13 +3581,13 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, static int check_stack_write_var_off(struct bpf_verifier_env *env, /* func where register points to */ struct bpf_func_state *state, - int ptr_regno, int off, int size, + struct bpf_reg_state *ptr_reg, int off, int size, int value_regno, int insn_idx) { struct bpf_func_state *cur; /* state of the current function */ int min_off, max_off; int i, err; - struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL; + struct bpf_reg_state *value_reg = NULL; struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; bool writing_zero = false; /* set if the fact that we're writing a zero is used to let any @@ -3952,9 +3596,8 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, bool zero_used = false; cur = env->cur_state->frame[env->cur_state->curframe]; - ptr_reg = &cur->regs[ptr_regno]; - min_off = ptr_reg->smin_value + off; - max_off = ptr_reg->smax_value + off + size; + min_off = reg_smin(ptr_reg) + off; + max_off = reg_smax(ptr_reg) + off + size; if (value_regno >= 0) value_reg = &cur->regs[value_regno]; if ((value_reg && bpf_register_is_null(value_reg)) || @@ -4109,7 +3752,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; struct bpf_reg_state *reg; u8 *stype, type; - int insn_flags = insn_stack_access_flags(reg_state->frameno, spi); + int insn_flags = INSN_F_STACK_ACCESS; + int hist_spi = spi, hist_frame = reg_state->frameno; stype = reg_state->stack[spi].slot_type; reg = &reg_state->stack[spi].spilled_ptr; @@ -4146,7
+3790,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, * with the destination register on fill. */ assign_scalar_id_before_mov(env, reg); - copy_register_state(&state->regs[dst_regno], reg); + state->regs[dst_regno] = *reg; state->regs[dst_regno].subreg_def = subreg_def; /* Break the relation on a narrowing fill. @@ -4201,7 +3845,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, * with the destination register on fill. */ assign_scalar_id_before_mov(env, reg); - copy_register_state(&state->regs[dst_regno], reg); + state->regs[dst_regno] = *reg; /* mark reg as written since spilled pointer state likely * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions @@ -4240,7 +3884,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, insn_flags = 0; /* we are not restoring spilled register */ } if (insn_flags) - return bpf_push_jmp_history(env, env->cur_state, insn_flags, 0); + return bpf_push_jmp_history(env, env->cur_state, insn_flags, + hist_spi, hist_frame, 0); return 0; } @@ -4249,8 +3894,8 @@ enum bpf_access_src { ACCESS_HELPER = 2, /* the access is performed by a helper */ }; -static int check_stack_range_initialized(struct bpf_verifier_env *env, - int regno, int off, int access_size, +static int check_stack_range_initialized(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + argno_t argno, int off, int access_size, bool zero_size_allowed, enum bpf_access_type type, struct bpf_call_arg_meta *meta); @@ -4260,37 +3905,35 @@ static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) return cur_regs(env) + regno; } -/* Read the stack at 'ptr_regno + off' and put the result into the register +/* Read the stack at 'reg + off' and put the result into the register * 'dst_regno'. - * 'off' includes the pointer register's fixed offset(i.e. 'ptr_regno.off'), + * 'off' includes the pointer register's fixed offset(i.e. 
'reg->off'), * but not its variable offset. * 'size' is assumed to be <= reg size and the access is assumed to be aligned. * * As opposed to check_stack_read_fixed_off, this function doesn't deal with * filling registers (i.e. reads of spilled register cannot be detected when * the offset is not fixed). We conservatively mark 'dst_regno' as containing - * SCALAR_VALUE. That's why we assert that the 'ptr_regno' has a variable + * SCALAR_VALUE. That's why we assert that the 'reg' has a variable * offset; for a fixed offset check_stack_read_fixed_off should be used * instead. */ -static int check_stack_read_var_off(struct bpf_verifier_env *env, - int ptr_regno, int off, int size, int dst_regno) +static int check_stack_read_var_off(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + argno_t ptr_argno, int off, int size, int dst_regno) { - /* The state of the source register. */ - struct bpf_reg_state *reg = reg_state(env, ptr_regno); struct bpf_func_state *ptr_state = bpf_func(env, reg); int err; int min_off, max_off; /* Note that we pass a NULL meta, so raw access will not be permitted. */ - err = check_stack_range_initialized(env, ptr_regno, off, size, + err = check_stack_range_initialized(env, reg, ptr_argno, off, size, false, BPF_READ, NULL); if (err) return err; - min_off = reg->smin_value + off; - max_off = reg->smax_value + off; + min_off = reg_smin(reg) + off; + max_off = reg_smax(reg) + off; mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno); check_fastcall_stack_contract(env, ptr_state, env->insn_idx, min_off); return 0; @@ -4306,10 +3949,9 @@ static int check_stack_read_var_off(struct bpf_verifier_env *env, * can be -1, meaning that the read value is not going to a register. 
*/ static int check_stack_read(struct bpf_verifier_env *env, - int ptr_regno, int off, int size, + struct bpf_reg_state *reg, argno_t ptr_argno, int off, int size, int dst_regno) { - struct bpf_reg_state *reg = reg_state(env, ptr_regno); struct bpf_func_state *state = bpf_func(env, reg); int err; /* Some accesses are only permitted with a static offset. */ @@ -4345,7 +3987,7 @@ static int check_stack_read(struct bpf_verifier_env *env, * than fixed offset ones. Note that dst_regno >= 0 on this * branch. */ - err = check_stack_read_var_off(env, ptr_regno, off, size, + err = check_stack_read_var_off(env, reg, ptr_argno, off, size, dst_regno); } return err; @@ -4355,17 +3997,16 @@ static int check_stack_read(struct bpf_verifier_env *env, /* check_stack_write dispatches to check_stack_write_fixed_off or * check_stack_write_var_off. * - * 'ptr_regno' is the register used as a pointer into the stack. + * 'reg' is the register used as a pointer into the stack. * 'value_regno' is the register whose value we're writing to the stack. It can * be -1, meaning that we're not writing from a register. * * The caller must ensure that the offset falls within the maximum stack size. */ static int check_stack_write(struct bpf_verifier_env *env, - int ptr_regno, int off, int size, + struct bpf_reg_state *reg, int off, int size, int value_regno, int insn_idx) { - struct bpf_reg_state *reg = reg_state(env, ptr_regno); struct bpf_func_state *state = bpf_func(env, reg); int err; @@ -4378,28 +4019,135 @@ static int check_stack_write(struct bpf_verifier_env *env, * than fixed offset ones. */ err = check_stack_write_var_off(env, state, - ptr_regno, off, size, + reg, off, size, value_regno, insn_idx); } return err; } -static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, +/* + * Write a value to the outgoing stack arg area. + * off is a negative offset from r11 (e.g. -8 for arg6, -16 for arg7). 
+ */ +static int check_stack_arg_write(struct bpf_verifier_env *env, struct bpf_func_state *state, + int off, struct bpf_reg_state *value_reg) +{ + int max_stack_arg_regs = MAX_BPF_FUNC_ARGS - MAX_BPF_FUNC_REG_ARGS; + struct bpf_subprog_info *subprog = &env->subprog_info[state->subprogno]; + int spi = -off / BPF_REG_SIZE - 1; + struct bpf_reg_state *arg; + int err; + + if (spi >= max_stack_arg_regs) { + verbose(env, "stack arg write offset %d exceeds max %d stack args\n", + off, max_stack_arg_regs); + return -EINVAL; + } + + err = grow_stack_arg_slots(env, state, spi + 1); + if (err) + return err; + + /* Track the max outgoing stack arg slot count. */ + if (spi + 1 > subprog->max_out_stack_arg_cnt) + subprog->max_out_stack_arg_cnt = spi + 1; + + if (value_reg) { + state->stack_arg_regs[spi] = *value_reg; + } else { + /* BPF_ST: store immediate, treat as scalar */ + arg = &state->stack_arg_regs[spi]; + arg->type = SCALAR_VALUE; + __mark_reg_known(arg, env->prog->insnsi[env->insn_idx].imm); + } + state->no_stack_arg_load = true; + return bpf_push_jmp_history(env, env->cur_state, + INSN_F_STACK_ARG_ACCESS, spi, 0, 0); +} + +/* + * Read a value from the incoming stack arg area. + * off is a positive offset from r11 (e.g. +8 for arg6, +16 for arg7). 
+ */ +static int check_stack_arg_read(struct bpf_verifier_env *env, struct bpf_func_state *state, + int off, int dst_regno) +{ + struct bpf_subprog_info *subprog = &env->subprog_info[state->subprogno]; + struct bpf_verifier_state *vstate = env->cur_state; + int spi = off / BPF_REG_SIZE - 1; + struct bpf_func_state *caller, *cur; + struct bpf_reg_state *arg; + + if (state->no_stack_arg_load) { + verbose(env, "r11 load must be before any r11 store or call insn\n"); + return -EINVAL; + } + + if (spi + 1 > bpf_in_stack_arg_cnt(subprog)) { + verbose(env, "invalid read from stack arg off %d depth %d\n", + off, bpf_in_stack_arg_cnt(subprog) * BPF_REG_SIZE); + return -EACCES; + } + + caller = vstate->frame[vstate->curframe - 1]; + arg = &caller->stack_arg_regs[spi]; + cur = vstate->frame[vstate->curframe]; + cur->regs[dst_regno] = *arg; + return bpf_push_jmp_history(env, env->cur_state, + INSN_F_STACK_ARG_ACCESS, spi, 0, 0); +} + +static int mark_stack_arg_precision(struct bpf_verifier_env *env, int arg_idx) +{ + struct bpf_func_state *caller = cur_func(env); + int spi = arg_idx - MAX_BPF_FUNC_REG_ARGS; + + bt_set_frame_stack_arg_slot(&env->bt, caller->frameno, spi); + return mark_chain_precision_batch(env, env->cur_state); +} + +static int check_outgoing_stack_args(struct bpf_verifier_env *env, struct bpf_func_state *caller, + int nargs) +{ + int i, spi; + + for (i = MAX_BPF_FUNC_REG_ARGS; i < nargs; i++) { + spi = i - MAX_BPF_FUNC_REG_ARGS; + if (spi >= caller->out_stack_arg_cnt || + caller->stack_arg_regs[spi].type == NOT_INIT) { + verbose(env, "callee expects %d args, stack arg%d is not initialized\n", + nargs, spi + 1); + return -EFAULT; + } + } + + return 0; +} + +static struct bpf_reg_state *get_func_arg_reg(struct bpf_func_state *caller, + struct bpf_reg_state *regs, int arg) +{ + if (arg < MAX_BPF_FUNC_REG_ARGS) + return &regs[arg + 1]; + + return &caller->stack_arg_regs[arg - MAX_BPF_FUNC_REG_ARGS]; +} + +static int check_map_access_type(struct bpf_verifier_env
*env, struct bpf_reg_state *reg, int off, int size, enum bpf_access_type type) { - struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_map *map = reg->map_ptr; u32 cap = bpf_map_flags_to_cap(map); if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) { verbose(env, "write into map forbidden, value_size=%d off=%lld size=%d\n", - map->value_size, reg->smin_value + off, size); + map->value_size, reg_smin(reg) + off, size); return -EACCES; } if (type == BPF_READ && !(cap & BPF_MAP_CAN_READ)) { verbose(env, "read from map forbidden, value_size=%d off=%lld size=%d\n", - map->value_size, reg->smin_value + off, size); + map->value_size, reg_smin(reg) + off, size); return -EACCES; } @@ -4407,17 +4155,15 @@ static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, } /* check read/write into memory region (e.g., map value, ringbuf sample, etc) */ -static int __check_mem_access(struct bpf_verifier_env *env, int regno, +static int __check_mem_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int off, int size, u32 mem_size, bool zero_size_allowed) { bool size_ok = size > 0 || (size == 0 && zero_size_allowed); - struct bpf_reg_state *reg; if (off >= 0 && size_ok && (u64)off + size <= mem_size) return 0; - reg = &cur_regs(env)[regno]; switch (reg->type) { case PTR_TO_MAP_KEY: verbose(env, "invalid access to map key, key_size=%d off=%d size=%d\n", @@ -4430,8 +4176,8 @@ static int __check_mem_access(struct bpf_verifier_env *env, int regno, case PTR_TO_PACKET: case PTR_TO_PACKET_META: case PTR_TO_PACKET_END: - verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", - off, size, regno, reg->id, off, mem_size); + verbose(env, "invalid access to packet, off=%d size=%d, %s(id=%d,off=%d,r=%d)\n", + off, size, reg_arg_name(env, argno), reg->id, off, mem_size); break; case PTR_TO_CTX: verbose(env, "invalid access to context, ctx_size=%d off=%d size=%d\n", @@ -4447,13 +4193,10 @@ static int 
__check_mem_access(struct bpf_verifier_env *env, int regno, } /* check read/write into a memory region with possible variable offset */ -static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, +static int check_mem_region_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int off, int size, u32 mem_size, bool zero_size_allowed) { - struct bpf_verifier_state *vstate = env->cur_state; - struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *reg = &state->regs[regno]; int err; /* We may have adjusted the register pointing to memory region, so we @@ -4466,36 +4209,36 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, * index'es we need to make sure that whatever we use * will have a set floor within our range. */ - if (reg->smin_value < 0 && - (reg->smin_value == S64_MIN || - (off + reg->smin_value != (s64)(s32)(off + reg->smin_value)) || - reg->smin_value + off < 0)) { - verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", - regno); + if (reg_smin(reg) < 0 && + (reg_smin(reg) == S64_MIN || + (off + reg_smin(reg) != (s64)(s32)(off + reg_smin(reg))) || + reg_smin(reg) + off < 0)) { + verbose(env, "%s min value is negative, either use unsigned index or do a if (index >=0) check.\n", + reg_arg_name(env, argno)); return -EACCES; } - err = __check_mem_access(env, regno, reg->smin_value + off, size, + err = __check_mem_access(env, reg, argno, reg_smin(reg) + off, size, mem_size, zero_size_allowed); if (err) { - verbose(env, "R%d min value is outside of the allowed memory range\n", - regno); + verbose(env, "%s min value is outside of the allowed memory range\n", + reg_arg_name(env, argno)); return err; } /* If we haven't set a max value then we need to bail since we can't be * sure we won't do bad things. - * If reg->umax_value + off could overflow, treat that as unbounded too. 
+ * If reg_umax(reg) + off could overflow, treat that as unbounded too. */ - if (reg->umax_value >= BPF_MAX_VAR_OFF) { - verbose(env, "R%d unbounded memory access, make sure to bounds check any such access\n", - regno); + if (reg_umax(reg) >= BPF_MAX_VAR_OFF) { + verbose(env, "%s unbounded memory access, make sure to bounds check any such access\n", + reg_arg_name(env, argno)); return -EACCES; } - err = __check_mem_access(env, regno, reg->umax_value + off, size, + err = __check_mem_access(env, reg, argno, reg_umax(reg) + off, size, mem_size, zero_size_allowed); if (err) { - verbose(env, "R%d max value is outside of the allowed memory range\n", - regno); + verbose(env, "%s max value is outside of the allowed memory range\n", + reg_arg_name(env, argno)); return err; } @@ -4503,7 +4246,7 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, } static int __check_ptr_off_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno, + const struct bpf_reg_state *reg, argno_t argno, bool fixed_off_ok) { /* Access to this pointer-typed register or passing it to a helper @@ -4519,15 +4262,15 @@ static int __check_ptr_off_reg(struct bpf_verifier_env *env, return -EACCES; } - if (reg->smin_value < 0) { - verbose(env, "negative offset %s ptr R%d off=%lld disallowed\n", - reg_type_str(env, reg->type), regno, reg->var_off.value); + if (reg_smin(reg) < 0) { + verbose(env, "negative offset %s ptr %s off=%lld disallowed\n", + reg_type_str(env, reg->type), reg_arg_name(env, argno), reg->var_off.value); return -EACCES; } if (!fixed_off_ok && reg->var_off.value != 0) { - verbose(env, "dereference of modified %s ptr R%d off=%lld disallowed\n", - reg_type_str(env, reg->type), regno, reg->var_off.value); + verbose(env, "dereference of modified %s ptr %s off=%lld disallowed\n", + reg_type_str(env, reg->type), reg_arg_name(env, argno), reg->var_off.value); return -EACCES; } @@ -4537,7 +4280,7 @@ static int __check_ptr_off_reg(struct 
bpf_verifier_env *env, static int check_ptr_off_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno) { - return __check_ptr_off_reg(env, reg, regno, false); + return __check_ptr_off_reg(env, reg, argno_from_reg(regno), false); } static int map_kptr_match_type(struct bpf_verifier_env *env, @@ -4573,9 +4316,9 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, * referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the * normal store of unreferenced kptr, we must ensure var_off is zero. * Since ref_ptr cannot be accessed directly by BPF insns, check for - * reg->ref_obj_id is not needed here. + * reg->id is not needed here. */ - if (__check_ptr_off_reg(env, reg, regno, true)) + if (__check_ptr_off_reg(env, reg, argno_from_reg(regno), true)) return -EACCES; /* A full type match is needed, as BTF can be vmlinux, module or prog BTF, and @@ -4718,7 +4461,7 @@ static int mark_uptr_ld_reg(struct bpf_verifier_env *env, u32 regno, return 0; } -static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, +static int check_map_kptr_access(struct bpf_verifier_env *env, int value_regno, int insn_idx, struct btf_field *kptr_field) { @@ -4795,19 +4538,16 @@ static u32 map_mem_size(const struct bpf_map *map) } /* check read/write into a map element with possible variable offset */ -static int check_map_access(struct bpf_verifier_env *env, u32 regno, +static int check_map_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int off, int size, bool zero_size_allowed, enum bpf_access_src src) { - struct bpf_verifier_state *vstate = env->cur_state; - struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *reg = &state->regs[regno]; struct bpf_map *map = reg->map_ptr; u32 mem_size = map_mem_size(map); struct btf_record *rec; int err, i; - err = check_mem_region_access(env, regno, off, size, mem_size, zero_size_allowed); + err = check_mem_region_access(env, reg, argno, off, 
size, mem_size, zero_size_allowed); if (err) return err; @@ -4822,8 +4562,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * this program. To check that [x1, x2) overlaps with [y1, y2), * it is sufficient to check x1 < y2 && y1 < x2. */ - if (reg->smin_value + off < p + field->size && - p < reg->umax_value + off + size) { + if (reg_smin(reg) + off < p + field->size && + p < reg_umax(reg) + off + size) { switch (field->type) { case BPF_KPTR_UNREF: case BPF_KPTR_REF: @@ -4903,30 +4643,29 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, } } -static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, +static int check_packet_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int off, int size, bool zero_size_allowed) { - struct bpf_reg_state *reg = reg_state(env, regno); int err; if (reg->range < 0) { - verbose(env, "R%d offset is outside of the packet\n", regno); + verbose(env, "%s offset is outside of the packet\n", reg_arg_name(env, argno)); return -EINVAL; } - err = check_mem_region_access(env, regno, off, size, reg->range, zero_size_allowed); + err = check_mem_region_access(env, reg, argno, off, size, reg->range, zero_size_allowed); if (err) return err; /* __check_mem_access has made sure "off + size - 1" is within u16. - * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff, + * reg_umax(reg) can't be bigger than MAX_PACKET_OFF which is 0xffff, * otherwise find_good_pkt_pointers would have refused to set range info * that __check_mem_access would have rejected this pkt access. - * Therefore, "off + reg->umax_value + size - 1" won't overflow u32. + * Therefore, "off + reg_umax(reg) + size - 1" won't overflow u32. 
*/ env->prog->aux->max_pkt_offset = max_t(u32, env->prog->aux->max_pkt_offset, - off + reg->umax_value + size - 1); + off + reg_umax(reg) + size - 1); return 0; } @@ -4950,8 +4689,8 @@ static int __check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int of * type of narrower access. */ if (base_type(info->reg_type) == PTR_TO_BTF_ID) { - if (info->ref_obj_id && - !find_reference_state(env->cur_state, info->ref_obj_id)) { + if (info->ref_id && + !find_reference_state(env->cur_state, info->ref_id)) { verbose(env, "invalid bpf_context access off=%d. Reference may already be released\n", off); return -EACCES; @@ -4969,7 +4708,7 @@ static int __check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int of return -EACCES; } -static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, +static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, struct bpf_reg_state *reg, argno_t argno, int off, int access_size, enum bpf_access_type t, struct bpf_insn_access_aux *info) { @@ -4979,17 +4718,15 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, u32 regn */ bool var_off_ok = is_var_ctx_off_allowed(env->prog); bool fixed_off_ok = !env->ops->convert_ctx_access; - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = regs + regno; int err; if (var_off_ok) - err = check_mem_region_access(env, regno, off, access_size, U16_MAX, false); + err = check_mem_region_access(env, reg, argno, off, access_size, U16_MAX, false); else - err = __check_ptr_off_reg(env, reg, regno, fixed_off_ok); + err = __check_ptr_off_reg(env, reg, argno, fixed_off_ok); if (err) return err; - off += reg->umax_value; + off += reg_umax(reg); err = __check_ctx_access(env, insn_idx, off, access_size, t, info); if (err) @@ -4997,9 +4734,21 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return err; } -static int check_flow_keys_access(struct bpf_verifier_env *env, int off, - int size) 
+static int check_flow_keys_access(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, argno_t argno, + int off, int size) { + /* Only a constant offset is allowed here; fold it into off. */ + if (!tnum_is_const(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "%s invalid variable offset to flow keys: off=%d, var_off=%s\n", + reg_arg_name(env, argno), off, tn_buf); + return -EACCES; + } + off += reg->var_off.value; + if (size < 0 || off < 0 || (u64)off + size > sizeof(struct bpf_flow_keys)) { verbose(env, "invalid access to flow keys off=%d size=%d\n", @@ -5010,16 +4759,15 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off, } static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, - u32 regno, int off, int size, + struct bpf_reg_state *reg, argno_t argno, int off, int size, enum bpf_access_type t) { - struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_insn_access_aux info = {}; bool valid; - if (reg->smin_value < 0) { - verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", - regno); + if (reg_smin(reg) < 0) { + verbose(env, "%s min value is negative, either use unsigned index or do a if (index >=0) check.\n", + reg_arg_name(env, argno)); return -EACCES; } @@ -5047,8 +4795,8 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, return 0; } - verbose(env, "R%d invalid %s access off=%d size=%d\n", - regno, reg_type_str(env, reg->type), off, size); + verbose(env, "%s invalid %s access off=%d size=%d\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type), off, size); return -EACCES; } @@ -5123,10 +4871,10 @@ static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { [CONST_PTR_TO_MAP] = btf_bpf_map_id, }; -static bool is_trusted_reg(const struct bpf_reg_state *reg) +static bool is_trusted_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg) { /* A referenced register is always 
trusted. */ - if (reg->ref_obj_id) + if (reg_is_referenced(env, reg)) return true; /* Types listed in the reg2btf_ids are always trusted */ @@ -5368,7 +5116,10 @@ process_func: } subprog_depth = round_up_stack_depth(env, subprog[idx].stack_depth); - if (priv_stack_supported) { + if (IS_ENABLED(CONFIG_X86_64) && subprog[idx].stack_arg_cnt) { + /* x86-64 uses R9 for both private stack frame pointer and arg6. */ + subprog[idx].priv_stack_mode = NO_PRIV_STACK; + } else if (priv_stack_supported) { /* Request private stack support only if the subprog stack * depth is no less than BPF_PRIV_STACK_MIN_SIZE. This is to * avoid jit penalty if the stack usage is small. @@ -5379,6 +5130,8 @@ process_func: } if (subprog[idx].priv_stack_mode == PRIV_STACK_ADAPTIVE) { + if (subprog_depth > env->max_stack_depth) + env->max_stack_depth = subprog_depth; if (subprog_depth > MAX_BPF_STACK) { verbose(env, "stack size of subprog %d is %d. Too large\n", idx, subprog_depth); @@ -5386,6 +5139,8 @@ process_func: } } else { depth += subprog_depth; + if (depth > env->max_stack_depth) + env->max_stack_depth = depth; if (depth > MAX_BPF_STACK) { total = 0; for (tmp = idx; tmp >= 0; tmp = dinfo[tmp].caller) @@ -5472,14 +5227,23 @@ continue_func: * this info will be utilized by JIT so that we will be preserving the * tail call counter throughout bpf2bpf calls combined with tailcalls */ - if (tail_call_reachable) + if (tail_call_reachable) { for (tmp = idx; tmp >= 0; tmp = dinfo[tmp].caller) { if (subprog[tmp].is_exception_cb) { verbose(env, "cannot tail call within exception cb\n"); return -EINVAL; } + if (subprog[tmp].stack_arg_cnt) { + verbose(env, "tail_calls are not allowed in programs with stack args\n"); + return -EINVAL; + } subprog[tmp].tail_call_reachable = true; } + } else if (!idx && subprog[0].has_tail_call && subprog[0].stack_arg_cnt) { + verbose(env, "tail_calls are not allowed in programs with stack args\n"); + return -EINVAL; + } + if (subprog[0].tail_call_reachable) 
env->prog->aux->tail_call_reachable = true; @@ -5498,6 +5262,9 @@ continue_func: frame = dinfo[idx].frame; i = dinfo[idx].ret_insn; + /* reset tail_call_reachable to the parent's actual state */ + tail_call_reachable = subprog[idx].tail_call_reachable; + goto continue_func; } @@ -5558,12 +5325,12 @@ static int check_max_stack_depth(struct bpf_verifier_env *env) static int __check_buffer_access(struct bpf_verifier_env *env, const char *buf_info, const struct bpf_reg_state *reg, - int regno, int off, int size) + argno_t argno, int off, int size) { if (off < 0) { verbose(env, - "R%d invalid %s buffer access: off=%d, size=%d\n", - regno, buf_info, off, size); + "%s invalid %s buffer access: off=%d, size=%d\n", + reg_arg_name(env, argno), buf_info, off, size); return -EACCES; } if (!tnum_is_const(reg->var_off)) { @@ -5571,8 +5338,8 @@ static int __check_buffer_access(struct bpf_verifier_env *env, tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, - "R%d invalid variable buffer offset: off=%d, var_off=%s\n", - regno, off, tn_buf); + "%s invalid variable buffer offset: off=%d, var_off=%s\n", + reg_arg_name(env, argno), off, tn_buf); return -EACCES; } @@ -5581,11 +5348,11 @@ static int __check_buffer_access(struct bpf_verifier_env *env, static int check_tp_buffer_access(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, - int regno, int off, int size) + argno_t argno, int off, int size) { int err; - err = __check_buffer_access(env, "tracepoint", reg, regno, off, size); + err = __check_buffer_access(env, "tracepoint", reg, argno, off, size); if (err) return err; @@ -5597,14 +5364,14 @@ static int check_tp_buffer_access(struct bpf_verifier_env *env, static int check_buffer_access(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, - int regno, int off, int size, + argno_t argno, int off, int size, bool zero_size_allowed, u32 *max_access) { const char *buf_info = type_is_rdonly_mem(reg->type) ? 
"rdonly" : "rdwr"; int err; - err = __check_buffer_access(env, buf_info, reg, regno, off, size); + err = __check_buffer_access(env, buf_info, reg, argno, off, size); if (err) return err; @@ -5617,7 +5384,7 @@ static int check_buffer_access(struct bpf_verifier_env *env, static void zext_32_to_64(struct bpf_reg_state *reg) { reg->var_off = tnum_subreg(reg->var_off); - __reg_assign_32_into_64(reg); + reg_set_urange64(reg, reg_u32_min(reg), reg_u32_max(reg)); } /* truncate register to smaller size (in bytes) @@ -5632,15 +5399,10 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) /* fix arithmetic bounds */ mask = ((u64)1 << (size * 8)) - 1; - if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) { - reg->umin_value &= mask; - reg->umax_value &= mask; - } else { - reg->umin_value = 0; - reg->umax_value = mask; - } - reg->smin_value = reg->umin_value; - reg->smax_value = reg->umax_value; + if ((reg_umin(reg) & ~mask) == (reg_umax(reg) & ~mask)) + reg_set_urange64(reg, reg_umin(reg) & mask, reg_umax(reg) & mask); + else + reg_set_urange64(reg, 0, mask); /* If size is smaller than 32bit register the 32bit register * values are also truncated so we push 64-bit bounds into @@ -5655,19 +5417,16 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) static void set_sext64_default_val(struct bpf_reg_state *reg, int size) { if (size == 1) { - reg->smin_value = reg->s32_min_value = S8_MIN; - reg->smax_value = reg->s32_max_value = S8_MAX; + reg_set_srange64(reg, S8_MIN, S8_MAX); + reg_set_srange32(reg, S8_MIN, S8_MAX); } else if (size == 2) { - reg->smin_value = reg->s32_min_value = S16_MIN; - reg->smax_value = reg->s32_max_value = S16_MAX; + reg_set_srange64(reg, S16_MIN, S16_MAX); + reg_set_srange32(reg, S16_MIN, S16_MAX); } else { /* size == 4 */ - reg->smin_value = reg->s32_min_value = S32_MIN; - reg->smax_value = reg->s32_max_value = S32_MAX; + reg_set_srange64(reg, S32_MIN, S32_MAX); + reg_set_srange32(reg, S32_MIN, S32_MAX); } - 
reg->umin_value = reg->u32_min_value = 0; - reg->umax_value = U64_MAX; - reg->u32_max_value = U32_MAX; reg->var_off = tnum_unknown; } @@ -5688,29 +5447,27 @@ static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size) reg->var_off = tnum_const((s32)u64_cval); u64_cval = reg->var_off.value; - reg->smax_value = reg->smin_value = u64_cval; - reg->umax_value = reg->umin_value = u64_cval; - reg->s32_max_value = reg->s32_min_value = u64_cval; - reg->u32_max_value = reg->u32_min_value = u64_cval; + reg->r64 = cnum64_from_urange(u64_cval, u64_cval); + reg->r32 = cnum32_from_urange((u32)u64_cval, (u32)u64_cval); return; } - top_smax_value = ((u64)reg->smax_value >> num_bits) << num_bits; - top_smin_value = ((u64)reg->smin_value >> num_bits) << num_bits; + top_smax_value = ((u64)reg_smax(reg) >> num_bits) << num_bits; + top_smin_value = ((u64)reg_smin(reg) >> num_bits) << num_bits; if (top_smax_value != top_smin_value) goto out; /* find the s64_min and s64_min after sign extension */ if (size == 1) { - init_s64_max = (s8)reg->smax_value; - init_s64_min = (s8)reg->smin_value; + init_s64_max = (s8)reg_smax(reg); + init_s64_min = (s8)reg_smin(reg); } else if (size == 2) { - init_s64_max = (s16)reg->smax_value; - init_s64_min = (s16)reg->smin_value; + init_s64_max = (s16)reg_smax(reg); + init_s64_min = (s16)reg_smin(reg); } else { - init_s64_max = (s32)reg->smax_value; - init_s64_min = (s32)reg->smin_value; + init_s64_max = (s32)reg_smax(reg); + init_s64_min = (s32)reg_smin(reg); } s64_max = max(init_s64_max, init_s64_min); @@ -5718,10 +5475,8 @@ static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size) /* both of s64_max/s64_min positive or negative */ if ((s64_max >= 0) == (s64_min >= 0)) { - reg->s32_min_value = reg->smin_value = s64_min; - reg->s32_max_value = reg->smax_value = s64_max; - reg->u32_min_value = reg->umin_value = s64_min; - reg->u32_max_value = reg->umax_value = s64_max; + reg_set_srange64(reg, s64_min, s64_max); + reg_set_srange32(reg, 
s64_min, s64_max); reg->var_off = tnum_range(s64_min, s64_max); return; } @@ -5732,16 +5487,11 @@ out: static void set_sext32_default_val(struct bpf_reg_state *reg, int size) { - if (size == 1) { - reg->s32_min_value = S8_MIN; - reg->s32_max_value = S8_MAX; - } else { + if (size == 1) + reg_set_srange32(reg, S8_MIN, S8_MAX); + else /* size == 2 */ - reg->s32_min_value = S16_MIN; - reg->s32_max_value = S16_MAX; - } - reg->u32_min_value = 0; - reg->u32_max_value = U32_MAX; + reg_set_srange32(reg, S16_MIN, S16_MAX); reg->var_off = tnum_subreg(tnum_unknown); } @@ -5759,34 +5509,30 @@ static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size) reg->var_off = tnum_const((s16)u32_val); u32_val = reg->var_off.value; - reg->s32_min_value = reg->s32_max_value = u32_val; - reg->u32_min_value = reg->u32_max_value = u32_val; + reg_set_srange32(reg, u32_val, u32_val); return; } - top_smax_value = ((u32)reg->s32_max_value >> num_bits) << num_bits; - top_smin_value = ((u32)reg->s32_min_value >> num_bits) << num_bits; + top_smax_value = ((u32)reg_s32_max(reg) >> num_bits) << num_bits; + top_smin_value = ((u32)reg_s32_min(reg) >> num_bits) << num_bits; if (top_smax_value != top_smin_value) goto out; /* find the s32_min and s32_min after sign extension */ if (size == 1) { - init_s32_max = (s8)reg->s32_max_value; - init_s32_min = (s8)reg->s32_min_value; + init_s32_max = (s8)reg_s32_max(reg); + init_s32_min = (s8)reg_s32_min(reg); } else { /* size == 2 */ - init_s32_max = (s16)reg->s32_max_value; - init_s32_min = (s16)reg->s32_min_value; + init_s32_max = (s16)reg_s32_max(reg); + init_s32_min = (s16)reg_s32_min(reg); } s32_max = max(init_s32_max, init_s32_min); s32_min = min(init_s32_max, init_s32_min); if ((s32_min >= 0) == (s32_max >= 0)) { - reg->s32_min_value = s32_min; - reg->s32_max_value = s32_max; - reg->u32_min_value = (u32)s32_min; - reg->u32_max_value = (u32)s32_max; + reg_set_srange32(reg, s32_min, s32_max); reg->var_off = tnum_subreg(tnum_range(s32_min, 
s32_max)); return; } @@ -5976,12 +5722,11 @@ static bool type_is_trusted_or_null(struct bpf_verifier_env *env, } static int check_ptr_to_btf_access(struct bpf_verifier_env *env, - struct bpf_reg_state *regs, - int regno, int off, int size, + struct bpf_reg_state *regs, struct bpf_reg_state *reg, + argno_t argno, int off, int size, enum bpf_access_type atype, int value_regno) { - struct bpf_reg_state *reg = regs + regno; const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id); const char *tname = btf_name_by_offset(reg->btf, t->name_off); const char *field_name = NULL; @@ -6007,8 +5752,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, - "R%d is ptr_%s invalid variable offset: off=%d, var_off=%s\n", - regno, tname, off, tn_buf); + "%s is ptr_%s invalid variable offset: off=%d, var_off=%s\n", + reg_arg_name(env, argno), tname, off, tn_buf); return -EACCES; } @@ -6016,22 +5761,22 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (off < 0) { verbose(env, - "R%d is ptr_%s invalid negative access: off=%d\n", - regno, tname, off); + "%s is ptr_%s invalid negative access: off=%d\n", + reg_arg_name(env, argno), tname, off); return -EACCES; } if (reg->type & MEM_USER) { verbose(env, - "R%d is ptr_%s access user memory: off=%d\n", - regno, tname, off); + "%s is ptr_%s access user memory: off=%d\n", + reg_arg_name(env, argno), tname, off); return -EACCES; } if (reg->type & MEM_PERCPU) { verbose(env, - "R%d is ptr_%s access percpu memory: off=%d\n", - regno, tname, off); + "%s is ptr_%s access percpu memory: off=%d\n", + reg_arg_name(env, argno), tname, off); return -EACCES; } @@ -6043,7 +5788,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, ret = env->ops->btf_struct_access(&env->log, reg, off, size); } else { /* Writes are permitted with default btf_struct_access for - * program allocated objects (which always have ref_obj_id > 0), + * program 
allocated objects (which always have id > 0), * but not for untrusted PTR_TO_BTF_ID | MEM_ALLOC. */ if (atype != BPF_READ && !type_is_ptr_alloc_obj(reg->type)) { @@ -6052,8 +5797,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, } if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) && - !(reg->type & MEM_RCU) && !reg->ref_obj_id) { - verifier_bug(env, "ref_obj_id for allocated object must be non-zero"); + !(reg->type & MEM_RCU) && !reg_is_referenced(env, reg)) { + verifier_bug(env, "allocated object must have a referenced id"); return -EFAULT; } @@ -6072,7 +5817,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, */ flag = PTR_UNTRUSTED; - } else if (is_trusted_reg(reg) || is_rcu_reg(reg)) { + } else if (is_trusted_reg(env, reg) || is_rcu_reg(reg)) { /* By default any pointer obtained from walking a trusted pointer is no * longer trusted, unless the field being accessed has explicitly been * marked as inheriting its parent's state of trust (either full or RCU). @@ -6133,12 +5878,11 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, } static int check_ptr_to_map_access(struct bpf_verifier_env *env, - struct bpf_reg_state *regs, - int regno, int off, int size, + struct bpf_reg_state *regs, struct bpf_reg_state *reg, + argno_t argno, int off, int size, enum bpf_access_type atype, int value_regno) { - struct bpf_reg_state *reg = regs + regno; struct bpf_map *map = reg->map_ptr; struct bpf_reg_state map_reg; enum bpf_type_flag flag = 0; @@ -6169,8 +5913,8 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, } if (off < 0) { - verbose(env, "R%d is %s invalid negative access: off=%d\n", - regno, tname, off); + verbose(env, "%s is %s invalid negative access: off=%d\n", + reg_arg_name(env, argno), tname, off); return -EACCES; } @@ -6227,11 +5971,10 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env, * 'off' includes `regno->offset`, but not its dynamic part (if any). 
*/ static int check_stack_access_within_bounds( - struct bpf_verifier_env *env, - int regno, int off, int access_size, + struct bpf_verifier_env *env, struct bpf_reg_state *reg, + argno_t argno, int off, int access_size, enum bpf_access_type type) { - struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = bpf_func(env, reg); s64 min_off, max_off; int err; @@ -6246,14 +5989,14 @@ static int check_stack_access_within_bounds( min_off = (s64)reg->var_off.value + off; max_off = min_off + access_size; } else { - if (reg->smax_value >= BPF_MAX_VAR_OFF || - reg->smin_value <= -BPF_MAX_VAR_OFF) { - verbose(env, "invalid unbounded variable-offset%s stack R%d\n", - err_extra, regno); + if (reg_smax(reg) >= BPF_MAX_VAR_OFF || + reg_smin(reg) <= -BPF_MAX_VAR_OFF) { + verbose(env, "invalid unbounded variable-offset%s stack %s\n", + err_extra, reg_arg_name(env, argno)); return -EACCES; } - min_off = reg->smin_value + off; - max_off = reg->smax_value + off + access_size; + min_off = reg_smin(reg) + off; + max_off = reg_smax(reg) + off + access_size; } err = check_stack_slot_within_bounds(env, min_off, state, type); @@ -6267,14 +6010,14 @@ static int check_stack_access_within_bounds( if (err) { if (tnum_is_const(reg->var_off)) { - verbose(env, "invalid%s stack R%d off=%lld size=%d\n", - err_extra, regno, min_off, access_size); + verbose(env, "invalid%s stack %s off=%lld size=%d\n", + err_extra, reg_arg_name(env, argno), min_off, access_size); } else { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid variable-offset%s stack R%d var_off=%s off=%d size=%d\n", - err_extra, regno, tn_buf, off, access_size); + verbose(env, "invalid variable-offset%s stack %s var_off=%s off=%d size=%d\n", + err_extra, reg_arg_name(env, argno), tn_buf, off, access_size); } return err; } @@ -6319,12 +6062,11 @@ static void add_scalar_to_reg(struct bpf_reg_state *dst_reg, s64 val) * if t==write && value_regno==-1, some unknown value is 
stored into memory * if t==read && value_regno==-1, don't care what we read from memory */ -static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, +static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct bpf_reg_state *reg, argno_t argno, int off, int bpf_size, enum bpf_access_type t, int value_regno, bool strict_alignment_once, bool is_ldsx) { struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = regs + regno; int size, err = 0; size = bpf_size_to_bytes(bpf_size); @@ -6337,11 +6079,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (reg->type == PTR_TO_MAP_KEY) { if (t == BPF_WRITE) { - verbose(env, "write to change key R%d not allowed\n", regno); + verbose(env, "write to change key %s not allowed\n", + reg_arg_name(env, argno)); return -EACCES; } - err = check_mem_region_access(env, regno, off, size, + err = check_mem_region_access(env, reg, argno, off, size, reg->map_ptr->key_size, false); if (err) return err; @@ -6355,17 +6098,17 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn verbose(env, "R%d leaks addr into map\n", value_regno); return -EACCES; } - err = check_map_access_type(env, regno, off, size, t); + err = check_map_access_type(env, reg, off, size, t); if (err) return err; - err = check_map_access(env, regno, off, size, false, ACCESS_DIRECT); + err = check_map_access(env, reg, argno, off, size, false, ACCESS_DIRECT); if (err) return err; if (tnum_is_const(reg->var_off)) kptr_field = btf_record_find(reg->map_ptr->record, off + reg->var_off.value, BPF_KPTR | BPF_UPTR); if (kptr_field) { - err = check_map_kptr_access(env, regno, value_regno, insn_idx, kptr_field); + err = check_map_kptr_access(env, value_regno, insn_idx, kptr_field); } else if (t == BPF_READ && value_regno >= 0) { struct bpf_map *map = reg->map_ptr; @@ -6393,7 +6136,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 
regn size); return -EACCES; } - copy_register_state(®s[value_regno], reg); + regs[value_regno] = *reg; add_scalar_to_reg(®s[value_regno], off); regs[value_regno].type = PTR_TO_INSN; } else { @@ -6405,14 +6148,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn bool rdonly_untrusted = rdonly_mem && (reg->type & PTR_UNTRUSTED); if (type_may_be_null(reg->type)) { - verbose(env, "R%d invalid mem access '%s'\n", regno, + verbose(env, "%s invalid mem access '%s'\n", reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } if (t == BPF_WRITE && rdonly_mem) { - verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str(env, reg->type)); + verbose(env, "%s cannot write into %s\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } @@ -6427,7 +6170,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * instructions, hence no need to check bounds in that case. */ if (!rdonly_untrusted) - err = check_mem_region_access(env, regno, off, size, + err = check_mem_region_access(env, reg, argno, off, size, reg->mem_size, false); if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem)) mark_reg_unknown(env, regs, value_regno); @@ -6445,7 +6188,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } - err = check_ctx_access(env, insn_idx, regno, off, size, t, &info); + err = check_ctx_access(env, insn_idx, reg, argno, off, size, t, &info); if (!err && t == BPF_READ && value_regno >= 0) { /* ctx access returns either a scalar, or a * PTR_TO_PACKET[_META,_END]. In the latter @@ -6463,8 +6206,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else { mark_reg_known_zero(env, regs, value_regno); - if (type_may_be_null(info.reg_type)) - regs[value_regno].id = ++env->id_gen; /* A load of ctx field could have different * actual load size with the one encoded in the * insn. 
When the dst is PTR, it is for sure not @@ -6474,23 +6215,25 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (base_type(info.reg_type) == PTR_TO_BTF_ID) { regs[value_regno].btf = info.btf; regs[value_regno].btf_id = info.btf_id; - regs[value_regno].ref_obj_id = info.ref_obj_id; + regs[value_regno].id = info.ref_id; } + if (type_may_be_null(info.reg_type) && !regs[value_regno].id) + regs[value_regno].id = ++env->id_gen; } regs[value_regno].type = info.reg_type; } } else if (reg->type == PTR_TO_STACK) { /* Basic bounds checks. */ - err = check_stack_access_within_bounds(env, regno, off, size, t); + err = check_stack_access_within_bounds(env, reg, argno, off, size, t); if (err) return err; if (t == BPF_READ) - err = check_stack_read(env, regno, off, size, + err = check_stack_read(env, reg, argno, off, size, value_regno); else - err = check_stack_write(env, regno, off, size, + err = check_stack_write(env, reg, off, size, value_regno, insn_idx); } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { @@ -6503,7 +6246,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn value_regno); return -EACCES; } - err = check_packet_access(env, regno, off, size, false); + err = check_packet_access(env, reg, argno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_FLOW_KEYS) { @@ -6514,28 +6257,28 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } - err = check_flow_keys_access(env, off, size); + err = check_flow_keys_access(env, reg, argno, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (type_is_sk_pointer(reg->type)) { if (t == BPF_WRITE) { - verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str(env, reg->type)); + verbose(env, "%s cannot write into 
%s\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } - err = check_sock_access(env, insn_idx, regno, off, size, t); + err = check_sock_access(env, insn_idx, reg, argno, off, size, t); if (!err && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_TP_BUFFER) { - err = check_tp_buffer_access(env, reg, regno, off, size); + err = check_tp_buffer_access(env, reg, argno, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (base_type(reg->type) == PTR_TO_BTF_ID && !type_may_be_null(reg->type)) { - err = check_ptr_to_btf_access(env, regs, regno, off, size, t, + err = check_ptr_to_btf_access(env, regs, reg, argno, off, size, t, value_regno); } else if (reg->type == CONST_PTR_TO_MAP) { - err = check_ptr_to_map_access(env, regs, regno, off, size, t, + err = check_ptr_to_map_access(env, regs, reg, argno, off, size, t, value_regno); } else if (base_type(reg->type) == PTR_TO_BUF && !type_may_be_null(reg->type)) { @@ -6544,8 +6287,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (rdonly_mem) { if (t == BPF_WRITE) { - verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str(env, reg->type)); + verbose(env, "%s cannot write into %s\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } max_access = &env->prog->aux->max_rdonly_access; @@ -6553,7 +6296,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn max_access = &env->prog->aux->max_rdwr_access; } - err = check_buffer_access(env, reg, regno, off, size, false, + err = check_buffer_access(env, reg, argno, off, size, false, max_access); if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ)) @@ -6562,7 +6305,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else { - 
verbose(env, "R%d invalid mem access '%s'\n", regno, + verbose(env, "%s invalid mem access '%s'\n", reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } @@ -6585,10 +6328,20 @@ static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn, bool strict_alignment_once, bool is_ldsx, bool allow_trust_mismatch, const char *ctx) { + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = cur_regs(env); enum bpf_reg_type src_reg_type; int err; + /* Handle stack arg read */ + if (is_stack_arg_ldx(insn)) { + err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); + if (err) + return err; + return check_stack_arg_read(env, state, insn->off, insn->dst_reg); + } + /* check src operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) @@ -6604,7 +6357,7 @@ static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn, /* Check if (src_reg + off) is readable. The state of dst_reg will be * updated by this call. 
*/ - err = check_mem_access(env, env->insn_idx, insn->src_reg, insn->off, + err = check_mem_access(env, env->insn_idx, regs + insn->src_reg, argno_from_reg(insn->src_reg), insn->off, BPF_SIZE(insn->code), BPF_READ, insn->dst_reg, strict_alignment_once, is_ldsx); err = err ?: save_aux_ptr_type(env, src_reg_type, @@ -6617,10 +6370,20 @@ static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn, static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn, bool strict_alignment_once) { + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = cur_regs(env); enum bpf_reg_type dst_reg_type; int err; + /* Handle stack arg write */ + if (is_stack_arg_stx(insn)) { + err = check_reg_arg(env, insn->src_reg, SRC_OP); + if (err) + return err; + return check_stack_arg_write(env, state, insn->off, regs + insn->src_reg); + } + /* check src1 operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) @@ -6634,7 +6397,7 @@ static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn, dst_reg_type = regs[insn->dst_reg].type; /* Check if (dst_reg + off) is writeable. 
*/ - err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, regs + insn->dst_reg, argno_from_reg(insn->dst_reg), insn->off, BPF_SIZE(insn->code), BPF_WRITE, insn->src_reg, strict_alignment_once, false); err = err ?: save_aux_ptr_type(env, dst_reg_type, false); @@ -6645,6 +6408,7 @@ static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn, static int check_atomic_rmw(struct bpf_verifier_env *env, struct bpf_insn *insn) { + struct bpf_reg_state *dst_reg; int load_reg; int err; @@ -6706,13 +6470,15 @@ static int check_atomic_rmw(struct bpf_verifier_env *env, load_reg = -1; } + dst_reg = cur_regs(env) + insn->dst_reg; + /* Check whether we can read the memory, with second call for fetch * case to simulate the register fill. */ - err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, dst_reg, argno_from_reg(insn->dst_reg), insn->off, BPF_SIZE(insn->code), BPF_READ, -1, true, false); if (!err && load_reg >= 0) - err = check_mem_access(env, env->insn_idx, insn->dst_reg, + err = check_mem_access(env, env->insn_idx, dst_reg, argno_from_reg(insn->dst_reg), insn->off, BPF_SIZE(insn->code), BPF_READ, load_reg, true, false); if (err) @@ -6724,7 +6490,7 @@ static int check_atomic_rmw(struct bpf_verifier_env *env, return err; } /* Check whether we can write into the same memory. */ - err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, dst_reg, argno_from_reg(insn->dst_reg), insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, true, false); if (err) return err; @@ -6813,11 +6579,10 @@ static int check_atomic(struct bpf_verifier_env *env, struct bpf_insn *insn) * read offsets are marked as read. 
*/ static int check_stack_range_initialized( - struct bpf_verifier_env *env, int regno, int off, + struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int off, int access_size, bool zero_size_allowed, enum bpf_access_type type, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = bpf_func(env, reg); int err, min_off, max_off, i, j, slot, spi; /* Some accesses can write anything into the stack, others are @@ -6839,7 +6604,7 @@ static int check_stack_range_initialized( return -EACCES; } - err = check_stack_access_within_bounds(env, regno, off, access_size, type); + err = check_stack_access_within_bounds(env, reg, argno, off, access_size, type); if (err) return err; @@ -6856,8 +6621,8 @@ static int check_stack_range_initialized( char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n", - regno, tn_buf); + verbose(env, "%s variable offset stack access prohibited for !root, var_off=%s\n", + reg_arg_name(env, argno), tn_buf); return -EACCES; } /* Only initialized buffer on stack is allowed to be accessed @@ -6869,8 +6634,8 @@ static int check_stack_range_initialized( if (meta && meta->raw_mode) meta = NULL; - min_off = reg->smin_value + off; - max_off = reg->smax_value + off; + min_off = reg_smin(reg) + off; + max_off = reg_smax(reg) + off; } if (meta && meta->raw_mode) { @@ -6900,7 +6665,7 @@ static int check_stack_range_initialized( } } meta->access_size = access_size; - meta->regno = regno; + meta->regno = reg_from_argno(argno); return 0; } @@ -6940,17 +6705,17 @@ static int check_stack_range_initialized( if (*stype == STACK_POISON) { if (allow_poison) goto mark; - verbose(env, "reading from stack R%d off %d+%d size %d, slot poisoned by dead code elimination\n", - regno, min_off, i - min_off, access_size); + verbose(env, "reading from stack %s off %d+%d size %d, slot poisoned by dead code 
elimination\n", + reg_arg_name(env, argno), min_off, i - min_off, access_size); } else if (tnum_is_const(reg->var_off)) { - verbose(env, "invalid read from stack R%d off %d+%d size %d\n", - regno, min_off, i - min_off, access_size); + verbose(env, "invalid read from stack %s off %d+%d size %d\n", + reg_arg_name(env, argno), min_off, i - min_off, access_size); } else { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid read from stack R%d var_off %s+%d size %d\n", - regno, tn_buf, i - min_off, access_size); + verbose(env, "invalid read from stack %s var_off %s+%d size %d\n", + reg_arg_name(env, argno), tn_buf, i - min_off, access_size); } return -EACCES; mark: @@ -6959,48 +6724,48 @@ mark: return 0; } -static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, +static int check_helper_mem_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int access_size, enum bpf_access_type access_type, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; + struct bpf_reg_state *regs = cur_regs(env); u32 *max_access; switch (base_type(reg->type)) { case PTR_TO_PACKET: case PTR_TO_PACKET_META: - return check_packet_access(env, regno, 0, access_size, + return check_packet_access(env, reg, argno, 0, access_size, zero_size_allowed); case PTR_TO_MAP_KEY: if (access_type == BPF_WRITE) { - verbose(env, "R%d cannot write into %s\n", regno, - reg_type_str(env, reg->type)); + verbose(env, "%s cannot write into %s\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } - return check_mem_region_access(env, regno, 0, access_size, + return check_mem_region_access(env, reg, argno, 0, access_size, reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: - if (check_map_access_type(env, regno, 0, access_size, access_type)) + if (check_map_access_type(env, reg, 0, access_size, access_type)) return -EACCES; - return 
check_map_access(env, regno, 0, access_size, + return check_map_access(env, reg, argno, 0, access_size, zero_size_allowed, ACCESS_HELPER); case PTR_TO_MEM: if (type_is_rdonly_mem(reg->type)) { if (access_type == BPF_WRITE) { - verbose(env, "R%d cannot write into %s\n", regno, - reg_type_str(env, reg->type)); + verbose(env, "%s cannot write into %s\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } } - return check_mem_region_access(env, regno, 0, + return check_mem_region_access(env, reg, argno, 0, access_size, reg->mem_size, zero_size_allowed); case PTR_TO_BUF: if (type_is_rdonly_mem(reg->type)) { if (access_type == BPF_WRITE) { - verbose(env, "R%d cannot write into %s\n", regno, - reg_type_str(env, reg->type)); + verbose(env, "%s cannot write into %s\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } @@ -7008,26 +6773,26 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } else { max_access = &env->prog->aux->max_rdwr_access; } - return check_buffer_access(env, reg, regno, 0, + return check_buffer_access(env, reg, argno, 0, access_size, zero_size_allowed, max_access); case PTR_TO_STACK: return check_stack_range_initialized( - env, - regno, 0, access_size, + env, reg, + argno, 0, access_size, zero_size_allowed, access_type, meta); case PTR_TO_BTF_ID: - return check_ptr_to_btf_access(env, regs, regno, 0, - access_size, BPF_READ, -1); + return check_ptr_to_btf_access(env, regs, reg, argno, 0, + access_size, access_type, -1); case PTR_TO_CTX: /* Only permit reading or writing syscall context using helper calls. 
*/ if (is_var_ctx_off_allowed(env->prog)) { - int err = check_mem_region_access(env, regno, 0, access_size, U16_MAX, + int err = check_mem_region_access(env, reg, argno, 0, access_size, U16_MAX, zero_size_allowed); if (err) return err; - if (env->prog->aux->max_ctx_offset < reg->umax_value + access_size) - env->prog->aux->max_ctx_offset = reg->umax_value + access_size; + if (env->prog->aux->max_ctx_offset < reg_umax(reg) + access_size) + env->prog->aux->max_ctx_offset = reg_umax(reg) + access_size; return 0; } fallthrough; @@ -7037,7 +6802,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, bpf_register_is_null(reg)) return 0; - verbose(env, "R%d type=%s ", regno, + verbose(env, "%s type=%s ", reg_arg_name(env, argno), reg_type_str(env, reg->type)); verbose(env, "expected=%s\n", reg_type_str(env, PTR_TO_STACK)); return -EACCES; @@ -7047,12 +6812,12 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, /* verify arguments to helpers or kfuncs consisting of a pointer and an access * size. * - * @regno is the register containing the access size. regno-1 is the register - * containing the pointer. + * @mem_reg contains the pointer, @size_reg contains the access size. */ static int check_mem_size_reg(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, - enum bpf_access_type access_type, + struct bpf_reg_state *mem_reg, + struct bpf_reg_state *size_reg, argno_t mem_argno, + argno_t size_argno, enum bpf_access_type access_type, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { @@ -7066,42 +6831,48 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, * out. Only upper bounds can be learned because retval is an * int type and negative retvals are allowed. */ - meta->msize_max_value = reg->umax_value; + meta->msize_max_value = reg_umax(size_reg); /* The register is SCALAR_VALUE; the access check happens using * its boundaries. 
For unprivileged variable accesses, disable * raw mode so that the program is required to initialize all * the memory that the helper could just partially fill up. */ - if (!tnum_is_const(reg->var_off)) + if (!tnum_is_const(size_reg->var_off)) meta = NULL; - if (reg->smin_value < 0) { - verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n", - regno); + if (reg_smin(size_reg) < 0) { + verbose(env, "%s min value is negative, either use unsigned or 'var &= const'\n", + reg_arg_name(env, size_argno)); return -EACCES; } - if (reg->umin_value == 0 && !zero_size_allowed) { - verbose(env, "R%d invalid zero-sized read: u64=[%lld,%lld]\n", - regno, reg->umin_value, reg->umax_value); + if (reg_umin(size_reg) == 0 && !zero_size_allowed) { + verbose(env, "%s invalid zero-sized read: u64=[%lld,%lld]\n", + reg_arg_name(env, size_argno), reg_umin(size_reg), reg_umax(size_reg)); return -EACCES; } - if (reg->umax_value >= BPF_MAX_VAR_SIZ) { - verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", - regno); + if (reg_umax(size_reg) >= BPF_MAX_VAR_SIZ) { + verbose(env, "%s unbounded memory access, use 'var &= const' or 'if (var < const)'\n", + reg_arg_name(env, size_argno)); return -EACCES; } - err = check_helper_mem_access(env, regno - 1, reg->umax_value, + err = check_helper_mem_access(env, mem_reg, mem_argno, reg_umax(size_reg), access_type, zero_size_allowed, meta); - if (!err) - err = mark_chain_precision(env, regno); + if (!err) { + int regno = reg_from_argno(size_argno); + + if (regno >= 0) + err = mark_chain_precision(env, regno); + else + err = mark_stack_arg_precision(env, arg_idx_from_argno(size_argno)); + } return err; } static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - u32 regno, u32 mem_size) + argno_t argno, u32 mem_size) { bool may_be_null = type_may_be_null(reg->type); struct bpf_reg_state saved_reg; @@ -7110,6 +6881,12 @@ static int check_mem_reg(struct bpf_verifier_env 
*env, struct bpf_reg_state *reg if (bpf_register_is_null(reg)) return 0; + if (mem_size > S32_MAX) { + verbose(env, "%s memory size %u is too large\n", + reg_arg_name(env, argno), mem_size); + return -EACCES; + } + /* Assuming that the register contains a value check if the memory * access is safe. Temporarily save and restore the register's state as * the conversion shouldn't be visible to a caller. @@ -7121,8 +6898,8 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg int size = base_type(reg->type) == PTR_TO_STACK ? -(int)mem_size : mem_size; - err = check_helper_mem_access(env, regno, size, BPF_READ, true, NULL); - err = err ?: check_helper_mem_access(env, regno, size, BPF_WRITE, true, NULL); + err = check_helper_mem_access(env, reg, argno, size, BPF_READ, true, NULL); + err = err ?: check_helper_mem_access(env, reg, argno, size, BPF_WRITE, true, NULL); if (may_be_null) *reg = saved_reg; @@ -7130,17 +6907,14 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg return err; } -static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - u32 regno) +static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *mem_reg, + struct bpf_reg_state *size_reg, argno_t mem_argno, argno_t size_argno) { - struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1]; bool may_be_null = type_may_be_null(mem_reg->type); struct bpf_reg_state saved_reg; struct bpf_call_arg_meta meta; int err; - WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5); - memset(&meta, 0, sizeof(meta)); if (may_be_null) { @@ -7148,8 +6922,8 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg mark_ptr_not_null_reg(mem_reg); } - err = check_mem_size_reg(env, reg, regno, BPF_READ, true, &meta); - err = err ?: check_mem_size_reg(env, reg, regno, BPF_WRITE, true, &meta); + err = check_mem_size_reg(env, mem_reg, size_reg, mem_argno, size_argno, BPF_READ, 
true, &meta); + err = err ?: check_mem_size_reg(env, mem_reg, size_reg, mem_argno, size_argno, BPF_WRITE, true, &meta); if (may_be_null) *mem_reg = saved_reg; @@ -7185,11 +6959,10 @@ enum { * env->cur_state->active_locks remembers which map value element or allocated * object got locked and clears it after bpf_spin_unlock. */ -static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) +static int process_spin_lock(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int flags) { bool is_lock = flags & PROCESS_SPIN_LOCK, is_res_lock = flags & PROCESS_RES_LOCK; const char *lock_str = is_res_lock ? "bpf_res_spin" : "bpf_spin"; - struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_verifier_state *cur = env->cur_state; bool is_const = tnum_is_const(reg->var_off); bool is_irq = flags & PROCESS_LOCK_IRQ; @@ -7202,8 +6975,8 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) if (!is_const) { verbose(env, - "R%d doesn't have constant offset. %s_lock has to be at the constant offset\n", - regno, lock_str); + "%s doesn't have constant offset. 
%s_lock has to be at the constant offset\n", + reg_arg_name(env, argno), lock_str); return -EINVAL; } if (reg->type == PTR_TO_MAP_VALUE) { @@ -7302,11 +7075,10 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) } /* Check if @regno is a pointer to a specific field in a map value */ -static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, +static int check_map_field_pointer(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, enum btf_field_type field_type, struct bpf_map_desc *map_desc) { - struct bpf_reg_state *reg = reg_state(env, regno); bool is_const = tnum_is_const(reg->var_off); struct bpf_map *map = reg->map_ptr; u64 val = reg->var_off.value; @@ -7315,8 +7087,8 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, if (!is_const) { verbose(env, - "R%d doesn't have constant offset. %s has to be at the constant offset\n", - regno, struct_name); + "%s doesn't have constant offset. %s has to be at the constant offset\n", + reg_arg_name(env, argno), struct_name); return -EINVAL; } if (!map->btf) { @@ -7356,26 +7128,26 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, return 0; } -static int process_timer_func(struct bpf_verifier_env *env, int regno, +static int process_timer_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, struct bpf_map_desc *map) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) { verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n"); return -EOPNOTSUPP; } - return check_map_field_pointer(env, regno, BPF_TIMER, map); + return check_map_field_pointer(env, reg, argno, BPF_TIMER, map); } -static int process_timer_helper(struct bpf_verifier_env *env, int regno, +static int process_timer_helper(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, struct bpf_call_arg_meta *meta) { - return process_timer_func(env, regno, &meta->map); + return process_timer_func(env, reg, argno, &meta->map); 
} -static int process_timer_kfunc(struct bpf_verifier_env *env, int regno, +static int process_timer_kfunc(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta) { - return process_timer_func(env, regno, &meta->map); + return process_timer_func(env, reg, argno, &meta->map); } static int process_kptr_func(struct bpf_verifier_env *env, int regno, @@ -7426,52 +7198,42 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, return 0; } -/* There are two register types representing a bpf_dynptr, one is PTR_TO_STACK +/* + * Validate dynptr arguments for helper, kfunc and subprog. + * + * @dynptr is both input and output. It is populated when the argument is + * tagged with MEM_UNINIT (i.e., the dynptr argument that will be constructed) + * and consumed when the argument is expecting to be an initialized dynptr. + * @parent_id is used to track the referenced parent object (e.g., file or skb in + * qdisc program) when constructing a dynptr. + * + * There are two register types representing a bpf_dynptr, one is PTR_TO_STACK * which points to a stack slot, and the other is CONST_PTR_TO_DYNPTR. * * In both cases we deal with the first 8 bytes, but need to mark the next 8 * bytes as STACK_DYNPTR in case of PTR_TO_STACK. In case of * CONST_PTR_TO_DYNPTR, we are guaranteed to get the beginning of the object. * - * Mutability of bpf_dynptr is at two levels, one is at the level of struct - * bpf_dynptr itself, i.e. whether the helper is receiving a pointer to struct - * bpf_dynptr or pointer to const struct bpf_dynptr. In the former case, it can - * mutate the view of the dynptr and also possibly destroy it. In the latter - * case, it cannot mutate the bpf_dynptr itself but it can still mutate the - * memory that dynptr points to. 
- * - * The verifier will keep track both levels of mutation (bpf_dynptr's in - * reg->type and the memory's in reg->dynptr.type), but there is no support for - * readonly dynptr view yet, hence only the first case is tracked and checked. - * - * This is consistent with how C applies the const modifier to a struct object, - * where the pointer itself inside bpf_dynptr becomes const but not what it - * points to. - * - * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument - * type, and declare it as 'const struct bpf_dynptr *' in their prototype. + * Mutability of bpf_dynptr is at two levels: the dynptr and the memory the + * dynptr points to. At the first level, the verifier will make sure a + * CONST_PTR_TO_DYNPTR cannot be reinitialized or destroyed. The mutability of + * a dynptr's view (i.e., start and offset) is not tracked as there is not such + * use case. The second level is tracked using the upper bit of bpf_dynptr->size + * and checked dynamically during runtime. 
*/ -static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx, - enum bpf_arg_type arg_type, int clone_ref_obj_id) +static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + argno_t argno, int insn_idx, enum bpf_arg_type arg_type, + struct ref_obj_desc *ref_obj, struct bpf_dynptr_desc *dynptr) { - struct bpf_reg_state *reg = reg_state(env, regno); - int err; + int spi, err = 0; if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) { verbose(env, - "arg#%d expected pointer to stack or const struct bpf_dynptr\n", - regno - 1); + "%s expected pointer to stack or const struct bpf_dynptr\n", + reg_arg_name(env, argno)); return -EINVAL; } - /* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an - * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*): - */ - if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) { - verifier_bug(env, "misconfigured dynptr helper type flags"); - return -EFAULT; - } - /* MEM_UNINIT - Points to memory that is an appropriate candidate for * constructing a mutable bpf_dynptr object. * @@ -7479,13 +7241,12 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn * pointing to a region of at least 16 bytes which doesn't * contain an existing bpf_dynptr. * - * MEM_RDONLY - Points to a initialized bpf_dynptr that will not be - * mutated or destroyed. However, the memory it points to - * may be mutated. + * OBJ_RELEASE - Points to a initialized bpf_dynptr that will be + * destroyed. * - * None - Points to a initialized dynptr that can be mutated and - * destroyed, including mutation of the memory it points - * to. + * None - Points to a initialized dynptr that cannot be + * reinitialized or destroyed. However, the view of the + * dynptr and the memory it points to may be mutated. 
*/ if (arg_type & MEM_UNINIT) { int i; @@ -7497,45 +7258,58 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn /* we write BPF_DW bits (8 bytes) at a time */ for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) { - err = check_mem_access(env, insn_idx, regno, + err = check_mem_access(env, insn_idx, reg, argno, i, BPF_DW, BPF_WRITE, -1, false, false); if (err) return err; } - err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, clone_ref_obj_id); - } else /* MEM_RDONLY and None case from above */ { + err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, ref_obj, dynptr); + } else /* OBJ_RELEASE and None case from above */ { /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */ - if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) { - verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n"); + if (reg->type == CONST_PTR_TO_DYNPTR && (arg_type & OBJ_RELEASE)) { + verbose(env, "CONST_PTR_TO_DYNPTR cannot be released\n"); return -EINVAL; } if (!is_dynptr_reg_valid_init(env, reg)) { - verbose(env, - "Expected an initialized dynptr as arg #%d\n", - regno - 1); + verbose(env, "Expected an initialized dynptr as %s\n", + reg_arg_name(env, argno)); return -EINVAL; } - /* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */ - if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) { + /* Fold modifiers (in this case, OBJ_RELEASE) when checking expected type */ + if (!is_dynptr_type_expected(env, reg, arg_type & ~OBJ_RELEASE)) { verbose(env, - "Expected a dynptr of type %s as arg #%d\n", - dynptr_type_str(arg_to_dynptr_type(arg_type)), regno - 1); + "Expected a dynptr of type %s as %s\n", + dynptr_type_str(arg_to_dynptr_type(arg_type)), + reg_arg_name(env, argno)); return -EINVAL; } - err = mark_dynptr_read(env, reg); - } - return err; -} + if (reg->type != CONST_PTR_TO_DYNPTR) { + struct bpf_func_state *state = bpf_func(env, reg); -static u32 
iter_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi) -{ - struct bpf_func_state *state = bpf_func(env, reg); + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return spi; + + /* + * For CONST_PTR_TO_DYNPTR, reg is already scratched by check_reg_arg + * in check_helper_call and mark_btf_func_reg_size in check_kfunc_call. + */ + mark_stack_slots_scratched(env, spi, BPF_DYNPTR_NR_SLOTS); - return state->stack[spi].spilled_ptr.ref_obj_id; + reg = &state->stack[spi].spilled_ptr; + } + + if (dynptr) { + dynptr->type = reg->dynptr.type; + dynptr->id = reg->id; + dynptr->parent_id = reg->parent_id; + } + } + return err; } static bool is_iter_kfunc(struct bpf_kfunc_call_arg_meta *meta) @@ -7567,15 +7341,17 @@ static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg_idx, return btf_param_match_suffix(meta->btf, arg, "__iter"); } -static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx, +static int process_iter_arg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int insn_idx, struct bpf_kfunc_call_arg_meta *meta) { - struct bpf_reg_state *reg = reg_state(env, regno); + struct bpf_func_state *state = bpf_func(env, reg); const struct btf_type *t; + u32 arg_idx = arg_idx_from_argno(argno); int spi, err, i, nr_slots, btf_id; if (reg->type != PTR_TO_STACK) { - verbose(env, "arg#%d expected pointer to an iterator on stack\n", regno - 1); + verbose(env, "%s expected pointer to an iterator on stack\n", + reg_arg_name(env, argno)); return -EINVAL; } @@ -7585,9 +7361,10 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id * to any kfunc, if arg has "__iter" suffix, we need to be a bit more * conservative here. 
*/ - btf_id = btf_check_iter_arg(meta->btf, meta->func_proto, regno - 1); + btf_id = btf_check_iter_arg(meta->btf, meta->func_proto, arg_idx); if (btf_id < 0) { - verbose(env, "expected valid iter pointer as arg #%d\n", regno - 1); + verbose(env, "expected valid iter pointer as %s\n", + reg_arg_name(env, argno)); return -EINVAL; } t = btf_type_by_id(meta->btf, btf_id); @@ -7596,13 +7373,13 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id if (is_iter_new_kfunc(meta)) { /* bpf_iter_<type>_new() expects pointer to uninit iter state */ if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) { - verbose(env, "expected uninitialized iter_%s as arg #%d\n", - iter_type_str(meta->btf, btf_id), regno - 1); + verbose(env, "expected uninitialized iter_%s as %s\n", + iter_type_str(meta->btf, btf_id), reg_arg_name(env, argno)); return -EINVAL; } for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) { - err = check_mem_access(env, insn_idx, regno, + err = check_mem_access(env, insn_idx, reg, argno, i, BPF_DW, BPF_WRITE, -1, false, false); if (err) return err; @@ -7620,8 +7397,8 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id case 0: break; case -EINVAL: - verbose(env, "expected an initialized iter_%s as arg #%d\n", - iter_type_str(meta->btf, btf_id), regno - 1); + verbose(env, "expected an initialized iter_%s as %s\n", + iter_type_str(meta->btf, btf_id), reg_arg_name(env, argno)); return err; case -EPROTO: verbose(env, "expected an RCU CS when using %s\n", meta->func_name); @@ -7634,14 +7411,12 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id if (spi < 0) return spi; - err = mark_iter_read(env, reg, spi, nr_slots); - if (err) - return err; + mark_stack_slots_scratched(env, spi, nr_slots); /* remember meta->iter info for process_iter_next_call() */ meta->iter.spi = spi; meta->iter.frameno = reg->frameno; - meta->ref_obj_id = iter_ref_obj_id(env, reg, spi); + 
update_ref_obj(&meta->ref_obj, &state->stack[spi].spilled_ptr); if (is_iter_destroy_kfunc(meta)) { err = unmark_stack_slots_iter(env, reg, nr_slots); @@ -8041,12 +7816,11 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_DYNPTR] = &dynptr_types, }; -static int check_reg_type(struct bpf_verifier_env *env, u32 regno, +static int check_reg_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, enum bpf_arg_type arg_type, const u32 *arg_btf_id, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *reg = reg_state(env, regno); enum bpf_reg_type expected, type = reg->type; const struct bpf_reg_types *compatible; int i, j, err; @@ -8077,7 +7851,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, type &= ~DYNPTR_TYPE_FLAG_MASK; /* Local kptr types are allowed as the source argument of bpf_kptr_xchg */ - if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type) && regno == BPF_REG_2) { + if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type) && reg_from_argno(argno) == BPF_REG_2) { type &= ~MEM_ALLOC; type &= ~MEM_PERCPU; } @@ -8091,7 +7865,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, goto found; } - verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, reg->type)); + verbose(env, "%s type=%s expected=", reg_arg_name(env, argno), reg_type_str(env, reg->type)); for (j = 0; j + 1 < i; j++) verbose(env, "%s, ", reg_type_str(env, compatible->types[j])); verbose(env, "%s\n", reg_type_str(env, compatible->types[j])); @@ -8104,9 +7878,9 @@ found: if (compatible == &mem_types) { if (!(arg_type & MEM_RDONLY)) { verbose(env, - "%s() may write into memory pointed by R%d type=%s\n", + "%s() may write into memory pointed by %s type=%s\n", func_id_name(meta->func_id), - regno, reg_type_str(env, reg->type)); + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } return 0; @@ -8129,7 +7903,8 @@ found: if 
(type_may_be_null(reg->type) && (!type_may_be_null(arg_type) || arg_type_is_release(arg_type))) { - verbose(env, "Possibly NULL pointer passed to helper arg%d\n", regno); + verbose(env, "Possibly NULL pointer passed to helper %s\n", + reg_arg_name(env, argno)); return -EACCES; } @@ -8142,25 +7917,26 @@ found: } if (meta->func_id == BPF_FUNC_kptr_xchg) { - if (map_kptr_match_type(env, meta->kptr_field, reg, regno)) + if (map_kptr_match_type(env, meta->kptr_field, reg, reg_from_argno(argno))) return -EACCES; } else { if (arg_btf_id == BPF_PTR_POISON) { verbose(env, "verifier internal error:"); - verbose(env, "R%d has non-overwritten BPF_PTR_POISON type\n", - regno); + verbose(env, "%s has non-overwritten BPF_PTR_POISON type\n", + reg_arg_name(env, argno)); return -EACCES; } - err = __check_ptr_off_reg(env, reg, regno, true); + err = __check_ptr_off_reg(env, reg, argno, true); if (err) return err; if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->var_off.value, btf_vmlinux, *arg_btf_id, strict_type_match)) { - verbose(env, "R%d is of type %s but %s is expected\n", - regno, btf_type_name(reg->btf, reg->btf_id), + verbose(env, "%s is of type %s but %s is expected\n", + reg_arg_name(env, argno), + btf_type_name(reg->btf, reg->btf_id), btf_type_name(btf_vmlinux, *arg_btf_id)); return -EACCES; } @@ -8177,8 +7953,11 @@ found: return -EFAULT; } /* Check if local kptr in src arg matches kptr in dst arg */ - if (meta->func_id == BPF_FUNC_kptr_xchg && regno == BPF_REG_2) { - if (map_kptr_match_type(env, meta->kptr_field, reg, regno)) + if (meta->func_id == BPF_FUNC_kptr_xchg) { + int regno = reg_from_argno(argno); + + if (regno == BPF_REG_2 && + map_kptr_match_type(env, meta->kptr_field, reg, regno)) return -EACCES; } break; @@ -8212,7 +7991,7 @@ reg_find_field_offset(const struct bpf_reg_state *reg, s32 off, u32 fields) } static int check_func_arg_reg_off(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno, + const struct bpf_reg_state 
*reg, argno_t argno, enum bpf_arg_type arg_type) { u32 type = reg->type; @@ -8220,7 +7999,7 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env, /* When referenced register is passed to release function, its fixed * offset must be 0. * - * We will check arg_type_is_release reg has ref_obj_id when storing + * We will check arg_type_is_release reg has id when storing * meta->release_regno. */ if (arg_type_is_release(arg_type)) { @@ -8238,8 +8017,8 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env, * to give the user a better error message. */ if (!tnum_is_const(reg->var_off) || reg->var_off.value != 0) { - verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n", - regno); + verbose(env, "%s must have zero offset when passed to release func or trusted arg to kfunc\n", + reg_arg_name(env, argno)); return -EINVAL; } } @@ -8275,7 +8054,7 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env, * cases. var_off always must be 0 for PTR_TO_BTF_ID, hence we * still need to do checks instead of returning. 
*/ - return __check_ptr_off_reg(env, reg, regno, true); + return __check_ptr_off_reg(env, reg, argno, true); case PTR_TO_CTX: /* * Allow fixed and variable offsets for syscall context, but @@ -8287,78 +8066,12 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env, return 0; fallthrough; default: - return __check_ptr_off_reg(env, reg, regno, false); - } -} - -static struct bpf_reg_state *get_dynptr_arg_reg(struct bpf_verifier_env *env, - const struct bpf_func_proto *fn, - struct bpf_reg_state *regs) -{ - struct bpf_reg_state *state = NULL; - int i; - - for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) - if (arg_type_is_dynptr(fn->arg_type[i])) { - if (state) { - verbose(env, "verifier internal error: multiple dynptr args\n"); - return NULL; - } - state = &regs[BPF_REG_1 + i]; - } - - if (!state) - verbose(env, "verifier internal error: no dynptr arg found\n"); - - return state; -} - -static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) -{ - struct bpf_func_state *state = bpf_func(env, reg); - int spi; - - if (reg->type == CONST_PTR_TO_DYNPTR) - return reg->id; - spi = dynptr_get_spi(env, reg); - if (spi < 0) - return spi; - return state->stack[spi].spilled_ptr.id; -} - -static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) -{ - struct bpf_func_state *state = bpf_func(env, reg); - int spi; - - if (reg->type == CONST_PTR_TO_DYNPTR) - return reg->ref_obj_id; - spi = dynptr_get_spi(env, reg); - if (spi < 0) - return spi; - return state->stack[spi].spilled_ptr.ref_obj_id; -} - -static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env, - struct bpf_reg_state *reg) -{ - struct bpf_func_state *state = bpf_func(env, reg); - int spi; - - if (reg->type == CONST_PTR_TO_DYNPTR) - return reg->dynptr.type; - - spi = bpf_get_spi(reg->var_off.value); - if (spi < 0) { - verbose(env, "verifier internal error: invalid spi when querying dynptr type\n"); - return BPF_DYNPTR_TYPE_INVALID; + return 
__check_ptr_off_reg(env, reg, argno, false); } - - return state->stack[spi].spilled_ptr.dynptr.type; } -static int check_reg_const_str(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno) +static int check_arg_const_str(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, argno_t argno) { struct bpf_map *map = reg->map_ptr; int err; @@ -8370,17 +8083,18 @@ static int check_reg_const_str(struct bpf_verifier_env *env, return -EINVAL; if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) { - verbose(env, "R%d points to insn_array map which cannot be used as const string\n", regno); + verbose(env, "%s points to insn_array map which cannot be used as const string\n", + reg_arg_name(env, argno)); return -EACCES; } if (!bpf_map_is_rdonly(map)) { - verbose(env, "R%d does not point to a readonly map'\n", regno); + verbose(env, "%s does not point to a readonly map'\n", reg_arg_name(env, argno)); return -EACCES; } if (!tnum_is_const(reg->var_off)) { - verbose(env, "R%d is not a constant address'\n", regno); + verbose(env, "%s is not a constant address'\n", reg_arg_name(env, argno)); return -EACCES; } @@ -8389,7 +8103,7 @@ static int check_reg_const_str(struct bpf_verifier_env *env, return -EACCES; } - err = check_map_access(env, regno, 0, + err = check_map_access(env, reg, argno, 0, map->value_size - reg->var_off.value, false, ACCESS_HELPER); if (err) @@ -8471,7 +8185,7 @@ static int get_constant_map_key(struct bpf_verifier_env *env, return 0; } -static bool can_elide_value_nullness(enum bpf_map_type type); +static bool can_elide_value_nullness(const struct bpf_map *map); static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_call_arg_meta *meta, @@ -8481,6 +8195,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, u32 regno = BPF_REG_1 + arg; struct bpf_reg_state *reg = reg_state(env, regno); enum bpf_arg_type arg_type = fn->arg_type[arg]; + argno_t argno = argno_from_arg(arg + 1); enum bpf_reg_type type = reg->type; u32 
*arg_btf_id = NULL; u32 key_size; @@ -8525,56 +8240,24 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK) arg_btf_id = fn->arg_btf_id[arg]; - err = check_reg_type(env, regno, arg_type, arg_btf_id, meta); + err = check_reg_type(env, reg, argno_from_reg(regno), arg_type, arg_btf_id, meta); if (err) return err; - err = check_func_arg_reg_off(env, reg, regno, arg_type); + err = check_func_arg_reg_off(env, reg, argno_from_reg(regno), arg_type); if (err) return err; skip_type_check: - if (arg_type_is_release(arg_type)) { - if (arg_type_is_dynptr(arg_type)) { - struct bpf_func_state *state = bpf_func(env, reg); - int spi; - - /* Only dynptr created on stack can be released, thus - * the get_spi and stack state checks for spilled_ptr - * should only be done before process_dynptr_func for - * PTR_TO_STACK. - */ - if (reg->type == PTR_TO_STACK) { - spi = dynptr_get_spi(env, reg); - if (spi < 0 || !state->stack[spi].spilled_ptr.ref_obj_id) { - verbose(env, "arg %d is an unacquired reference\n", regno); - return -EINVAL; - } - } else { - verbose(env, "cannot release unowned const bpf_dynptr\n"); - return -EINVAL; - } - } else if (!reg->ref_obj_id && !bpf_register_is_null(reg)) { - verbose(env, "R%d must be referenced when passed to release function\n", - regno); - return -EINVAL; - } - if (meta->release_regno) { - verifier_bug(env, "more than one release argument"); - return -EFAULT; - } - meta->release_regno = regno; + if (arg_type_is_release(arg_type) && !arg_type_is_dynptr(arg_type) && + !reg_is_referenced(env, reg) && !bpf_register_is_null(reg)) { + verbose(env, "release helper %s expects referenced PTR_TO_BTF_ID passed to %s\n", + func_id_name(meta->func_id), reg_arg_name(env, argno)); + return -EINVAL; } - if (reg->ref_obj_id && base_type(arg_type) != ARG_KPTR_XCHG_DEST) { - if (meta->ref_obj_id) { - verbose(env, "more than one arg with ref_obj_id R%d %u %u", - regno, reg->ref_obj_id, - meta->ref_obj_id); - 
return -EACCES; - } - meta->ref_obj_id = reg->ref_obj_id; - } + if (reg_is_referenced(env, reg)) + update_ref_obj(&meta->ref_obj, reg); switch (base_type(arg_type)) { case ARG_CONST_MAP_PTR: @@ -8618,10 +8301,10 @@ skip_type_check: return -EFAULT; } key_size = meta->map.ptr->key_size; - err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL); + err = check_helper_mem_access(env, reg, argno_from_reg(regno), key_size, BPF_READ, false, NULL); if (err) return err; - if (can_elide_value_nullness(meta->map.ptr->map_type)) { + if (can_elide_value_nullness(meta->map.ptr)) { err = get_constant_map_key(env, reg, key_size, &meta->const_map_key); if (err < 0) { meta->const_map_key = -1; @@ -8645,7 +8328,7 @@ skip_type_check: return -EFAULT; } meta->raw_mode = arg_type & MEM_UNINIT; - err = check_helper_mem_access(env, regno, meta->map.ptr->value_size, + err = check_helper_mem_access(env, reg, argno_from_reg(regno), meta->map.ptr->value_size, arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ, false, meta); break; @@ -8663,11 +8346,11 @@ skip_type_check: return -EACCES; } if (meta->func_id == BPF_FUNC_spin_lock) { - err = process_spin_lock(env, regno, PROCESS_SPIN_LOCK); + err = process_spin_lock(env, reg, argno_from_reg(regno), PROCESS_SPIN_LOCK); if (err) return err; } else if (meta->func_id == BPF_FUNC_spin_unlock) { - err = process_spin_lock(env, regno, 0); + err = process_spin_lock(env, reg, argno_from_reg(regno), 0); if (err) return err; } else { @@ -8676,7 +8359,7 @@ skip_type_check: } break; case ARG_PTR_TO_TIMER: - err = process_timer_helper(env, regno, meta); + err = process_timer_helper(env, reg, argno_from_reg(regno), meta); if (err) return err; break; @@ -8689,7 +8372,7 @@ skip_type_check: */ meta->raw_mode = arg_type & MEM_UNINIT; if (arg_type & MEM_FIXED_SIZE) { - err = check_helper_mem_access(env, regno, fn->arg_size[arg], + err = check_helper_mem_access(env, reg, argno_from_reg(regno), fn->arg_size[arg], arg_type & MEM_WRITE ? 
BPF_WRITE : BPF_READ, false, meta); if (err) @@ -8699,19 +8382,22 @@ skip_type_check: } break; case ARG_CONST_SIZE: - err = check_mem_size_reg(env, reg, regno, + err = check_mem_size_reg(env, reg_state(env, regno - 1), reg, argno_from_reg(regno - 1), + argno_from_reg(regno), fn->arg_type[arg - 1] & MEM_WRITE ? BPF_WRITE : BPF_READ, false, meta); break; case ARG_CONST_SIZE_OR_ZERO: - err = check_mem_size_reg(env, reg, regno, + err = check_mem_size_reg(env, reg_state(env, regno - 1), reg, argno_from_reg(regno - 1), + argno_from_reg(regno), fn->arg_type[arg - 1] & MEM_WRITE ? BPF_WRITE : BPF_READ, true, meta); break; case ARG_PTR_TO_DYNPTR: - err = process_dynptr_func(env, regno, insn_idx, arg_type, 0); + err = process_dynptr_func(env, reg, argno_from_reg(regno), insn_idx, arg_type, &meta->ref_obj, + &meta->dynptr); if (err) return err; break; @@ -8728,7 +8414,7 @@ skip_type_check: break; case ARG_PTR_TO_CONST_STR: { - err = check_reg_const_str(env, reg, regno); + err = check_arg_const_str(env, reg, argno_from_reg(regno)); if (err) return err; break; @@ -9130,11 +8816,29 @@ static bool check_mem_arg_rw_flag_ok(const struct bpf_func_proto *fn) return true; } -static int check_func_proto(const struct bpf_func_proto *fn) +static bool check_proto_release_reg(const struct bpf_func_proto *fn, struct bpf_call_arg_meta *meta) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) { + enum bpf_arg_type arg_type = fn->arg_type[i]; + + if (arg_type_is_release(arg_type)) { + if (meta->release_regno) + return false; + meta->release_regno = i + 1; + } + } + + return true; +} + +static int check_func_proto(const struct bpf_func_proto *fn, struct bpf_call_arg_meta *meta) { return check_raw_mode_ok(fn) && check_arg_pair_ok(fn) && check_mem_arg_rw_flag_ok(fn) && + check_proto_release_reg(fn, meta) && check_btf_id_ok(fn) ? 
0 : -EINVAL; } @@ -9181,14 +8885,14 @@ static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range reg->range = AT_PKT_END; } -static int release_reference_nomark(struct bpf_verifier_state *state, int ref_obj_id) +static int release_reference_nomark(struct bpf_verifier_state *state, int id) { int i; for (i = 0; i < state->acquired_refs; i++) { if (state->refs[i].type != REF_TYPE_PTR) continue; - if (state->refs[i].id == ref_obj_id) { + if (state->refs[i].id == id) { release_reference_state(state, i); return 0; } @@ -9196,26 +8900,83 @@ static int release_reference_nomark(struct bpf_verifier_state *state, int ref_ob return -EINVAL; } -/* The pointer with the specified id has released its reference to kernel - * resources. Identify all copies of the same pointer and clear the reference. - * - * This is the release function corresponding to acquire_reference(). Idempotent. - */ -static int release_reference(struct bpf_verifier_env *env, int ref_obj_id) +static int idstack_push(struct bpf_idmap *idmap, u32 id) { + int i; + + if (!id) + return 0; + + for (i = 0; i < idmap->cnt; i++) + if (idmap->map[i].old == id) + return 0; + + if (WARN_ON_ONCE(idmap->cnt >= BPF_ID_MAP_SIZE)) + return -EFAULT; + + idmap->map[idmap->cnt++].old = id; + return 0; +} + +static int idstack_pop(struct bpf_idmap *idmap) +{ + if (!idmap->cnt) + return 0; + + return idmap->map[--idmap->cnt].old; +} + +/* Release id and objects derived from it iteratively in a DFS manner */ +static int release_reference(struct bpf_verifier_env *env, int id) +{ + u32 mask = (1 << STACK_SPILL) | (1 << STACK_DYNPTR); struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_idmap *idstack = &env->idmap_scratch; + struct bpf_stack_state *stack; struct bpf_func_state *state; struct bpf_reg_state *reg; - int err; + int i, err; - err = release_reference_nomark(vstate, ref_obj_id); + idstack->cnt = 0; + err = idstack_push(idstack, id); if (err) return err; - bpf_for_each_reg_in_vstate(vstate, 
state, reg, ({ - if (reg->ref_obj_id == ref_obj_id) - mark_reg_invalid(env, reg); - })); + if (find_reference_state(vstate, id)) + WARN_ON_ONCE(release_reference_nomark(vstate, id)); + + while ((id = idstack_pop(idstack))) { + /* + * Child references are inaccessible after parent is released, + * any child references that exist at this point are a leak. + */ + for (i = 0; i < vstate->acquired_refs; i++) { + if (vstate->refs[i].type != REF_TYPE_PTR) + continue; + if (vstate->refs[i].parent_id != id) + continue; + verbose(env, "Leaking reference id=%d alloc_insn=%d. Release it first.\n", + vstate->refs[i].id, vstate->refs[i].insn_idx); + return -EINVAL; + } + + bpf_for_each_reg_in_vstate_mask(vstate, state, reg, stack, mask, ({ + if (reg->id != id && reg->parent_id != id) + continue; + + /* Free objects derived from the current object */ + if (reg->parent_id == id) { + err = idstack_push(idstack, reg->id); + if (err) + return err; + } + + if (!stack || stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL) + mark_reg_invalid(env, reg); + else if (stack->slot_type[BPF_REG_SIZE - 1] == STACK_DYNPTR) + invalidate_dynptr(env, stack); + })); + } return 0; } @@ -9231,6 +8992,42 @@ static void invalidate_non_owning_refs(struct bpf_verifier_env *env) })); } +static void invalidate_rcu_protected_refs(struct bpf_verifier_env *env) +{ + struct bpf_stack_state *stack; + struct bpf_func_state *state; + struct bpf_reg_state *reg; + u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER); + + bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, stack, clear_mask, ({ + if (reg->type & MEM_RCU) { + reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL); + reg->type |= PTR_UNTRUSTED; + } + })); +} + +static int ref_convert_alloc_rcu_protected(struct bpf_verifier_env *env, u32 id) +{ + struct bpf_func_state *state; + struct bpf_reg_state *reg; + int err; + + err = release_reference_nomark(env->cur_state, id); + + bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ + if (reg->id != id) + 
continue; + if ((reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU)) { + reg->id = 0; + reg->type &= ~MEM_ALLOC; + reg->type |= MEM_RCU; + } + })); + + return err; +} + static void clear_caller_saved_regs(struct bpf_verifier_env *env, struct bpf_reg_state *regs) { @@ -9243,6 +9040,15 @@ static void clear_caller_saved_regs(struct bpf_verifier_env *env, } } +static void invalidate_outgoing_stack_args(const struct bpf_verifier_env *env, + struct bpf_func_state *state) +{ + int i, nslots = state->out_stack_arg_cnt; + + for (i = 0; i < nslots; i++) + bpf_mark_reg_not_init(env, &state->stack_arg_regs[i]); +} + typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, @@ -9305,11 +9111,23 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs) { struct bpf_subprog_info *sub = subprog_info(env, subprog); + struct bpf_func_state *caller = cur_func(env); struct bpf_verifier_log *log = &env->log; + struct ref_obj_desc ref_obj = {}; u32 i; - int ret; + int ret, err; ret = btf_prepare_func_args(env, subprog); + if (ret) { + if (bpf_in_stack_arg_cnt(sub) > 0) { + err = check_outgoing_stack_args(env, caller, sub->arg_cnt); + if (err) + return err; + } + return ret; + } + + ret = check_outgoing_stack_args(env, caller, sub->arg_cnt); if (ret) return ret; @@ -9317,13 +9135,13 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, * verifier sees. 
*/ for (i = 0; i < sub->arg_cnt; i++) { - u32 regno = i + 1; - struct bpf_reg_state *reg = ®s[regno]; + argno_t argno = argno_from_arg(i + 1); + struct bpf_reg_state *reg = get_func_arg_reg(caller, regs, i); struct bpf_subprog_arg_info *arg = &sub->args[i]; if (arg->arg_type == ARG_ANYTHING) { if (reg->type != SCALAR_VALUE) { - bpf_log(log, "R%d is not a scalar\n", regno); + bpf_log(log, "%s is not a scalar\n", reg_arg_name(env, argno)); return -EINVAL; } } else if (arg->arg_type & PTR_UNTRUSTED) { @@ -9333,24 +9151,26 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, * invalid memory access. */ } else if (arg->arg_type == ARG_PTR_TO_CTX) { - ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_CTX); + ret = check_func_arg_reg_off(env, reg, argno, ARG_PTR_TO_CTX); if (ret < 0) return ret; /* If function expects ctx type in BTF check that caller * is passing PTR_TO_CTX. */ if (reg->type != PTR_TO_CTX) { - bpf_log(log, "arg#%d expects pointer to ctx\n", i); + bpf_log(log, "%s expects pointer to ctx\n", + reg_arg_name(env, argno)); return -EINVAL; } } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) { - ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE); + ret = check_func_arg_reg_off(env, reg, argno, ARG_DONTCARE); if (ret < 0) return ret; - if (check_mem_reg(env, reg, regno, arg->mem_size)) + if (check_mem_reg(env, reg, argno, arg->mem_size)) return -EINVAL; if (!(arg->arg_type & PTR_MAYBE_NULL) && (reg->type & PTR_MAYBE_NULL)) { - bpf_log(log, "arg#%d is expected to be non-NULL\n", i); + bpf_log(log, "%s is expected to be non-NULL\n", + reg_arg_name(env, argno)); return -EINVAL; } } else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) { @@ -9362,15 +9182,16 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, * run-time debug nightmare. 
*/ if (reg->type != PTR_TO_ARENA && reg->type != SCALAR_VALUE) { - bpf_log(log, "R%d is not a pointer to arena or scalar.\n", regno); + bpf_log(log, "%s is not a pointer to arena or scalar.\n", + reg_arg_name(env, argno)); return -EINVAL; } - } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) { - ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_DYNPTR); + } else if (arg->arg_type == ARG_PTR_TO_DYNPTR) { + ret = check_func_arg_reg_off(env, reg, argno, ARG_PTR_TO_DYNPTR); if (ret) return ret; - ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0); + ret = process_dynptr_func(env, reg, argno, -1, arg->arg_type, &ref_obj, NULL); if (ret) return ret; } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) { @@ -9381,12 +9202,13 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, continue; memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */ - err = check_reg_type(env, regno, arg->arg_type, &arg->btf_id, &meta); - err = err ?: check_func_arg_reg_off(env, reg, regno, arg->arg_type); + err = check_reg_type(env, reg, argno, arg->arg_type, &arg->btf_id, &meta); + err = err ?: check_func_arg_reg_off(env, reg, argno, arg->arg_type); if (err) return err; } else { - verifier_bug(env, "unrecognized arg#%d type %d", i, arg->arg_type); + verifier_bug(env, "unrecognized %s type %d", + reg_arg_name(env, argno), arg->arg_type); return -EFAULT; } } @@ -9505,6 +9327,8 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; + struct bpf_subprog_info *caller_info; + u16 callee_incoming, stack_arg_cnt; struct bpf_func_state *caller; int err, subprog, target_insn; @@ -9547,6 +9371,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* mark global subprog for verifying after main prog */ subprog_aux(env, subprog)->called = true; clear_caller_saved_regs(env, caller->regs); + 
invalidate_outgoing_stack_args(env, cur_func(env)); /* All non-void global functions return a 64-bit SCALAR_VALUE. */ if (!subprog_returns_void(env, subprog)) { @@ -9569,6 +9394,16 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return 0; } + /* + * Track caller's total stack arg count (incoming + max outgoing). + * This is needed so the JIT knows how much stack arg space to allocate. + */ + caller_info = &env->subprog_info[caller->subprogno]; + callee_incoming = bpf_in_stack_arg_cnt(&env->subprog_info[subprog]); + stack_arg_cnt = bpf_in_stack_arg_cnt(caller_info) + callee_incoming; + if (stack_arg_cnt > caller_info->stack_arg_cnt) + caller_info->stack_arg_cnt = stack_arg_cnt; + /* for regular function entry setup new frame and continue * from that frame. */ @@ -9852,9 +9687,9 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env) static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg) { if (range.return_32bit) - return range.minval <= reg->s32_min_value && reg->s32_max_value <= range.maxval; + return range.minval <= reg_s32_min(reg) && reg_s32_max(reg) <= range.maxval; else - return range.minval <= reg->smin_value && reg->smax_value <= range.maxval; + return range.minval <= reg_smin(reg) && reg_smax(reg) <= range.maxval; } static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) @@ -9926,6 +9761,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) * bpf_throw, this will be done by copy_verifier_state for extra frames. 
*/ free_func_state(callee); state->frame[state->curframe--] = NULL; + invalidate_outgoing_stack_args(env, caller); /* for callbacks widen imprecise scalars to make programs like below verify: * @@ -9952,7 +9788,9 @@ static int do_refine_retval_range(struct bpf_verifier_env *env, int func_id, struct bpf_call_arg_meta *meta) { + struct bpf_retval_range range; struct bpf_reg_state *ret_reg = ®s[BPF_REG_0]; + enum bpf_prog_type prog_type = resolve_prog_type(env->prog); if (ret_type != RET_INTEGER) return 0; @@ -9963,21 +9801,36 @@ static int do_refine_retval_range(struct bpf_verifier_env *env, case BPF_FUNC_probe_read_str: case BPF_FUNC_probe_read_kernel_str: case BPF_FUNC_probe_read_user_str: - ret_reg->smax_value = meta->msize_max_value; - ret_reg->s32_max_value = meta->msize_max_value; - ret_reg->smin_value = -MAX_ERRNO; - ret_reg->s32_min_value = -MAX_ERRNO; + reg_set_srange64(ret_reg, -MAX_ERRNO, meta->msize_max_value); + reg_set_srange32(ret_reg, -MAX_ERRNO, meta->msize_max_value); reg_bounds_sync(ret_reg); break; case BPF_FUNC_get_smp_processor_id: - ret_reg->umax_value = nr_cpu_ids - 1; - ret_reg->u32_max_value = nr_cpu_ids - 1; - ret_reg->smax_value = nr_cpu_ids - 1; - ret_reg->s32_max_value = nr_cpu_ids - 1; - ret_reg->umin_value = 0; - ret_reg->u32_min_value = 0; - ret_reg->smin_value = 0; - ret_reg->s32_min_value = 0; + reg_set_urange64(ret_reg, 0, nr_cpu_ids - 1); + reg_set_urange32(ret_reg, 0, nr_cpu_ids - 1); + reg_bounds_sync(ret_reg); + break; + case BPF_FUNC_get_retval: + /* + * bpf_get_retval may see arbitrary value passed by bpf_prog_run_array_cg for + * CGROUP_GETSOCKOPT type. 
+ */ + if (prog_type == BPF_PROG_TYPE_CGROUP_SOCKOPT && + env->prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT) + break; + + if (prog_type == BPF_PROG_TYPE_LSM && + env->prog->expected_attach_type == BPF_LSM_CGROUP) { + if (!env->prog->aux->attach_func_proto->type) + break; + bpf_lsm_get_retval_range(env->prog, &range); + } else { + range.minval = -MAX_ERRNO; + range.maxval = 0; + } + + reg_set_srange64(ret_reg, range.minval, range.maxval); + reg_set_srange32(ret_reg, range.minval, range.maxval); reg_bounds_sync(ret_reg); break; } @@ -10086,7 +9939,7 @@ static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exi * kernel. Type checks are performed later in check_return_code. */ if (type == BPF_PROG_TYPE_STRUCT_OPS && !exception_exit && - reg->ref_obj_id == state->refs[i].id) + reg->id == state->refs[i].id) continue; verbose(env, "Unreleased reference id=%d alloc_insn=%d\n", state->refs[i].id, state->refs[i].insn_idx); @@ -10221,13 +10074,16 @@ static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno state->callback_subprogno == subprogno); } -/* Returns whether or not the given map type can potentially elide +/* Returns whether or not the given map can potentially elide * lookup return value nullness check. This is possible if the key * is statically known. 
*/ -static bool can_elide_value_nullness(enum bpf_map_type type) +static bool can_elide_value_nullness(const struct bpf_map *map) { - switch (type) { + if (map->map_flags & BPF_F_INNER_MAP) + return false; + + switch (map->map_type) { case BPF_MAP_TYPE_ARRAY: case BPF_MAP_TYPE_PERCPU_ARRAY: return true; @@ -10272,6 +10128,24 @@ static const char *non_sleepable_context_description(struct bpf_verifier_env *en return "non-sleepable prog"; } +static int release_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + bool convert_rcu, bool release_dynptr) +{ + int err = -EINVAL; + + if (bpf_register_is_null(reg)) + return 0; + + if (release_dynptr) + err = unmark_stack_slots_dynptr(env, reg); + else if (convert_rcu) + err = ref_convert_alloc_rcu_protected(env, reg->id); + else if (reg_is_referenced(env, reg)) + err = release_reference(env, reg->id); + + return err; +} + static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -10321,7 +10195,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; - err = check_func_proto(fn); + err = check_func_proto(fn, &meta); if (err) { verifier_bug(env, "incorrect func proto %s#%d", func_id_name(func_id), func_id); return err; @@ -10353,55 +10227,26 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (err) return err; + regs = cur_regs(env); + /* Mark slots with STACK_MISC in case of raw mode, stack offset * is inferred from register state. 
*/ for (i = 0; i < meta.access_size; i++) { - err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, + err = check_mem_access(env, insn_idx, regs + meta.regno, argno_from_reg(meta.regno), i, BPF_B, BPF_WRITE, -1, false, false); if (err) return err; } - regs = cur_regs(env); - if (meta.release_regno) { - err = -EINVAL; - if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) { - err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]); - } else if (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj_id) { - u32 ref_obj_id = meta.ref_obj_id; - bool in_rcu = in_rcu_cs(env); - struct bpf_func_state *state; - struct bpf_reg_state *reg; - - err = release_reference_nomark(env->cur_state, ref_obj_id); - if (!err) { - bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ - if (reg->ref_obj_id == ref_obj_id) { - if (in_rcu && (reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU)) { - reg->ref_obj_id = 0; - reg->type &= ~MEM_ALLOC; - reg->type |= MEM_RCU; - } else { - mark_reg_invalid(env, reg); - } - } - })); - } - } else if (meta.ref_obj_id) { - err = release_reference(env, meta.ref_obj_id); - } else if (bpf_register_is_null(®s[meta.release_regno])) { - /* meta.ref_obj_id can only be 0 if register that is meant to be - * released is NULL, which must be > R0. 
- */ - err = 0; - } - if (err) { - verbose(env, "func %s#%d reference has not been acquired before\n", - func_id_name(func_id), func_id); + struct bpf_reg_state *reg = ®s[meta.release_regno]; + bool convert_rcu = (func_id == BPF_FUNC_kptr_xchg) && in_rcu_cs(env) && + (reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU); + + err = release_reg(env, reg, convert_rcu, !!meta.dynptr.id); + if (err) return err; - } } switch (func_id) { @@ -10442,7 +10287,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn err = mark_chain_precision(env, BPF_REG_1); if (err) return err; - if (cur_func(env)->callback_depth < regs[BPF_REG_1].umax_value) { + if (cur_func(env)->callback_depth < reg_umax(®s[BPF_REG_1])) { err = push_callback_call(env, insn, insn_idx, meta.subprogno, set_loop_callback_state); } else { @@ -10460,6 +10305,24 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } break; case BPF_FUNC_set_retval: + { + struct bpf_retval_range range = { + .minval = -MAX_ERRNO, + .maxval = 0, + .return_32bit = true + }; + struct bpf_reg_state *r1 = ®s[BPF_REG_1]; + + if (r1->type != SCALAR_VALUE) { + verbose(env, "R1 is not a scalar\n"); + return -EINVAL; + } + + /* CGROUP_GETSOCKOPT is allowed to return arbitrary value */ + if (prog_type == BPF_PROG_TYPE_CGROUP_SOCKOPT && + env->prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT) + break; + if (prog_type == BPF_PROG_TYPE_LSM && env->prog->expected_attach_type == BPF_LSM_CGROUP) { if (!env->prog->aux->attach_func_proto->type) { @@ -10469,54 +10332,24 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn verbose(env, "BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); return -EINVAL; } + bpf_lsm_get_retval_range(env->prog, &range); } - break; - case BPF_FUNC_dynptr_data: - { - struct bpf_reg_state *reg; - int id, ref_obj_id; - reg = get_dynptr_arg_reg(env, fn, regs); - if (!reg) - return -EFAULT; - - - if 
(meta.dynptr_id) { - verifier_bug(env, "meta.dynptr_id already set"); - return -EFAULT; - } - if (meta.ref_obj_id) { - verifier_bug(env, "meta.ref_obj_id already set"); - return -EFAULT; - } - - id = dynptr_id(env, reg); - if (id < 0) { - verifier_bug(env, "failed to obtain dynptr id"); - return id; - } + err = mark_chain_precision(env, BPF_REG_1); + if (err) + return err; - ref_obj_id = dynptr_ref_obj_id(env, reg); - if (ref_obj_id < 0) { - verifier_bug(env, "failed to obtain dynptr ref_obj_id"); - return ref_obj_id; + if (!retval_range_within(range, r1)) { + verbose_invalid_scalar(env, r1, range, "At bpf_set_retval", "R1"); + return -EINVAL; } - meta.dynptr_id = id; - meta.ref_obj_id = ref_obj_id; - break; } case BPF_FUNC_dynptr_write: { - enum bpf_dynptr_type dynptr_type; - struct bpf_reg_state *reg; + enum bpf_dynptr_type dynptr_type = meta.dynptr.type; - reg = get_dynptr_arg_reg(env, fn, regs); - if (!reg) - return -EFAULT; - - dynptr_type = dynptr_get_type(env, reg); if (dynptr_type == BPF_DYNPTR_TYPE_INVALID) return -EFAULT; @@ -10560,6 +10393,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn bpf_mark_reg_not_init(env, ®s[caller_saved[i]]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } + invalidate_outgoing_stack_args(env, cur_func(env)); /* helper call returns 64-bit value. 
*/ regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; @@ -10589,7 +10423,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } if (func_id == BPF_FUNC_map_lookup_elem && - can_elide_value_nullness(meta.map.ptr->map_type) && + can_elide_value_nullness(meta.map.ptr) && meta.const_map_key >= 0 && meta.const_map_key < meta.map.ptr->max_entries) ret_flag &= ~PTR_MAYBE_NULL; @@ -10701,29 +10535,45 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (type_may_be_null(regs[BPF_REG_0].type)) regs[BPF_REG_0].id = ++env->id_gen; - if (helper_multiple_ref_obj_use(func_id, meta.map.ptr)) { - verifier_bug(env, "func %s#%d sets ref_obj_id more than once", - func_id_name(func_id), func_id); - return -EFAULT; - } + if (is_ptr_cast_function(func_id) && + find_reference_state(env->cur_state, meta.ref_obj.id)) { + struct bpf_verifier_state *branch; + struct bpf_reg_state *r0; - if (is_dynptr_ref_function(func_id)) - regs[BPF_REG_0].dynptr_id = meta.dynptr_id; + err = validate_ref_obj(env, &meta.ref_obj); + if (err) + return err; - if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) { - /* For release_reference() */ - regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; + /* + * In order for a release of any of the original or cast pointers + * to invalidate all other pointers, reuse the same reference id for + * the cast result. + * This reference id can't be used for nullness propagation, + * as cast might return NULL for a non-NULL input. + * Hence, explore the NULL case as a separate branch. 
+ */ + branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); + if (IS_ERR(branch)) + return PTR_ERR(branch); + + r0 = &branch->frame[branch->curframe]->regs[BPF_REG_0]; + __mark_reg_known_zero(r0); + r0->type = SCALAR_VALUE; + + regs[BPF_REG_0].type &= ~PTR_MAYBE_NULL; + regs[BPF_REG_0].id = meta.ref_obj.id; } else if (is_acquire_function(func_id, meta.map.ptr)) { - int id = acquire_reference(env, insn_idx); + int id = acquire_reference(env, insn_idx, 0); if (id < 0) return id; - /* For mark_ptr_or_null_reg() */ + regs[BPF_REG_0].id = id; - /* For release_reference() */ - regs[BPF_REG_0].ref_obj_id = id; } + if (func_id == BPF_FUNC_dynptr_data) + regs[BPF_REG_0].parent_id = meta.dynptr.id; + err = do_refine_retval_range(env, regs, fn->ret_type, func_id, &meta); if (err) return err; @@ -10819,7 +10669,6 @@ static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta) return meta->kfunc_flags & KF_RELEASE; } - static bool is_kfunc_destructive(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & KF_DESTRUCTIVE; @@ -10896,6 +10745,11 @@ static bool is_kfunc_arg_nullable(const struct btf *btf, const struct btf_param return btf_param_match_suffix(btf, arg, "__nullable"); } +static bool is_kfunc_arg_nonown_allowed(const struct btf *btf, const struct btf_param *arg) +{ + return btf_param_match_suffix(btf, arg, "__nonown_allowed"); +} + static bool is_kfunc_arg_const_str(const struct btf *btf, const struct btf_param *arg) { return btf_param_match_suffix(btf, arg, "__str"); @@ -11136,10 +10990,15 @@ enum special_kfunc_type { KF_bpf_list_push_front, KF_bpf_list_push_back_impl, KF_bpf_list_push_back, + KF_bpf_list_add, KF_bpf_list_pop_front, KF_bpf_list_pop_back, + KF_bpf_list_del, KF_bpf_list_front, KF_bpf_list_back, + KF_bpf_list_is_first, + KF_bpf_list_is_last, + KF_bpf_list_empty, KF_bpf_cast_to_kern_ctx, KF_bpf_rdonly_cast, KF_bpf_rcu_read_lock, @@ -11204,10 +11063,15 @@ BTF_ID(func, bpf_list_push_front_impl) BTF_ID(func, 
bpf_list_push_front) BTF_ID(func, bpf_list_push_back_impl) BTF_ID(func, bpf_list_push_back) +BTF_ID(func, bpf_list_add) BTF_ID(func, bpf_list_pop_front) BTF_ID(func, bpf_list_pop_back) +BTF_ID(func, bpf_list_del) BTF_ID(func, bpf_list_front) BTF_ID(func, bpf_list_back) +BTF_ID(func, bpf_list_is_first) +BTF_ID(func, bpf_list_is_last) +BTF_ID(func, bpf_list_empty) BTF_ID(func, bpf_cast_to_kern_ctx) BTF_ID(func, bpf_rdonly_cast) BTF_ID(func, bpf_rcu_read_lock) @@ -11319,7 +11183,8 @@ static bool is_bpf_list_push_kfunc(u32 func_id) return func_id == special_kfunc_list[KF_bpf_list_push_front] || func_id == special_kfunc_list[KF_bpf_list_push_front_impl] || func_id == special_kfunc_list[KF_bpf_list_push_back] || - func_id == special_kfunc_list[KF_bpf_list_push_back_impl]; + func_id == special_kfunc_list[KF_bpf_list_push_back_impl] || + func_id == special_kfunc_list[KF_bpf_list_add]; } static bool is_bpf_rbtree_add_kfunc(u32 func_id) @@ -11368,15 +11233,12 @@ bool bpf_is_kfunc_pkt_changing(struct bpf_kfunc_call_arg_meta *meta) } static enum kfunc_ptr_arg_type -get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, - struct bpf_kfunc_call_arg_meta *meta, +get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, struct bpf_func_state *caller, + struct bpf_reg_state *regs, struct bpf_kfunc_call_arg_meta *meta, const struct btf_type *t, const struct btf_type *ref_t, const char *ref_tname, const struct btf_param *args, - int argno, int nargs) + int arg, int nargs, argno_t argno, struct bpf_reg_state *reg) { - u32 regno = argno + 1; - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = ®s[regno]; bool arg_mem_size = false; if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] || @@ -11384,9 +11246,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, meta->func_id == special_kfunc_list[KF_bpf_session_cookie]) return KF_ARG_PTR_TO_CTX; - if (argno + 1 < nargs && - (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]) || - 
is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]))) + if (arg + 1 < nargs && + (is_kfunc_arg_mem_size(meta->btf, &args[arg + 1], get_func_arg_reg(caller, regs, arg + 1)) || + is_kfunc_arg_const_mem_size(meta->btf, &args[arg + 1], get_func_arg_reg(caller, regs, arg + 1)))) arg_mem_size = true; /* In this function, we verify the kfunc's BTF as per the argument type, @@ -11394,68 +11256,69 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, * type to our caller. When a set of conditions hold in the BTF type of * arguments, we resolve it to a known kfunc_ptr_arg_type. */ - if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno)) + if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), arg)) return KF_ARG_PTR_TO_CTX; - if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && bpf_register_is_null(reg) && + if (is_kfunc_arg_nullable(meta->btf, &args[arg]) && bpf_register_is_null(reg) && !arg_mem_size) return KF_ARG_PTR_TO_NULL; - if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno])) + if (is_kfunc_arg_alloc_obj(meta->btf, &args[arg])) return KF_ARG_PTR_TO_ALLOC_BTF_ID; - if (is_kfunc_arg_refcounted_kptr(meta->btf, &args[argno])) + if (is_kfunc_arg_refcounted_kptr(meta->btf, &args[arg])) return KF_ARG_PTR_TO_REFCOUNTED_KPTR; - if (is_kfunc_arg_dynptr(meta->btf, &args[argno])) + if (is_kfunc_arg_dynptr(meta->btf, &args[arg])) return KF_ARG_PTR_TO_DYNPTR; - if (is_kfunc_arg_iter(meta, argno, &args[argno])) + if (is_kfunc_arg_iter(meta, arg, &args[arg])) return KF_ARG_PTR_TO_ITER; - if (is_kfunc_arg_list_head(meta->btf, &args[argno])) + if (is_kfunc_arg_list_head(meta->btf, &args[arg])) return KF_ARG_PTR_TO_LIST_HEAD; - if (is_kfunc_arg_list_node(meta->btf, &args[argno])) + if (is_kfunc_arg_list_node(meta->btf, &args[arg])) return KF_ARG_PTR_TO_LIST_NODE; - if (is_kfunc_arg_rbtree_root(meta->btf, &args[argno])) + if (is_kfunc_arg_rbtree_root(meta->btf, &args[arg])) return KF_ARG_PTR_TO_RB_ROOT; - if 
(is_kfunc_arg_rbtree_node(meta->btf, &args[argno])) + if (is_kfunc_arg_rbtree_node(meta->btf, &args[arg])) return KF_ARG_PTR_TO_RB_NODE; - if (is_kfunc_arg_const_str(meta->btf, &args[argno])) + if (is_kfunc_arg_const_str(meta->btf, &args[arg])) return KF_ARG_PTR_TO_CONST_STR; - if (is_kfunc_arg_map(meta->btf, &args[argno])) + if (is_kfunc_arg_map(meta->btf, &args[arg])) return KF_ARG_PTR_TO_MAP; - if (is_kfunc_arg_wq(meta->btf, &args[argno])) + if (is_kfunc_arg_wq(meta->btf, &args[arg])) return KF_ARG_PTR_TO_WORKQUEUE; - if (is_kfunc_arg_timer(meta->btf, &args[argno])) + if (is_kfunc_arg_timer(meta->btf, &args[arg])) return KF_ARG_PTR_TO_TIMER; - if (is_kfunc_arg_task_work(meta->btf, &args[argno])) + if (is_kfunc_arg_task_work(meta->btf, &args[arg])) return KF_ARG_PTR_TO_TASK_WORK; - if (is_kfunc_arg_irq_flag(meta->btf, &args[argno])) + if (is_kfunc_arg_irq_flag(meta->btf, &args[arg])) return KF_ARG_PTR_TO_IRQ_FLAG; - if (is_kfunc_arg_res_spin_lock(meta->btf, &args[argno])) + if (is_kfunc_arg_res_spin_lock(meta->btf, &args[arg])) return KF_ARG_PTR_TO_RES_SPIN_LOCK; if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { if (!btf_type_is_struct(ref_t)) { - verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n", - meta->func_name, argno, btf_type_str(ref_t), ref_tname); + verbose(env, "kernel function %s %s pointer type %s %s is not supported\n", + meta->func_name, reg_arg_name(env, argno), + btf_type_str(ref_t), ref_tname); return -EINVAL; } return KF_ARG_PTR_TO_BTF_ID; } - if (is_kfunc_arg_callback(env, meta->btf, &args[argno])) + if (is_kfunc_arg_callback(env, meta->btf, &args[arg])) return KF_ARG_PTR_TO_CALLBACK; /* This is the catch all argument type of register types supported by @@ -11465,8 +11328,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, */ if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(env, meta->btf, ref_t, 0) && (arg_mem_size ? 
!btf_type_is_void(ref_t) : 1)) { - verbose(env, "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n", - argno, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : ""); + verbose(env, "%s pointer type %s %s must point to %sscalar, or struct with scalar\n", + reg_arg_name(env, argno), + btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : ""); return -EINVAL; } return arg_mem_size ? KF_ARG_PTR_TO_MEM_SIZE : KF_ARG_PTR_TO_MEM; @@ -11477,7 +11341,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, const struct btf_type *ref_t, const char *ref_tname, u32 ref_id, struct bpf_kfunc_call_arg_meta *meta, - int argno) + int arg, argno_t argno) { const struct btf_type *reg_ref_t; bool strict_type_match = false; @@ -11519,7 +11383,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, * btf_struct_ids_match() to walk the struct at the 0th offset, and * resolve types. */ - if ((is_kfunc_release(meta) && reg->ref_obj_id) || + if ((is_kfunc_release(meta) && reg_is_referenced(env, reg)) || btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id)) strict_type_match = true; @@ -11535,19 +11399,19 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, */ taking_projection = btf_is_projection_of(ref_tname, reg_ref_tname); if (!taking_projection && !struct_same) { - verbose(env, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n", - meta->func_name, argno, btf_type_str(ref_t), ref_tname, argno + 1, + verbose(env, "kernel function %s %s expected pointer to %s %s but %s has a pointer to %s %s\n", + meta->func_name, reg_arg_name(env, argno), + btf_type_str(ref_t), ref_tname, reg_arg_name(env, argno), btf_type_str(reg_ref_t), reg_ref_tname); return -EINVAL; } return 0; } -static int process_irq_flag(struct bpf_verifier_env *env, int regno, +static int process_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, 
struct bpf_kfunc_call_arg_meta *meta) { - struct bpf_reg_state *reg = reg_state(env, regno); - int err, kfunc_class = IRQ_NATIVE_KFUNC; + int err, spi, kfunc_class = IRQ_NATIVE_KFUNC; bool irq_save; if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save] || @@ -11567,11 +11431,13 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno, if (irq_save) { if (!is_irq_flag_reg_valid_uninit(env, reg)) { - verbose(env, "expected uninitialized irq flag as arg#%d\n", regno - 1); + verbose(env, "expected uninitialized irq flag as %s\n", + reg_arg_name(env, argno)); return -EINVAL; } - err = check_mem_access(env, env->insn_idx, regno, 0, BPF_DW, BPF_WRITE, -1, false, false); + err = check_mem_access(env, env->insn_idx, reg, argno, 0, BPF_DW, + BPF_WRITE, -1, false, false); if (err) return err; @@ -11581,13 +11447,16 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno, } else { err = is_irq_flag_reg_valid_init(env, reg); if (err) { - verbose(env, "expected an initialized irq flag as arg#%d\n", regno - 1); + verbose(env, "expected an initialized irq flag as %s\n", + reg_arg_name(env, argno)); return err; } - err = mark_irq_flag_read(env, reg); - if (err) - return err; + spi = irq_flag_get_spi(env, reg); + if (spi < 0) + return spi; + + mark_stack_slots_scratched(env, spi, 1); err = unmark_stack_slot_irq_flag(env, reg, kfunc_class); if (err) @@ -11618,36 +11487,21 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state return 0; } -static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_obj_id) +static void ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 id) { - struct bpf_verifier_state *state = env->cur_state; struct bpf_func_state *unused; struct bpf_reg_state *reg; - int i; - - if (!ref_obj_id) { - verifier_bug(env, "ref_obj_id is zero for owning -> non-owning conversion"); - return -EFAULT; - } - for (i = 0; i < state->acquired_refs; i++) { - if (state->refs[i].id != 
ref_obj_id) - continue; + WARN_ON_ONCE(release_reference_nomark(env->cur_state, id)); - /* Clear ref_obj_id here so release_reference doesn't clobber - * the whole reg - */ - bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({ - if (reg->ref_obj_id == ref_obj_id) { - reg->ref_obj_id = 0; - ref_set_non_owning(env, reg); - } - })); - return 0; - } + bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({ + if (reg->id == id) { + reg->id = 0; + ref_set_non_owning(env, reg); + } + })); - verifier_bug(env, "ref state missing for ref_obj_id"); - return -EFAULT; + return; } /* Implementation details: @@ -11728,8 +11582,12 @@ static bool is_bpf_list_api_kfunc(u32 btf_id) return is_bpf_list_push_kfunc(btf_id) || btf_id == special_kfunc_list[KF_bpf_list_pop_front] || btf_id == special_kfunc_list[KF_bpf_list_pop_back] || + btf_id == special_kfunc_list[KF_bpf_list_del] || btf_id == special_kfunc_list[KF_bpf_list_front] || - btf_id == special_kfunc_list[KF_bpf_list_back]; + btf_id == special_kfunc_list[KF_bpf_list_back] || + btf_id == special_kfunc_list[KF_bpf_list_is_first] || + btf_id == special_kfunc_list[KF_bpf_list_is_last] || + btf_id == special_kfunc_list[KF_bpf_list_empty]; } static bool is_bpf_rbtree_api_kfunc(u32 btf_id) @@ -11850,7 +11708,10 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env, switch (node_field_type) { case BPF_LIST_NODE: - ret = is_bpf_list_push_kfunc(kfunc_btf_id); + ret = is_bpf_list_push_kfunc(kfunc_btf_id) || + kfunc_btf_id == special_kfunc_list[KF_bpf_list_del] || + kfunc_btf_id == special_kfunc_list[KF_bpf_list_is_first] || + kfunc_btf_id == special_kfunc_list[KF_bpf_list_is_last]; break; case BPF_RB_NODE: ret = (is_bpf_rbtree_add_kfunc(kfunc_btf_id) || @@ -11872,7 +11733,7 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env, static int __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, + struct bpf_reg_state *reg, argno_t argno, struct 
bpf_kfunc_call_arg_meta *meta, enum btf_field_type head_field_type, struct btf_field **head_field) @@ -11893,8 +11754,8 @@ __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env, head_type_name = btf_field_type_name(head_field_type); if (!tnum_is_const(reg->var_off)) { verbose(env, - "R%d doesn't have constant offset. %s has to be at the constant offset\n", - regno, head_type_name); + "%s doesn't have constant offset. %s has to be at the constant offset\n", + reg_arg_name(env, argno), head_type_name); return -EINVAL; } @@ -11922,24 +11783,24 @@ __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env, } static int process_kf_arg_ptr_to_list_head(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, + struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta) { - return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_LIST_HEAD, + return __process_kf_arg_ptr_to_graph_root(env, reg, argno, meta, BPF_LIST_HEAD, &meta->arg_list_head.field); } static int process_kf_arg_ptr_to_rbtree_root(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, + struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta) { - return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_RB_ROOT, + return __process_kf_arg_ptr_to_graph_root(env, reg, argno, meta, BPF_RB_ROOT, &meta->arg_rbtree_root.field); } static int __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, + struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta, enum btf_field_type head_field_type, enum btf_field_type node_field_type, @@ -11961,8 +11822,8 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env, node_type_name = btf_field_type_name(node_field_type); if (!tnum_is_const(reg->var_off)) { verbose(env, - "R%d doesn't have constant offset. 
%s has to be at the constant offset\n", - regno, node_type_name); + "%s doesn't have constant offset. %s has to be at the constant offset\n", + reg_arg_name(env, argno), node_type_name); return -EINVAL; } @@ -12003,19 +11864,19 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env, } static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, + struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta) { - return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta, + return __process_kf_arg_ptr_to_graph_node(env, reg, argno, meta, BPF_LIST_HEAD, BPF_LIST_NODE, &meta->arg_list_head.field); } static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, + struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta) { - return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta, + return __process_kf_arg_ptr_to_graph_node(env, reg, argno, meta, BPF_RB_ROOT, BPF_RB_NODE, &meta->arg_rbtree_root.field); } @@ -12046,6 +11907,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ int insn_idx) { const char *func_name = meta->func_name, *ref_tname; + struct bpf_func_state *caller = cur_func(env); + struct bpf_reg_state *regs = cur_regs(env); const struct btf *btf = meta->btf; const struct btf_param *args; struct btf_record *rec; @@ -12054,20 +11917,31 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ args = (const struct btf_param *)(meta->func_proto + 1); nargs = btf_type_vlen(meta->func_proto); - if (nargs > MAX_BPF_FUNC_REG_ARGS) { + if (nargs > MAX_BPF_FUNC_ARGS) { verbose(env, "Function %s has %d > %d args\n", func_name, nargs, - MAX_BPF_FUNC_REG_ARGS); + MAX_BPF_FUNC_ARGS); return -EINVAL; } + if (nargs > MAX_BPF_FUNC_REG_ARGS && !bpf_jit_supports_stack_args()) { + verbose(env, "JIT does not support kfunc %s() with %d args\n", + 
func_name, nargs); + return -ENOTSUPP; + } + + ret = check_outgoing_stack_args(env, caller, nargs); + if (ret) + return ret; /* Check that BTF function arguments match actual types that the * verifier sees. */ for (i = 0; i < nargs; i++) { - struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[i + 1]; + struct bpf_reg_state *reg = get_func_arg_reg(caller, regs, i); const struct btf_type *t, *ref_t, *resolve_ret; enum bpf_arg_type arg_type = ARG_DONTCARE; - u32 regno = i + 1, ref_id, type_size; + argno_t argno = argno_from_arg(i + 1); + int regno = reg_from_argno(argno); + u32 ref_id, type_size; bool is_ret_buf_sz = false; int kf_arg_type; @@ -12077,6 +11951,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verifier_bug(env, "Only 1 prog->aux argument supported per-kfunc"); return -EFAULT; } + if (regno < 0) { + verbose(env, "%s prog->aux cannot be a stack argument\n", + reg_arg_name(env, argno)); + return -EINVAL; + } meta->arg_prog = true; cur_aux(env)->arg_prog = regno; continue; @@ -12089,7 +11968,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (btf_type_is_scalar(t)) { if (reg->type != SCALAR_VALUE) { - verbose(env, "R%d is not a scalar\n", regno); + verbose(env, "%s is not a scalar\n", reg_arg_name(env, argno)); return -EINVAL; } @@ -12099,10 +11978,14 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EFAULT; } if (!tnum_is_const(reg->var_off)) { - verbose(env, "R%d must be a known constant\n", regno); + verbose(env, "%s must be a known constant\n", + reg_arg_name(env, argno)); return -EINVAL; } - ret = mark_chain_precision(env, regno); + if (regno >= 0) + ret = mark_chain_precision(env, regno); + else + ret = mark_stack_arg_precision(env, i); if (ret < 0) return ret; meta->arg_constant.found = true; @@ -12121,12 +12004,16 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } if (!tnum_is_const(reg->var_off)) 
{ - verbose(env, "R%d is not a const\n", regno); + verbose(env, "%s is not a const\n", + reg_arg_name(env, argno)); return -EINVAL; } meta->r0_size = reg->var_off.value; - ret = mark_chain_precision(env, regno); + if (regno >= 0) + ret = mark_chain_precision(env, regno); + else + ret = mark_stack_arg_precision(env, i); if (ret) return ret; } @@ -12134,32 +12021,33 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } if (!btf_type_is_ptr(t)) { - verbose(env, "Unrecognized arg#%d type %s\n", i, btf_type_str(t)); + verbose(env, "Unrecognized %s type %s\n", + reg_arg_name(env, argno), btf_type_str(t)); return -EINVAL; } if ((bpf_register_is_null(reg) || type_may_be_null(reg->type)) && !is_kfunc_arg_nullable(meta->btf, &args[i])) { - verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i); + verbose(env, "Possibly NULL pointer passed to trusted %s\n", + reg_arg_name(env, argno)); return -EACCES; } - if (reg->ref_obj_id) { - if (is_kfunc_release(meta) && meta->ref_obj_id) { - verifier_bug(env, "more than one arg with ref_obj_id R%d %u %u", - regno, reg->ref_obj_id, - meta->ref_obj_id); - return -EFAULT; - } - meta->ref_obj_id = reg->ref_obj_id; - if (is_kfunc_release(meta)) - meta->release_regno = regno; + if (regno == meta->release_regno && !is_kfunc_arg_dynptr(meta->btf, &args[i]) && + !reg_is_referenced(env, reg) && !bpf_register_is_null(reg)) { + verbose(env, "release kfunc %s expects referenced PTR_TO_BTF_ID passed to %s\n", + func_name, reg_arg_name(env, argno)); + return -EINVAL; } + if (reg_is_referenced(env, reg)) + update_ref_obj(&meta->ref_obj, reg); + ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); ref_tname = btf_name_by_offset(btf, ref_t->name_off); - kf_arg_type = get_kfunc_ptr_arg_type(env, meta, t, ref_t, ref_tname, args, i, nargs); + kf_arg_type = get_kfunc_ptr_arg_type(env, caller, regs, meta, t, ref_t, ref_tname, + args, i, nargs, argno, reg); if (kf_arg_type < 0) return kf_arg_type; @@ -12168,7 
+12056,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ continue; case KF_ARG_PTR_TO_MAP: if (!reg->map_ptr) { - verbose(env, "pointer in R%d isn't map pointer\n", regno); + verbose(env, "pointer in %s isn't map pointer\n", + reg_arg_name(env, argno)); return -EINVAL; } if (meta->map.ptr && (reg->map_ptr->record->wq_off >= 0 || @@ -12204,18 +12093,19 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ fallthrough; case KF_ARG_PTR_TO_ALLOC_BTF_ID: case KF_ARG_PTR_TO_BTF_ID: - if (!is_trusted_reg(reg)) { + if (!is_trusted_reg(env, reg)) { if (!is_kfunc_rcu(meta)) { - verbose(env, "R%d must be referenced or trusted\n", regno); + verbose(env, "%s must be referenced or trusted\n", + reg_arg_name(env, argno)); return -EINVAL; } if (!is_rcu_reg(reg)) { - verbose(env, "R%d must be a rcu pointer\n", regno); + verbose(env, "%s must be a rcu pointer\n", + reg_arg_name(env, argno)); return -EINVAL; } } fallthrough; - case KF_ARG_PTR_TO_DYNPTR: case KF_ARG_PTR_TO_ITER: case KF_ARG_PTR_TO_LIST_HEAD: case KF_ARG_PTR_TO_LIST_NODE: @@ -12232,6 +12122,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_IRQ_FLAG: case KF_ARG_PTR_TO_RES_SPIN_LOCK: break; + case KF_ARG_PTR_TO_DYNPTR: + arg_type = ARG_PTR_TO_DYNPTR; + break; case KF_ARG_PTR_TO_CTX: arg_type = ARG_PTR_TO_CTX; break; @@ -12240,17 +12133,17 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EFAULT; } - if (is_kfunc_release(meta) && reg->ref_obj_id) + if (regno == meta->release_regno) arg_type |= OBJ_RELEASE; - ret = check_func_arg_reg_off(env, reg, regno, arg_type); + ret = check_func_arg_reg_off(env, reg, argno, arg_type); if (ret < 0) return ret; switch (kf_arg_type) { case KF_ARG_PTR_TO_CTX: if (reg->type != PTR_TO_CTX) { - verbose(env, "arg#%d expected pointer to ctx, but got %s\n", - i, reg_type_str(env, reg->type)); + verbose(env, "%s expected pointer to 
ctx, but got %s\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EINVAL; } @@ -12264,19 +12157,22 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_ALLOC_BTF_ID: if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC)) { if (!is_bpf_obj_drop_kfunc(meta->func_id)) { - verbose(env, "arg#%d expected for bpf_obj_drop()\n", i); + verbose(env, "%s expected for bpf_obj_drop()\n", + reg_arg_name(env, argno)); return -EINVAL; } } else if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC | MEM_PERCPU)) { if (!is_bpf_percpu_obj_drop_kfunc(meta->func_id)) { - verbose(env, "arg#%d expected for bpf_percpu_obj_drop()\n", i); + verbose(env, "%s expected for bpf_percpu_obj_drop()\n", + reg_arg_name(env, argno)); return -EINVAL; } } else { - verbose(env, "arg#%d expected pointer to allocated object\n", i); + verbose(env, "%s expected pointer to allocated object\n", + reg_arg_name(env, argno)); return -EINVAL; } - if (!reg->ref_obj_id) { + if (!reg_is_referenced(env, reg)) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } @@ -12288,10 +12184,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_DYNPTR: { enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR; - int clone_ref_obj_id = 0; - - if (reg->type == CONST_PTR_TO_DYNPTR) - dynptr_arg_type |= MEM_RDONLY; if (is_kfunc_arg_uninit(btf, &args[i])) dynptr_arg_type |= MEM_UNINIT; @@ -12305,11 +12197,10 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) { dynptr_arg_type |= DYNPTR_TYPE_FILE; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_file_discard]) { - dynptr_arg_type |= DYNPTR_TYPE_FILE; - meta->release_regno = regno; + dynptr_arg_type |= DYNPTR_TYPE_FILE | OBJ_RELEASE; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] && (dynptr_arg_type & MEM_UNINIT)) { - enum 
bpf_dynptr_type parent_type = meta->initialized_dynptr.type; + enum bpf_dynptr_type parent_type = meta->dynptr.type; if (parent_type == BPF_DYNPTR_TYPE_INVALID) { verifier_bug(env, "no dynptr type for parent of clone"); @@ -12317,29 +12208,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } dynptr_arg_type |= (unsigned int)get_dynptr_type_flag(parent_type); - clone_ref_obj_id = meta->initialized_dynptr.ref_obj_id; - if (dynptr_type_refcounted(parent_type) && !clone_ref_obj_id) { - verifier_bug(env, "missing ref obj id for parent of clone"); - return -EFAULT; - } } - ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type, clone_ref_obj_id); + ret = process_dynptr_func(env, reg, argno, insn_idx, dynptr_arg_type, + &meta->ref_obj, &meta->dynptr); if (ret < 0) return ret; - - if (!(dynptr_arg_type & MEM_UNINIT)) { - int id = dynptr_id(env, reg); - - if (id < 0) { - verifier_bug(env, "failed to obtain dynptr id"); - return id; - } - meta->initialized_dynptr.id = id; - meta->initialized_dynptr.type = dynptr_get_type(env, reg); - meta->initialized_dynptr.ref_obj_id = dynptr_ref_obj_id(env, reg); - } - break; } case KF_ARG_PTR_TO_ITER: @@ -12349,63 +12223,78 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EINVAL; } } - ret = process_iter_arg(env, regno, insn_idx, meta); + ret = process_iter_arg(env, reg, argno, insn_idx, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_LIST_HEAD: if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { - verbose(env, "arg#%d expected pointer to map value or allocated object\n", i); + verbose(env, "%s expected pointer to map value or allocated object\n", + reg_arg_name(env, argno)); return -EINVAL; } - if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) { + if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && + !reg_is_referenced(env, reg)) { verbose(env, "allocated object must be referenced\n"); return 
-EINVAL; } - ret = process_kf_arg_ptr_to_list_head(env, reg, regno, meta); + ret = process_kf_arg_ptr_to_list_head(env, reg, argno, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_RB_ROOT: if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { - verbose(env, "arg#%d expected pointer to map value or allocated object\n", i); + verbose(env, "%s expected pointer to map value or allocated object\n", + reg_arg_name(env, argno)); return -EINVAL; } - if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) { + if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && + !reg_is_referenced(env, reg)) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } - ret = process_kf_arg_ptr_to_rbtree_root(env, reg, regno, meta); + ret = process_kf_arg_ptr_to_rbtree_root(env, reg, argno, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_LIST_NODE: + if (is_kfunc_arg_nonown_allowed(btf, &args[i]) && + type_is_non_owning_ref(reg->type) && !reg_is_referenced(env, reg)) { + /* Allow bpf_list_front/back return value for + * __nonown_allowed list-node arguments. 
+ */ + goto check_ok; + } if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { - verbose(env, "arg#%d expected pointer to allocated object\n", i); + verbose(env, "%s expected pointer to allocated object\n", + reg_arg_name(env, argno)); return -EINVAL; } - if (!reg->ref_obj_id) { + if (!reg_is_referenced(env, reg)) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } - ret = process_kf_arg_ptr_to_list_node(env, reg, regno, meta); +check_ok: + ret = process_kf_arg_ptr_to_list_node(env, reg, argno, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_RB_NODE: if (is_bpf_rbtree_add_kfunc(meta->func_id)) { if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { - verbose(env, "arg#%d expected pointer to allocated object\n", i); + verbose(env, "%s expected pointer to allocated object\n", + reg_arg_name(env, argno)); return -EINVAL; } - if (!reg->ref_obj_id) { + if (!reg_is_referenced(env, reg)) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } } else { - if (!type_is_non_owning_ref(reg->type) && !reg->ref_obj_id) { + if (!type_is_non_owning_ref(reg->type) && + !reg_is_referenced(env, reg)) { verbose(env, "%s can only take non-owning or refcounted bpf_rb_node pointer\n", func_name); return -EINVAL; } @@ -12415,7 +12304,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } } - ret = process_kf_arg_ptr_to_rbtree_node(env, reg, regno, meta); + ret = process_kf_arg_ptr_to_rbtree_node(env, reg, argno, meta); if (ret < 0) return ret; break; @@ -12430,38 +12319,44 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if ((base_type(reg->type) != PTR_TO_BTF_ID || (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) && !reg2btf_ids[base_type(reg->type)]) { - verbose(env, "arg#%d is %s ", i, reg_type_str(env, reg->type)); + verbose(env, "%s is %s ", reg_arg_name(env, argno), + reg_type_str(env, reg->type)); verbose(env, "expected %s or socket\n", reg_type_str(env, 
base_type(reg->type) | (type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS))); return -EINVAL; } - ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i); + ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i, argno); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_MEM: resolve_ret = btf_resolve_size(btf, ref_t, &type_size); if (IS_ERR(resolve_ret)) { - verbose(env, "arg#%d reference type('%s %s') size cannot be determined: %ld\n", - i, btf_type_str(ref_t), ref_tname, PTR_ERR(resolve_ret)); + verbose(env, "%s reference type('%s %s') size cannot be determined: %ld\n", + reg_arg_name(env, argno), btf_type_str(ref_t), + ref_tname, PTR_ERR(resolve_ret)); return -EINVAL; } - ret = check_mem_reg(env, reg, regno, type_size); + ret = check_mem_reg(env, reg, argno, type_size); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_MEM_SIZE: { - struct bpf_reg_state *buff_reg = &regs[regno]; + struct bpf_reg_state *buff_reg = reg; const struct btf_param *buff_arg = &args[i]; - struct bpf_reg_state *size_reg = &regs[regno + 1]; + struct bpf_reg_state *size_reg = get_func_arg_reg(caller, regs, i + 1); const struct btf_param *size_arg = &args[i + 1]; + argno_t next_argno = argno_from_arg(i + 2); if (!bpf_register_is_null(buff_reg) || !is_kfunc_arg_nullable(meta->btf, buff_arg)) { - ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1); + ret = check_kfunc_mem_size_reg(env, buff_reg, size_reg, + argno, next_argno); if (ret < 0) { - verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1); + verbose(env, "%s and ", reg_arg_name(env, argno)); + verbose(env, "%s memory, len pair leads to invalid memory access\n", + reg_arg_name(env, next_argno)); return ret; } } @@ -12472,7 +12367,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EFAULT; } if (!tnum_is_const(size_reg->var_off)) { - verbose(env, "R%d must be a known constant\n", regno + 1); + verbose(env, "%s 
must be a known constant\n", + reg_arg_name(env, next_argno)); return -EINVAL; } meta->arg_constant.found = true; @@ -12485,14 +12381,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } case KF_ARG_PTR_TO_CALLBACK: if (reg->type != PTR_TO_FUNC) { - verbose(env, "arg%d expected pointer to func\n", i); + verbose(env, "%s expected pointer to func\n", reg_arg_name(env, argno)); return -EINVAL; } meta->subprogno = reg->subprogno; break; case KF_ARG_PTR_TO_REFCOUNTED_KPTR: if (!type_is_ptr_alloc_obj(reg->type)) { - verbose(env, "arg#%d is neither owning or non-owning ref\n", i); + verbose(env, "%s is neither owning or non-owning ref\n", + reg_arg_name(env, argno)); return -EINVAL; } if (!type_is_non_owning_ref(reg->type)) @@ -12505,7 +12402,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } if (rec->refcount_off < 0) { - verbose(env, "arg#%d doesn't point to a type with bpf_refcount field\n", i); + verbose(env, "%s doesn't point to a type with bpf_refcount field\n", + reg_arg_name(env, argno)); return -EINVAL; } @@ -12514,46 +12412,51 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ break; case KF_ARG_PTR_TO_CONST_STR: if (reg->type != PTR_TO_MAP_VALUE) { - verbose(env, "arg#%d doesn't point to a const string\n", i); + verbose(env, "%s doesn't point to a const string\n", + reg_arg_name(env, argno)); return -EINVAL; } - ret = check_reg_const_str(env, reg, regno); + ret = check_arg_const_str(env, reg, argno); if (ret) return ret; break; case KF_ARG_PTR_TO_WORKQUEUE: if (reg->type != PTR_TO_MAP_VALUE) { - verbose(env, "arg#%d doesn't point to a map value\n", i); + verbose(env, "%s doesn't point to a map value\n", + reg_arg_name(env, argno)); return -EINVAL; } - ret = check_map_field_pointer(env, regno, BPF_WORKQUEUE, &meta->map); + ret = check_map_field_pointer(env, reg, argno, BPF_WORKQUEUE, &meta->map); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_TIMER: if 
(reg->type != PTR_TO_MAP_VALUE) { - verbose(env, "arg#%d doesn't point to a map value\n", i); + verbose(env, "%s doesn't point to a map value\n", + reg_arg_name(env, argno)); return -EINVAL; } - ret = process_timer_kfunc(env, regno, meta); + ret = process_timer_kfunc(env, reg, argno, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_TASK_WORK: if (reg->type != PTR_TO_MAP_VALUE) { - verbose(env, "arg#%d doesn't point to a map value\n", i); + verbose(env, "%s doesn't point to a map value\n", + reg_arg_name(env, argno)); return -EINVAL; } - ret = check_map_field_pointer(env, regno, BPF_TASK_WORK, &meta->map); + ret = check_map_field_pointer(env, reg, argno, BPF_TASK_WORK, &meta->map); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_IRQ_FLAG: if (reg->type != PTR_TO_STACK) { - verbose(env, "arg#%d doesn't point to an irq flag on stack\n", i); + verbose(env, "%s doesn't point to an irq flag on stack\n", + reg_arg_name(env, argno)); return -EINVAL; } - ret = process_irq_flag(env, regno, meta); + ret = process_irq_flag(env, reg, argno, meta); if (ret < 0) return ret; break; @@ -12562,7 +12465,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ int flags = PROCESS_RES_LOCK; if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { - verbose(env, "arg#%d doesn't point to map value or allocated object\n", i); + verbose(env, "%s doesn't point to map value or allocated object\n", + reg_arg_name(env, argno)); return -EINVAL; } @@ -12574,7 +12478,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] || meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) flags |= PROCESS_LOCK_IRQ; - ret = process_spin_lock(env, regno, flags); + ret = process_spin_lock(env, reg, argno, flags); if (ret < 0) return ret; break; @@ -12582,12 +12486,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, 
struct bpf_kfunc_call_ } } - if (is_kfunc_release(meta) && !meta->release_regno) { - verbose(env, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n", - func_name); - return -EINVAL; - } - return 0; } @@ -12614,6 +12512,10 @@ int bpf_fetch_kfunc_arg_meta(struct bpf_verifier_env *env, meta->kfunc_flags = *kfunc.flags; + /* Only support release referenced argument passed by register */ + if (is_kfunc_release(meta)) + meta->release_regno = BPF_REG_1; + return 0; } @@ -12943,7 +12845,7 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca } } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice] || meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) { - enum bpf_type_flag type_flag = get_dynptr_type_flag(meta->initialized_dynptr.type); + enum bpf_type_flag type_flag = get_dynptr_type_flag(meta->dynptr.type); mark_reg_known_zero(env, regs, BPF_REG_0); @@ -12967,16 +12869,11 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca } } - if (!meta->initialized_dynptr.id) { + if (!meta->dynptr.id) { verifier_bug(env, "no dynptr id"); return -EFAULT; } - regs[BPF_REG_0].dynptr_id = meta->initialized_dynptr.id; - - /* we don't need to set BPF_REG_0's ref obj id - * because packet slices are not refcounted (see - * dynptr_type_refcounted) - */ + regs[BPF_REG_0].parent_id = meta->dynptr.id; } else { return 0; } @@ -12990,7 +12887,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { bool sleepable, rcu_lock, rcu_unlock, preempt_disable, preempt_enable; - u32 i, nargs, ptr_type_id, release_ref_obj_id; + enum bpf_prog_type prog_type = resolve_prog_type(env->prog); struct bpf_reg_state *regs = cur_regs(env); const char *func_name, *ptr_type_name; const struct btf_type *t, *ptr_type; @@ -12998,7 +12895,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn_aux_data *insn_aux; int err, insn_idx 
= *insn_idx_p; const struct btf_param *args; + u32 i, nargs, ptr_type_id; struct btf *desc_btf; + int id; /* skip for now, but return error when we find this in fixup_kfunc_call */ if (!insn->imm) @@ -13065,6 +12964,21 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (err < 0) return err; + if ((is_bpf_obj_drop_kfunc(meta.func_id) || + is_bpf_percpu_obj_drop_kfunc(meta.func_id)) && (is_tracing_prog_type(prog_type) || + /* is_tracing_prog_type() for now doesn't cover non-iterator tracing progs. */ + (prog_type == BPF_PROG_TYPE_TRACING && env->prog->expected_attach_type != BPF_TRACE_ITER + && !env->prog->sleepable))) { + struct btf_struct_meta *struct_meta; + + struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id); + if (struct_meta && btf_record_has_nmi_unsafe_fields(struct_meta->record)) { + verbose(env, "%s cannot be used in tracing programs on types with NMI unsafe fields\n", + func_name); + return -EINVAL; + } + } + if (is_bpf_rbtree_add_kfunc(meta.func_id)) { err = push_callback_call(env, insn, insn_idx, meta.subprogno, set_rbtree_add_callback_state); @@ -13109,22 +13023,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (rcu_lock) { env->cur_state->active_rcu_locks++; } else if (rcu_unlock) { - struct bpf_func_state *state; - struct bpf_reg_state *reg; - u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER); - if (env->cur_state->active_rcu_locks == 0) { verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name); return -EINVAL; } - if (--env->cur_state->active_rcu_locks == 0) { - bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, clear_mask, ({ - if (reg->type & MEM_RCU) { - reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL); - reg->type |= PTR_UNTRUSTED; - } - })); - } + if (--env->cur_state->active_rcu_locks == 0) + invalidate_rcu_protected_refs(env); } else if (preempt_disable) { env->cur_state->active_preempt_locks++; } else if (preempt_enable) { @@ 
-13155,37 +13059,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now. */ if (meta.release_regno) { - struct bpf_reg_state *reg = &regs[meta.release_regno]; - - if (meta.initialized_dynptr.ref_obj_id) { - err = unmark_stack_slots_dynptr(env, reg); - } else { - err = release_reference(env, reg->ref_obj_id); - if (err) - verbose(env, "kfunc %s#%d reference has not been acquired before\n", - func_name, meta.func_id); - } + err = release_reg(env, &regs[meta.release_regno], false, !!meta.dynptr.id); if (err) return err; } if (is_bpf_list_push_kfunc(meta.func_id) || is_bpf_rbtree_add_kfunc(meta.func_id)) { - release_ref_obj_id = regs[BPF_REG_2].ref_obj_id; + id = regs[BPF_REG_2].id; insn_aux->insert_off = regs[BPF_REG_2].var_off.value; insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id); - err = ref_convert_owning_non_owning(env, release_ref_obj_id); - if (err) { - verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n", - func_name, meta.func_id); - return err; - } - - err = release_reference(env, release_ref_obj_id); - if (err) { - verbose(env, "kfunc %s#%d reference has not been acquired before\n", - func_name, meta.func_id); - return err; - } + ref_convert_owning_non_owning(env, id); } if (meta.func_id == special_kfunc_list[KF_bpf_throw]) { @@ -13212,6 +13095,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, bpf_mark_reg_not_init(env, &regs[regno]); regs[regno].subreg_def = DEF_NOT_SUBREG; } + invalidate_outgoing_stack_args(env, cur_func(env)); /* Check return type */ t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL); @@ -13269,8 +13153,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].type |= MEM_RDONLY; /* Ensures we don't access the memory after a release_reference() */ - if (meta.ref_obj_id) - regs[BPF_REG_0].ref_obj_id = 
meta.ref_obj_id; + if (meta.ref_obj.id) { + err = validate_ref_obj(env, &meta.ref_obj); + if (err) + return err; + regs[BPF_REG_0].parent_id = meta.ref_obj.id; + } if (is_kfunc_rcu_protected(&meta)) regs[BPF_REG_0].type |= MEM_RCU; @@ -13316,13 +13204,10 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *)); if (is_kfunc_acquire(&meta)) { - int id = acquire_reference(env, insn_idx); - + id = acquire_reference(env, insn_idx, 0); if (id < 0) return id; - if (is_kfunc_ret_null(&meta)) - regs[BPF_REG_0].id = id; - regs[BPF_REG_0].ref_obj_id = id; + regs[BPF_REG_0].id = id; } else if (is_rbtree_node_type(ptr_type) || is_list_node_type(ptr_type)) { ref_set_non_owning(env, &regs[BPF_REG_0]); } @@ -13344,8 +13229,18 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, clear_all_pkt_pointers(env); nargs = btf_type_vlen(meta.func_proto); + if (nargs > MAX_BPF_FUNC_REG_ARGS) { + struct bpf_func_state *caller = cur_func(env); + struct bpf_subprog_info *caller_info = &env->subprog_info[caller->subprogno]; + u16 out_stack_arg_cnt = nargs - MAX_BPF_FUNC_REG_ARGS; + u16 stack_arg_cnt = bpf_in_stack_arg_cnt(caller_info) + out_stack_arg_cnt; + + if (stack_arg_cnt > caller_info->stack_arg_cnt) + caller_info->stack_arg_cnt = stack_arg_cnt; + } + args = (const struct btf_param *)(meta.func_proto + 1); - for (i = 0; i < nargs; i++) { + for (i = 0; i < min_t(int, nargs, MAX_BPF_FUNC_REG_ARGS); i++) { u32 regno = i + 1; t = btf_type_skip_modifiers(desc_btf, args[i].type, NULL); @@ -13377,7 +13272,7 @@ static bool check_reg_sane_offset_scalar(struct bpf_verifier_env *env, { bool known = tnum_is_const(reg->var_off); s64 val = reg->var_off.value; - s64 smin = reg->smin_value; + s64 smin = reg_smin(reg); if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { verbose(env, "math between %s pointer and %lld is not allowed\n", @@ -13406,7 +13301,7 @@ static bool 
check_reg_sane_offset_ptr(struct bpf_verifier_env *env, { bool known = tnum_is_const(reg->var_off); s64 val = reg->var_off.value; - s64 smin = reg->smin_value; + s64 smin = reg_smin(reg); if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { verbose(env, "%s pointer offset %lld is not allowed\n", @@ -13448,7 +13343,7 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, break; case PTR_TO_MAP_VALUE: max = ptr_reg->map_ptr->value_size; - ptr_limit = mask_to_left ? ptr_reg->smin_value : ptr_reg->umax_value; + ptr_limit = mask_to_left ? reg_smin(ptr_reg) : reg_umax(ptr_reg); break; default: return REASON_TYPE; @@ -13537,7 +13432,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : &info->aux; struct bpf_verifier_state *vstate = env->cur_state; bool off_is_imm = tnum_is_const(off_reg->var_off); - bool off_is_neg = off_reg->smin_value < 0; + bool off_is_neg = reg_smin(off_reg) < 0; bool ptr_is_dst_reg = ptr_reg == dst_reg; u8 opcode = BPF_OP(insn->code); u32 alu_state, alu_limit; @@ -13556,7 +13451,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, if (!commit_window) { if (!tnum_is_const(off_reg->var_off) && - (off_reg->smin_value < 0) != (off_reg->smax_value < 0)) + (reg_smin(off_reg) < 0) != (reg_smax(off_reg) < 0)) return REASON_BOUNDS; info->mask_to_left = (opcode == BPF_ADD && off_is_neg) || @@ -13612,7 +13507,7 @@ do_sim: */ if (!ptr_is_dst_reg) { tmp = *dst_reg; - copy_register_state(dst_reg, ptr_reg); + *dst_reg = *ptr_reg; } err = sanitize_speculative_path(env, NULL, env->insn_idx + 1, env->insn_idx); if (err < 0) @@ -13706,7 +13601,7 @@ static int check_stack_access_for_ptr_arithmetic( static int sanitize_check_bounds(struct bpf_verifier_env *env, const struct bpf_insn *insn, - const struct bpf_reg_state *dst_reg) + struct bpf_reg_state *dst_reg) { u32 dst = insn->dst_reg; @@ -13723,7 +13618,7 @@ static int sanitize_check_bounds(struct 
bpf_verifier_env *env, return -EACCES; break; case PTR_TO_MAP_VALUE: - if (check_map_access(env, dst, 0, 1, false, ACCESS_HELPER)) { + if (check_map_access(env, dst_reg, argno_from_reg(dst), 0, 1, false, ACCESS_HELPER)) { verbose(env, "R%d pointer arithmetic of map value goes out of range, " "prohibited for !root\n", dst); return -EACCES; @@ -13750,10 +13645,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs, *dst_reg; bool known = tnum_is_const(off_reg->var_off); - s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value, - smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; - u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value, - umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value; + s64 smin_val = reg_smin(off_reg), smax_val = reg_smax(off_reg); + u64 umin_val = reg_umin(off_reg), umax_val = reg_umax(off_reg); struct bpf_sanitize_info info = {}; u8 opcode = BPF_OP(insn->code); u32 dst = insn->dst_reg; @@ -13855,16 +13748,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, * added into the variable offset, and we copy the fixed offset * from ptr_reg. 
*/ - if (check_add_overflow(smin_ptr, smin_val, &dst_reg->smin_value) || - check_add_overflow(smax_ptr, smax_val, &dst_reg->smax_value)) { - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } - if (check_add_overflow(umin_ptr, umin_val, &dst_reg->umin_value) || - check_add_overflow(umax_ptr, umax_val, &dst_reg->umax_value)) { - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; - } + dst_reg->r64 = cnum64_add(ptr_reg->r64, off_reg->r64); dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); dst_reg->raw = ptr_reg->raw; if (reg_is_pkt_pointer(ptr_reg)) { @@ -13896,24 +13780,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst); return -EACCES; } - /* A new variable offset is created. If the subtrahend is known - * nonnegative, then any reg->range we had before is still good. - */ - if (check_sub_overflow(smin_ptr, smax_val, &dst_reg->smin_value) || - check_sub_overflow(smax_ptr, smin_val, &dst_reg->smax_value)) { - /* Overflow possible, we know nothing */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } - if (umin_ptr < umax_val) { - /* Overflow possible, we know nothing */ - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; - } else { - /* Cannot overflow (as long as bounds are consistent) */ - dst_reg->umin_value = umin_ptr - umax_val; - dst_reg->umax_value = umax_ptr - umin_val; - } + dst_reg->r64 = cnum64_add(ptr_reg->r64, cnum64_negate(off_reg->r64)); dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); dst_reg->raw = ptr_reg->raw; if (reg_is_pkt_pointer(ptr_reg)) { @@ -13970,227 +13837,123 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, static void scalar32_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s32 *dst_smin = &dst_reg->s32_min_value; - s32 *dst_smax = &dst_reg->s32_max_value; - u32 *dst_umin = &dst_reg->u32_min_value; - u32 *dst_umax = &dst_reg->u32_max_value; - u32 umin_val = src_reg->u32_min_value; - 
u32 umax_val = src_reg->u32_max_value; - bool min_overflow, max_overflow; - - if (check_add_overflow(*dst_smin, src_reg->s32_min_value, dst_smin) || - check_add_overflow(*dst_smax, src_reg->s32_max_value, dst_smax)) { - *dst_smin = S32_MIN; - *dst_smax = S32_MAX; - } - - /* If either all additions overflow or no additions overflow, then - * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax = - * dst_umax + src_umax. Otherwise (some additions overflow), set - * the output bounds to unbounded. - */ - min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin); - max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax); - - if (!min_overflow && max_overflow) { - *dst_umin = 0; - *dst_umax = U32_MAX; - } + dst_reg->r32 = cnum32_add(dst_reg->r32, src_reg->r32); } static void scalar_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s64 *dst_smin = &dst_reg->smin_value; - s64 *dst_smax = &dst_reg->smax_value; - u64 *dst_umin = &dst_reg->umin_value; - u64 *dst_umax = &dst_reg->umax_value; - u64 umin_val = src_reg->umin_value; - u64 umax_val = src_reg->umax_value; - bool min_overflow, max_overflow; - - if (check_add_overflow(*dst_smin, src_reg->smin_value, dst_smin) || - check_add_overflow(*dst_smax, src_reg->smax_value, dst_smax)) { - *dst_smin = S64_MIN; - *dst_smax = S64_MAX; - } - - /* If either all additions overflow or no additions overflow, then - * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax = - * dst_umax + src_umax. Otherwise (some additions overflow), set - * the output bounds to unbounded. 
- */ - min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin); - max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax); - - if (!min_overflow && max_overflow) { - *dst_umin = 0; - *dst_umax = U64_MAX; - } + dst_reg->r64 = cnum64_add(dst_reg->r64, src_reg->r64); } static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s32 *dst_smin = &dst_reg->s32_min_value; - s32 *dst_smax = &dst_reg->s32_max_value; - u32 *dst_umin = &dst_reg->u32_min_value; - u32 *dst_umax = &dst_reg->u32_max_value; - u32 umin_val = src_reg->u32_min_value; - u32 umax_val = src_reg->u32_max_value; - bool min_underflow, max_underflow; - - if (check_sub_overflow(*dst_smin, src_reg->s32_max_value, dst_smin) || - check_sub_overflow(*dst_smax, src_reg->s32_min_value, dst_smax)) { - /* Overflow possible, we know nothing */ - *dst_smin = S32_MIN; - *dst_smax = S32_MAX; - } - - /* If either all subtractions underflow or no subtractions - * underflow, it is okay to set: dst_umin = dst_umin - src_umax, - * dst_umax = dst_umax - src_umin. Otherwise (some subtractions - * underflow), set the output bounds to unbounded. 
- */ - min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin); - max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax); - - if (min_underflow && !max_underflow) { - *dst_umin = 0; - *dst_umax = U32_MAX; - } + dst_reg->r32 = cnum32_add(dst_reg->r32, cnum32_negate(src_reg->r32)); } static void scalar_min_max_sub(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s64 *dst_smin = &dst_reg->smin_value; - s64 *dst_smax = &dst_reg->smax_value; - u64 *dst_umin = &dst_reg->umin_value; - u64 *dst_umax = &dst_reg->umax_value; - u64 umin_val = src_reg->umin_value; - u64 umax_val = src_reg->umax_value; - bool min_underflow, max_underflow; - - if (check_sub_overflow(*dst_smin, src_reg->smax_value, dst_smin) || - check_sub_overflow(*dst_smax, src_reg->smin_value, dst_smax)) { - /* Overflow possible, we know nothing */ - *dst_smin = S64_MIN; - *dst_smax = S64_MAX; - } - - /* If either all subtractions underflow or no subtractions - * underflow, it is okay to set: dst_umin = dst_umin - src_umax, - * dst_umax = dst_umax - src_umin. Otherwise (some subtractions - * underflow), set the output bounds to unbounded. 
- */ - min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin); - max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax); - - if (min_underflow && !max_underflow) { - *dst_umin = 0; - *dst_umax = U64_MAX; - } + dst_reg->r64 = cnum64_add(dst_reg->r64, cnum64_negate(src_reg->r64)); } static void scalar32_min_max_mul(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s32 *dst_smin = &dst_reg->s32_min_value; - s32 *dst_smax = &dst_reg->s32_max_value; - u32 *dst_umin = &dst_reg->u32_min_value; - u32 *dst_umax = &dst_reg->u32_max_value; + s32 smin = reg_s32_min(dst_reg); + s32 smax = reg_s32_max(dst_reg); + u32 umin = reg_u32_min(dst_reg); + u32 umax = reg_u32_max(dst_reg); s32 tmp_prod[4]; - if (check_mul_overflow(*dst_umax, src_reg->u32_max_value, dst_umax) || - check_mul_overflow(*dst_umin, src_reg->u32_min_value, dst_umin)) { + if (check_mul_overflow(umax, reg_u32_max(src_reg), &umax) || + check_mul_overflow(umin, reg_u32_min(src_reg), &umin)) { /* Overflow possible, we know nothing */ - *dst_umin = 0; - *dst_umax = U32_MAX; + umin = 0; + umax = U32_MAX; } - if (check_mul_overflow(*dst_smin, src_reg->s32_min_value, &tmp_prod[0]) || - check_mul_overflow(*dst_smin, src_reg->s32_max_value, &tmp_prod[1]) || - check_mul_overflow(*dst_smax, src_reg->s32_min_value, &tmp_prod[2]) || - check_mul_overflow(*dst_smax, src_reg->s32_max_value, &tmp_prod[3])) { + if (check_mul_overflow(smin, reg_s32_min(src_reg), &tmp_prod[0]) || + check_mul_overflow(smin, reg_s32_max(src_reg), &tmp_prod[1]) || + check_mul_overflow(smax, reg_s32_min(src_reg), &tmp_prod[2]) || + check_mul_overflow(smax, reg_s32_max(src_reg), &tmp_prod[3])) { /* Overflow possible, we know nothing */ - *dst_smin = S32_MIN; - *dst_smax = S32_MAX; + smin = S32_MIN; + smax = S32_MAX; } else { - *dst_smin = min_array(tmp_prod, 4); - *dst_smax = max_array(tmp_prod, 4); + smin = min_array(tmp_prod, 4); + smax = max_array(tmp_prod, 4); } + + dst_reg->r32 = 
cnum32_intersect(cnum32_from_urange(umin, umax), + cnum32_from_srange(smin, smax)); } static void scalar_min_max_mul(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s64 *dst_smin = &dst_reg->smin_value; - s64 *dst_smax = &dst_reg->smax_value; - u64 *dst_umin = &dst_reg->umin_value; - u64 *dst_umax = &dst_reg->umax_value; + s64 smin = reg_smin(dst_reg); + s64 smax = reg_smax(dst_reg); + u64 umin = reg_umin(dst_reg); + u64 umax = reg_umax(dst_reg); s64 tmp_prod[4]; - if (check_mul_overflow(*dst_umax, src_reg->umax_value, dst_umax) || - check_mul_overflow(*dst_umin, src_reg->umin_value, dst_umin)) { + if (check_mul_overflow(umax, reg_umax(src_reg), &umax) || + check_mul_overflow(umin, reg_umin(src_reg), &umin)) { /* Overflow possible, we know nothing */ - *dst_umin = 0; - *dst_umax = U64_MAX; + umin = 0; + umax = U64_MAX; } - if (check_mul_overflow(*dst_smin, src_reg->smin_value, &tmp_prod[0]) || - check_mul_overflow(*dst_smin, src_reg->smax_value, &tmp_prod[1]) || - check_mul_overflow(*dst_smax, src_reg->smin_value, &tmp_prod[2]) || - check_mul_overflow(*dst_smax, src_reg->smax_value, &tmp_prod[3])) { + if (check_mul_overflow(smin, reg_smin(src_reg), &tmp_prod[0]) || + check_mul_overflow(smin, reg_smax(src_reg), &tmp_prod[1]) || + check_mul_overflow(smax, reg_smin(src_reg), &tmp_prod[2]) || + check_mul_overflow(smax, reg_smax(src_reg), &tmp_prod[3])) { /* Overflow possible, we know nothing */ - *dst_smin = S64_MIN; - *dst_smax = S64_MAX; + smin = S64_MIN; + smax = S64_MAX; } else { - *dst_smin = min_array(tmp_prod, 4); - *dst_smax = max_array(tmp_prod, 4); + smin = min_array(tmp_prod, 4); + smax = max_array(tmp_prod, 4); } + + dst_reg->r64 = cnum64_intersect(cnum64_from_urange(umin, umax), + cnum64_from_srange(smin, smax)); } static void scalar32_min_max_udiv(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u32 *dst_umin = &dst_reg->u32_min_value; - u32 *dst_umax = &dst_reg->u32_max_value; - u32 src_val = src_reg->u32_min_value; /* 
non-zero, const divisor */ + u32 src_val = reg_u32_min(src_reg); /* non-zero, const divisor */ - *dst_umin = *dst_umin / src_val; - *dst_umax = *dst_umax / src_val; + reg_set_urange32(dst_reg, reg_u32_min(dst_reg) / src_val, + reg_u32_max(dst_reg) / src_val); /* Reset other ranges/tnum to unbounded/unknown. */ - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; reset_reg64_and_tnum(dst_reg); } static void scalar_min_max_udiv(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u64 *dst_umin = &dst_reg->umin_value; - u64 *dst_umax = &dst_reg->umax_value; - u64 src_val = src_reg->umin_value; /* non-zero, const divisor */ + u64 src_val = reg_umin(src_reg); /* non-zero, const divisor */ - *dst_umin = div64_u64(*dst_umin, src_val); - *dst_umax = div64_u64(*dst_umax, src_val); + reg_set_urange64(dst_reg, div64_u64(reg_umin(dst_reg), src_val), + div64_u64(reg_umax(dst_reg), src_val)); /* Reset other ranges/tnum to unbounded/unknown. */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; reset_reg32_and_tnum(dst_reg); } static void scalar32_min_max_sdiv(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s32 *dst_smin = &dst_reg->s32_min_value; - s32 *dst_smax = &dst_reg->s32_max_value; - s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */ + s32 smin = reg_s32_min(dst_reg); + s32 smax = reg_s32_max(dst_reg); + s32 src_val = reg_s32_min(src_reg); /* non-zero, const divisor */ s32 res1, res2; /* BPF div specification: S32_MIN / -1 = S32_MIN */ - if (*dst_smin == S32_MIN && src_val == -1) { + if (smin == S32_MIN && src_val == -1) { /* * If the dividend range contains more than just S32_MIN, * we cannot precisely track the result, so it becomes unbounded. @@ -14199,35 +13962,34 @@ static void scalar32_min_max_sdiv(struct bpf_reg_state *dst_reg, * = {S32_MIN} U [S32_MAX-9, S32_MAX] = [S32_MIN, S32_MAX] * Otherwise (if dividend is exactly S32_MIN), result remains S32_MIN. 
*/ - if (*dst_smax != S32_MIN) { - *dst_smin = S32_MIN; - *dst_smax = S32_MAX; + if (smax != S32_MIN) { + smin = S32_MIN; + smax = S32_MAX; } goto reset; } - res1 = *dst_smin / src_val; - res2 = *dst_smax / src_val; - *dst_smin = min(res1, res2); - *dst_smax = max(res1, res2); + res1 = smin / src_val; + res2 = smax / src_val; + smin = min(res1, res2); + smax = max(res1, res2); reset: + reg_set_srange32(dst_reg, smin, smax); /* Reset other ranges/tnum to unbounded/unknown. */ - dst_reg->u32_min_value = 0; - dst_reg->u32_max_value = U32_MAX; reset_reg64_and_tnum(dst_reg); } static void scalar_min_max_sdiv(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s64 *dst_smin = &dst_reg->smin_value; - s64 *dst_smax = &dst_reg->smax_value; - s64 src_val = src_reg->smin_value; /* non-zero, const divisor */ + s64 smin = reg_smin(dst_reg); + s64 smax = reg_smax(dst_reg); + s64 src_val = reg_smin(src_reg); /* non-zero, const divisor */ s64 res1, res2; /* BPF div specification: S64_MIN / -1 = S64_MIN */ - if (*dst_smin == S64_MIN && src_val == -1) { + if (smin == S64_MIN && src_val == -1) { /* * If the dividend range contains more than just S64_MIN, * we cannot precisely track the result, so it becomes unbounded. @@ -14236,79 +13998,66 @@ static void scalar_min_max_sdiv(struct bpf_reg_state *dst_reg, * = {S64_MIN} U [S64_MAX-9, S64_MAX] = [S64_MIN, S64_MAX] * Otherwise (if dividend is exactly S64_MIN), result remains S64_MIN. */ - if (*dst_smax != S64_MIN) { - *dst_smin = S64_MIN; - *dst_smax = S64_MAX; + if (smax != S64_MIN) { + smin = S64_MIN; + smax = S64_MAX; } goto reset; } - res1 = div64_s64(*dst_smin, src_val); - res2 = div64_s64(*dst_smax, src_val); - *dst_smin = min(res1, res2); - *dst_smax = max(res1, res2); + res1 = div64_s64(smin, src_val); + res2 = div64_s64(smax, src_val); + smin = min(res1, res2); + smax = max(res1, res2); reset: + reg_set_srange64(dst_reg, smin, smax); /* Reset other ranges/tnum to unbounded/unknown. 
*/ - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; reset_reg32_and_tnum(dst_reg); } static void scalar32_min_max_umod(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u32 *dst_umin = &dst_reg->u32_min_value; - u32 *dst_umax = &dst_reg->u32_max_value; - u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */ + u32 src_val = reg_u32_min(src_reg); /* non-zero, const divisor */ u32 res_max = src_val - 1; /* * If dst_umax <= res_max, the result remains unchanged. * e.g., [2, 5] % 10 = [2, 5]. */ - if (*dst_umax <= res_max) + if (reg_u32_max(dst_reg) <= res_max) return; - *dst_umin = 0; - *dst_umax = min(*dst_umax, res_max); + reg_set_urange32(dst_reg, 0, min(reg_u32_max(dst_reg), res_max)); /* Reset other ranges/tnum to unbounded/unknown. */ - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; reset_reg64_and_tnum(dst_reg); } static void scalar_min_max_umod(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u64 *dst_umin = &dst_reg->umin_value; - u64 *dst_umax = &dst_reg->umax_value; - u64 src_val = src_reg->umin_value; /* non-zero, const divisor */ + u64 src_val = reg_umin(src_reg); /* non-zero, const divisor */ u64 res_max = src_val - 1; /* * If dst_umax <= res_max, the result remains unchanged. * e.g., [2, 5] % 10 = [2, 5]. */ - if (*dst_umax <= res_max) + if (reg_umax(dst_reg) <= res_max) return; - *dst_umin = 0; - *dst_umax = min(*dst_umax, res_max); + reg_set_urange64(dst_reg, 0, min(reg_umax(dst_reg), res_max)); /* Reset other ranges/tnum to unbounded/unknown. 
*/ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; reset_reg32_and_tnum(dst_reg); } static void scalar32_min_max_smod(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s32 *dst_smin = &dst_reg->s32_min_value; - s32 *dst_smax = &dst_reg->s32_max_value; - s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */ + s32 src_val = reg_s32_min(src_reg); /* non-zero, const divisor */ /* * Safe absolute value calculation: @@ -14328,33 +14077,26 @@ static void scalar32_min_max_smod(struct bpf_reg_state *dst_reg, * If the dividend is already within the result range, * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5]. */ - if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs) + if (reg_s32_min(dst_reg) >= -res_max_abs && reg_s32_max(dst_reg) <= res_max_abs) return; /* General case: result has the same sign as the dividend. */ - if (*dst_smin >= 0) { - *dst_smin = 0; - *dst_smax = min(*dst_smax, res_max_abs); - } else if (*dst_smax <= 0) { - *dst_smax = 0; - *dst_smin = max(*dst_smin, -res_max_abs); + if (reg_s32_min(dst_reg) >= 0) { + reg_set_srange32(dst_reg, 0, min(reg_s32_max(dst_reg), res_max_abs)); + } else if (reg_s32_max(dst_reg) <= 0) { + reg_set_srange32(dst_reg, max(reg_s32_min(dst_reg), -res_max_abs), 0); } else { - *dst_smin = -res_max_abs; - *dst_smax = res_max_abs; + reg_set_srange32(dst_reg, -res_max_abs, res_max_abs); } /* Reset other ranges/tnum to unbounded/unknown. 
*/ - dst_reg->u32_min_value = 0; - dst_reg->u32_max_value = U32_MAX; reset_reg64_and_tnum(dst_reg); } static void scalar_min_max_smod(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s64 *dst_smin = &dst_reg->smin_value; - s64 *dst_smax = &dst_reg->smax_value; - s64 src_val = src_reg->smin_value; /* non-zero, const divisor */ + s64 src_val = reg_smin(src_reg); /* non-zero, const divisor */ /* * Safe absolute value calculation: @@ -14374,24 +14116,19 @@ static void scalar_min_max_smod(struct bpf_reg_state *dst_reg, * If the dividend is already within the result range, * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5]. */ - if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs) + if (reg_smin(dst_reg) >= -res_max_abs && reg_smax(dst_reg) <= res_max_abs) return; /* General case: result has the same sign as the dividend. */ - if (*dst_smin >= 0) { - *dst_smin = 0; - *dst_smax = min(*dst_smax, res_max_abs); - } else if (*dst_smax <= 0) { - *dst_smax = 0; - *dst_smin = max(*dst_smin, -res_max_abs); + if (reg_smin(dst_reg) >= 0) { + reg_set_srange64(dst_reg, 0, min(reg_smax(dst_reg), res_max_abs)); + } else if (reg_smax(dst_reg) <= 0) { + reg_set_srange64(dst_reg, max(reg_smin(dst_reg), -res_max_abs), 0); } else { - *dst_smin = -res_max_abs; - *dst_smax = res_max_abs; + reg_set_srange64(dst_reg, -res_max_abs, res_max_abs); } /* Reset other ranges/tnum to unbounded/unknown. 
*/ - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; reset_reg32_and_tnum(dst_reg); } @@ -14401,7 +14138,7 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); - u32 umax_val = src_reg->u32_max_value; + u32 umax_val = reg_u32_max(src_reg); if (src_known && dst_known) { __mark_reg32_known(dst_reg, var32_off.value); @@ -14411,19 +14148,9 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, /* We get our minimum from the var_off, since that's inherently * bitwise. Our maximum is the minimum of the operands' maxima. */ - dst_reg->u32_min_value = var32_off.value; - dst_reg->u32_max_value = min(dst_reg->u32_max_value, umax_val); - - /* Safe to set s32 bounds by casting u32 result into s32 when u32 - * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. - */ - if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { - dst_reg->s32_min_value = dst_reg->u32_min_value; - dst_reg->s32_max_value = dst_reg->u32_max_value; - } else { - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; - } + reg_set_urange32(dst_reg, + var32_off.value, + min(reg_u32_max(dst_reg), umax_val)); } static void scalar_min_max_and(struct bpf_reg_state *dst_reg, @@ -14431,7 +14158,7 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg, { bool src_known = tnum_is_const(src_reg->var_off); bool dst_known = tnum_is_const(dst_reg->var_off); - u64 umax_val = src_reg->umax_value; + u64 umax_val = reg_umax(src_reg); if (src_known && dst_known) { __mark_reg_known(dst_reg, dst_reg->var_off.value); @@ -14441,19 +14168,10 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg, /* We get our minimum from the var_off, since that's inherently * bitwise. Our maximum is the minimum of the operands' maxima. 
*/ - dst_reg->umin_value = dst_reg->var_off.value; - dst_reg->umax_value = min(dst_reg->umax_value, umax_val); + reg_set_urange64(dst_reg, + dst_reg->var_off.value, + min(reg_umax(dst_reg), umax_val)); - /* Safe to set s64 bounds by casting u64 result into s64 when u64 - * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. - */ - if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { - dst_reg->smin_value = dst_reg->umin_value; - dst_reg->smax_value = dst_reg->umax_value; - } else { - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); } @@ -14464,7 +14182,7 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); - u32 umin_val = src_reg->u32_min_value; + u32 umin_val = reg_u32_min(src_reg); if (src_known && dst_known) { __mark_reg32_known(dst_reg, var32_off.value); @@ -14474,19 +14192,9 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, /* We get our maximum from the var_off, and our minimum is the * maximum of the operands' minima */ - dst_reg->u32_min_value = max(dst_reg->u32_min_value, umin_val); - dst_reg->u32_max_value = var32_off.value | var32_off.mask; - - /* Safe to set s32 bounds by casting u32 result into s32 when u32 - * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. 
- */ - if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { - dst_reg->s32_min_value = dst_reg->u32_min_value; - dst_reg->s32_max_value = dst_reg->u32_max_value; - } else { - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; - } + reg_set_urange32(dst_reg, + max(reg_u32_min(dst_reg), umin_val), + var32_off.value | var32_off.mask); } static void scalar_min_max_or(struct bpf_reg_state *dst_reg, @@ -14494,7 +14202,7 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg, { bool src_known = tnum_is_const(src_reg->var_off); bool dst_known = tnum_is_const(dst_reg->var_off); - u64 umin_val = src_reg->umin_value; + u64 umin_val = reg_umin(src_reg); if (src_known && dst_known) { __mark_reg_known(dst_reg, dst_reg->var_off.value); @@ -14504,19 +14212,10 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg, /* We get our maximum from the var_off, and our minimum is the * maximum of the operands' minima */ - dst_reg->umin_value = max(dst_reg->umin_value, umin_val); - dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask; + reg_set_urange64(dst_reg, + max(reg_umin(dst_reg), umin_val), + dst_reg->var_off.value | dst_reg->var_off.mask); - /* Safe to set s64 bounds by casting u64 result into s64 when u64 - * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. - */ - if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { - dst_reg->smin_value = dst_reg->umin_value; - dst_reg->smax_value = dst_reg->umax_value; - } else { - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); } @@ -14534,19 +14233,7 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg, } /* We get both minimum and maximum from the var32_off. 
*/ - dst_reg->u32_min_value = var32_off.value; - dst_reg->u32_max_value = var32_off.value | var32_off.mask; - - /* Safe to set s32 bounds by casting u32 result into s32 when u32 - * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. - */ - if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { - dst_reg->s32_min_value = dst_reg->u32_min_value; - dst_reg->s32_max_value = dst_reg->u32_max_value; - } else { - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; - } + reg_set_urange32(dst_reg, var32_off.value, var32_off.value | var32_off.mask); } static void scalar_min_max_xor(struct bpf_reg_state *dst_reg, @@ -14562,46 +14249,30 @@ static void scalar_min_max_xor(struct bpf_reg_state *dst_reg, } /* We get both minimum and maximum from the var_off. */ - dst_reg->umin_value = dst_reg->var_off.value; - dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask; - - /* Safe to set s64 bounds by casting u64 result into s64 when u64 - * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. 
- */ - if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { - dst_reg->smin_value = dst_reg->umin_value; - dst_reg->smax_value = dst_reg->umax_value; - } else { - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } - - __update_reg_bounds(dst_reg); + reg_set_urange64(dst_reg, + dst_reg->var_off.value, + dst_reg->var_off.value | dst_reg->var_off.mask); } static void __scalar32_min_max_lsh(struct bpf_reg_state *dst_reg, u64 umin_val, u64 umax_val) { - /* We lose all sign bit information (except what we can pick - * up from var_off) - */ - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; /* If we might shift our top bit out, then we know nothing */ - if (umax_val > 31 || dst_reg->u32_max_value > 1ULL << (31 - umax_val)) { - dst_reg->u32_min_value = 0; - dst_reg->u32_max_value = U32_MAX; - } else { - dst_reg->u32_min_value <<= umin_val; - dst_reg->u32_max_value <<= umax_val; - } + if (umax_val > 31 || reg_u32_max(dst_reg) > 1ULL << (31 - umax_val)) + reg_set_urange32(dst_reg, 0, U32_MAX); + else + /* We lose all sign bit information (except what we can pick + * up from var_off) + */ + reg_set_urange32(dst_reg, reg_u32_min(dst_reg) << umin_val, + reg_u32_max(dst_reg) << umax_val); } static void scalar32_min_max_lsh(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u32 umax_val = src_reg->u32_max_value; - u32 umin_val = src_reg->u32_min_value; + u32 umax_val = reg_u32_max(src_reg); + u32 umin_val = reg_u32_min(src_reg); /* u32 alu operation will zext upper bits */ struct tnum subreg = tnum_subreg(dst_reg->var_off); @@ -14618,34 +14289,34 @@ static void scalar32_min_max_lsh(struct bpf_reg_state *dst_reg, static void __scalar64_min_max_lsh(struct bpf_reg_state *dst_reg, u64 umin_val, u64 umax_val) { + struct cnum64 u, s; + /* Special case <<32 because it is a common compiler pattern to sign * extend subreg by doing <<32 s>>32. 
smin/smax assignments are correct * because s32 bounds don't flip sign when shifting to the left by * 32bits. */ - if (umin_val == 32 && umax_val == 32) { - dst_reg->smax_value = (s64)dst_reg->s32_max_value << 32; - dst_reg->smin_value = (s64)dst_reg->s32_min_value << 32; - } else { - dst_reg->smax_value = S64_MAX; - dst_reg->smin_value = S64_MIN; - } + if (umin_val == 32 && umax_val == 32) + s = cnum64_from_srange((s64)reg_s32_min(dst_reg) << 32, + (s64)reg_s32_max(dst_reg) << 32); + else + s = CNUM64_UNBOUNDED; /* If we might shift our top bit out, then we know nothing */ - if (dst_reg->umax_value > 1ULL << (63 - umax_val)) { - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; - } else { - dst_reg->umin_value <<= umin_val; - dst_reg->umax_value <<= umax_val; - } + if (reg_umax(dst_reg) > 1ULL << (63 - umax_val)) + u = CNUM64_UNBOUNDED; + else + u = cnum64_from_urange(reg_umin(dst_reg) << umin_val, + reg_umax(dst_reg) << umax_val); + + dst_reg->r64 = cnum64_intersect(u, s); } static void scalar_min_max_lsh(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u64 umax_val = src_reg->umax_value; - u64 umin_val = src_reg->umin_value; + u64 umax_val = reg_umax(src_reg); + u64 umin_val = reg_umin(src_reg); /* scalar64 calc uses 32bit unshifted bounds so must be called first */ __scalar64_min_max_lsh(dst_reg, umin_val, umax_val); @@ -14660,8 +14331,8 @@ static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { struct tnum subreg = tnum_subreg(dst_reg->var_off); - u32 umax_val = src_reg->u32_max_value; - u32 umin_val = src_reg->u32_min_value; + u32 umax_val = reg_u32_max(src_reg); + u32 umin_val = reg_u32_min(src_reg); /* BPF_RSH is an unsigned shift. If the value in dst_reg might * be negative, then either: @@ -14677,12 +14348,10 @@ static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg, * and rely on inferring new ones from the unsigned bounds and * var_off of the result. 
*/ - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; dst_reg->var_off = tnum_rshift(subreg, umin_val); - dst_reg->u32_min_value >>= umax_val; - dst_reg->u32_max_value >>= umin_val; + reg_set_urange32(dst_reg, reg_u32_min(dst_reg) >> umax_val, + reg_u32_max(dst_reg) >> umin_val); __mark_reg64_unbounded(dst_reg); __update_reg32_bounds(dst_reg); @@ -14691,8 +14360,8 @@ static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg, static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u64 umax_val = src_reg->umax_value; - u64 umin_val = src_reg->umin_value; + u64 umax_val = reg_umax(src_reg); + u64 umin_val = reg_umin(src_reg); /* BPF_RSH is an unsigned shift. If the value in dst_reg might * be negative, then either: @@ -14708,11 +14377,9 @@ static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg, * and rely on inferring new ones from the unsigned bounds and * var_off of the result. */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val); - dst_reg->umin_value >>= umax_val; - dst_reg->umax_value >>= umin_val; + reg_set_urange64(dst_reg, reg_umin(dst_reg) >> umax_val, + reg_umax(dst_reg) >> umin_val); /* Its not easy to operate on alu32 bounds here because it depends * on bits being shifted in. Take easy way out and mark unbounded @@ -14725,22 +14392,19 @@ static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg, static void scalar32_min_max_arsh(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u64 umin_val = src_reg->u32_min_value; + u64 umin_val = reg_u32_min(src_reg); /* Upon reaching here, src_known is true and * umax_val is equal to umin_val. + * Blow away the dst_reg umin_value/umax_value and rely on + * dst_reg var_off to refine the result. 
*/ - dst_reg->s32_min_value = (u32)(((s32)dst_reg->s32_min_value) >> umin_val); - dst_reg->s32_max_value = (u32)(((s32)dst_reg->s32_max_value) >> umin_val); + reg_set_srange32(dst_reg, + (u32)(((s32)reg_s32_min(dst_reg)) >> umin_val), + (u32)(((s32)reg_s32_max(dst_reg)) >> umin_val)); dst_reg->var_off = tnum_arshift(tnum_subreg(dst_reg->var_off), umin_val, 32); - /* blow away the dst_reg umin_value/umax_value and rely on - * dst_reg var_off to refine the result. - */ - dst_reg->u32_min_value = 0; - dst_reg->u32_max_value = U32_MAX; - __mark_reg64_unbounded(dst_reg); __update_reg32_bounds(dst_reg); } @@ -14748,22 +14412,16 @@ static void scalar32_min_max_arsh(struct bpf_reg_state *dst_reg, static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u64 umin_val = src_reg->umin_value; + u64 umin_val = reg_umin(src_reg); /* Upon reaching here, src_known is true and umax_val is equal * to umin_val. */ - dst_reg->smin_value >>= umin_val; - dst_reg->smax_value >>= umin_val; + reg_set_srange64(dst_reg, reg_smin(dst_reg) >> umin_val, + reg_smax(dst_reg) >> umin_val); dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val, 64); - /* blow away the dst_reg umin_value/umax_value and rely on - * dst_reg var_off to refine the result. - */ - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; - /* Its not easy to operate on alu32 bounds here because it depends * on bits being shifted in from upper 32-bits. Take easy way out * and mark unbounded so we can recalculate later from tnum. 
@@ -14829,13 +14487,13 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, if (insn_bitness == 32) { if (tnum_subreg_is_const(src_reg->var_off) - && src_reg->s32_min_value == src_reg->s32_max_value - && src_reg->u32_min_value == src_reg->u32_max_value) + && reg_s32_min(src_reg) == reg_s32_max(src_reg) + && reg_u32_min(src_reg) == reg_u32_max(src_reg)) src_is_const = true; } else { if (tnum_is_const(src_reg->var_off) - && src_reg->smin_value == src_reg->smax_value - && src_reg->umin_value == src_reg->umax_value) + && reg_smin(src_reg) == reg_smax(src_reg) + && reg_umin(src_reg) == reg_umax(src_reg)) src_is_const = true; } @@ -14865,7 +14523,7 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, case BPF_LSH: case BPF_RSH: case BPF_ARSH: - return (src_is_const && src_reg->umax_value < insn_bitness); + return (src_is_const && reg_umax(src_reg) < insn_bitness); default: return false; } @@ -14878,9 +14536,9 @@ static int maybe_fork_scalars(struct bpf_verifier_env *env, struct bpf_insn *ins struct bpf_reg_state *regs; bool alu32; - if (dst_reg->smin_value == -1 && dst_reg->smax_value == 0) + if (reg_smin(dst_reg) == -1 && reg_smax(dst_reg) == 0) alu32 = false; - else if (dst_reg->s32_min_value == -1 && dst_reg->s32_max_value == 0) + else if (reg_s32_min(dst_reg) == -1 && reg_s32_max(dst_reg) == 0) alu32 = true; else return 0; @@ -14964,7 +14622,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, break; case BPF_DIV: /* BPF div specification: x / 0 = 0 */ - if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0)) { + if ((alu32 && reg_u32_min(&src_reg) == 0) || (!alu32 && reg_umin(&src_reg) == 0)) { ___mark_reg_known(dst_reg, 0); break; } @@ -14981,7 +14639,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, break; case BPF_MOD: /* BPF mod specification: x % 0 = x */ - if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0)) + if ((alu32 && 
reg_u32_min(&src_reg) == 0) || (!alu32 && reg_umin(&src_reg) == 0)) break; if (alu32) if (off == 1) @@ -15169,7 +14827,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * umax_value before the ALU operation. After adjust_scalar_min_max_vals(), * alu32 ops will have zero-extended the result, making umax_value <= U32_MAX. */ - u64 dst_umax = dst_reg->umax_value; + u64 dst_umax = reg_umax(dst_reg); err = adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); if (err) @@ -15299,7 +14957,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) * copy register state to dest reg */ assign_scalar_id_before_mov(env, src_reg); - copy_register_state(dst_reg, src_reg); + *dst_reg = *src_reg; dst_reg->subreg_def = DEF_NOT_SUBREG; } else { /* case: R1 = (s8, s16 s32)R2 */ @@ -15311,10 +14969,10 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else if (src_reg->type == SCALAR_VALUE) { bool no_sext; - no_sext = src_reg->umax_value < (1ULL << (insn->off - 1)); + no_sext = reg_umax(src_reg) < (1ULL << (insn->off - 1)); if (no_sext) assign_scalar_id_before_mov(env, src_reg); - copy_register_state(dst_reg, src_reg); + *dst_reg = *src_reg; if (!no_sext) clear_scalar_id(dst_reg); coerce_reg_to_size_sx(dst_reg, insn->off >> 3); @@ -15336,7 +14994,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (is_src_reg_u32) assign_scalar_id_before_mov(env, src_reg); - copy_register_state(dst_reg, src_reg); + *dst_reg = *src_reg; /* Make sure ID is cleared if src_reg is not in u32 * range otherwise dst_reg min/max could be incorrectly * propagated into src_reg by sync_linked_regs() @@ -15346,11 +15004,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) dst_reg->subreg_def = env->insn_idx + 1; } else { /* case: W1 = (s8, s16)W2 */ - bool no_sext = src_reg->umax_value < (1ULL << (insn->off - 1)); + bool no_sext = reg_umax(src_reg) < (1ULL << (insn->off - 
1)); if (no_sext) assign_scalar_id_before_mov(env, src_reg); - copy_register_state(dst_reg, src_reg); + *dst_reg = *src_reg; if (!no_sext) clear_scalar_id(dst_reg); dst_reg->subreg_def = env->insn_idx + 1; @@ -15428,17 +15086,17 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, struct bpf_reg_state *reg; int new_range; - if (dst_reg->umax_value == 0 && range_right_open) + if (reg_umax(dst_reg) == 0 && range_right_open) /* This doesn't give us any range */ return; - if (dst_reg->umax_value > MAX_PACKET_OFF) + if (reg_umax(dst_reg) > MAX_PACKET_OFF) /* Risk of overflow. For instance, ptr + (1<<63) may be less * than pkt_end, but that's because it's also less than pkt. */ return; - new_range = dst_reg->umax_value; + new_range = reg_umax(dst_reg); if (range_right_open) new_range++; @@ -15487,7 +15145,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, /* If our ids match, then we must have the same max_value. And we * don't care about the other reg's fixed offset, since if it's too big * the range won't allow anything. - * dst_reg->umax_value is known < MAX_PACKET_OFF, therefore it fits in a u16. + * reg_umax(dst_reg) is known < MAX_PACKET_OFF, therefore it fits in a u16. */ bpf_for_each_reg_in_vstate(vstate, state, reg, ({ if (reg->type == type && reg->id == dst_reg->id) @@ -15543,14 +15201,14 @@ static int is_scalar_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_s { struct tnum t1 = is_jmp32 ? tnum_subreg(reg1->var_off) : reg1->var_off; struct tnum t2 = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off; - u64 umin1 = is_jmp32 ? (u64)reg1->u32_min_value : reg1->umin_value; - u64 umax1 = is_jmp32 ? (u64)reg1->u32_max_value : reg1->umax_value; - s64 smin1 = is_jmp32 ? (s64)reg1->s32_min_value : reg1->smin_value; - s64 smax1 = is_jmp32 ? (s64)reg1->s32_max_value : reg1->smax_value; - u64 umin2 = is_jmp32 ? (u64)reg2->u32_min_value : reg2->umin_value; - u64 umax2 = is_jmp32 ? 
(u64)reg2->u32_max_value : reg2->umax_value; - s64 smin2 = is_jmp32 ? (s64)reg2->s32_min_value : reg2->smin_value; - s64 smax2 = is_jmp32 ? (s64)reg2->s32_max_value : reg2->smax_value; + u64 umin1 = is_jmp32 ? (u64)reg_u32_min(reg1) : reg_umin(reg1); + u64 umax1 = is_jmp32 ? (u64)reg_u32_max(reg1) : reg_umax(reg1); + s64 smin1 = is_jmp32 ? (s64)reg_s32_min(reg1) : reg_smin(reg1); + s64 smax1 = is_jmp32 ? (s64)reg_s32_max(reg1) : reg_smax(reg1); + u64 umin2 = is_jmp32 ? (u64)reg_u32_min(reg2) : reg_umin(reg2); + u64 umax2 = is_jmp32 ? (u64)reg_u32_max(reg2) : reg_umax(reg2); + s64 smin2 = is_jmp32 ? (s64)reg_s32_min(reg2) : reg_smin(reg2); + s64 smax2 = is_jmp32 ? (s64)reg_s32_max(reg2) : reg_smax(reg2); if (reg1 == reg2) { switch (opcode) { @@ -15595,11 +15253,11 @@ static int is_scalar_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_s * utilize 32-bit subrange knowledge to eliminate * branches that can't be taken a priori */ - if (reg1->u32_min_value > reg2->u32_max_value || - reg1->u32_max_value < reg2->u32_min_value) + if (reg_u32_min(reg1) > reg_u32_max(reg2) || + reg_u32_max(reg1) < reg_u32_min(reg2)) return 0; - if (reg1->s32_min_value > reg2->s32_max_value || - reg1->s32_max_value < reg2->s32_min_value) + if (reg_s32_min(reg1) > reg_s32_max(reg2) || + reg_s32_max(reg1) < reg_s32_min(reg2)) return 0; } break; @@ -15621,11 +15279,11 @@ static int is_scalar_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_s * utilize 32-bit subrange knowledge to eliminate * branches that can't be taken a priori */ - if (reg1->u32_min_value > reg2->u32_max_value || - reg1->u32_max_value < reg2->u32_min_value) + if (reg_u32_min(reg1) > reg_u32_max(reg2) || + reg_u32_max(reg1) < reg_u32_min(reg2)) return 1; - if (reg1->s32_min_value > reg2->s32_max_value || - reg1->s32_max_value < reg2->s32_min_value) + if (reg_s32_min(reg1) > reg_s32_max(reg2) || + reg_s32_max(reg1) < reg_s32_min(reg2)) return 1; } break; @@ -15780,7 +15438,7 @@ static int 
is_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_state *r if (!is_reg_const(reg2, is_jmp32)) return -1; - if (!reg_not_null(reg1)) + if (!reg_not_null(env, reg1)) return -1; /* If pointer is valid tests against zero will fail so we can @@ -15852,27 +15510,15 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state switch (opcode) { case BPF_JEQ: if (is_jmp32) { - reg1->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value); - reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value); - reg1->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value); - reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value); - reg2->u32_min_value = reg1->u32_min_value; - reg2->u32_max_value = reg1->u32_max_value; - reg2->s32_min_value = reg1->s32_min_value; - reg2->s32_max_value = reg1->s32_max_value; + reg1->r32 = cnum32_intersect(reg1->r32, reg2->r32); + reg2->r32 = reg1->r32; t = tnum_intersect(tnum_subreg(reg1->var_off), tnum_subreg(reg2->var_off)); reg1->var_off = tnum_with_subreg(reg1->var_off, t); reg2->var_off = tnum_with_subreg(reg2->var_off, t); } else { - reg1->umin_value = max(reg1->umin_value, reg2->umin_value); - reg1->umax_value = min(reg1->umax_value, reg2->umax_value); - reg1->smin_value = max(reg1->smin_value, reg2->smin_value); - reg1->smax_value = min(reg1->smax_value, reg2->smax_value); - reg2->umin_value = reg1->umin_value; - reg2->umax_value = reg1->umax_value; - reg2->smin_value = reg1->smin_value; - reg2->smax_value = reg1->smax_value; + reg1->r64 = cnum64_intersect(reg1->r64, reg2->r64); + reg2->r64 = reg1->r64; reg1->var_off = tnum_intersect(reg1->var_off, reg2->var_off); reg2->var_off = reg1->var_off; @@ -15889,32 +15535,11 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state */ val = reg_const_value(reg2, is_jmp32); if (is_jmp32) { - /* u32_min_value is not equal to 0xffffffff at this point, - * because otherwise u32_max_value is 0xffffffff as well, - * 
in such a case both reg1 and reg2 would be constants, - jump would be predicted and regs_refine_cond_op() - wouldn't be called. - * - * Same reasoning works for all {u,s}{min,max}{32,64} cases - * below. - */ - if (reg1->u32_min_value == (u32)val) - reg1->u32_min_value++; - if (reg1->u32_max_value == (u32)val) - reg1->u32_max_value--; - if (reg1->s32_min_value == (s32)val) - reg1->s32_min_value++; - if (reg1->s32_max_value == (s32)val) - reg1->s32_max_value--; + /* Complement of the range [val, val] as cnum32. */ + cnum32_intersect_with(&reg1->r32, (struct cnum32){ val + 1, U32_MAX - 1 }); } else { - if (reg1->umin_value == (u64)val) - reg1->umin_value++; - if (reg1->umax_value == (u64)val) - reg1->umax_value--; - if (reg1->smin_value == (s64)val) - reg1->smin_value++; - if (reg1->smax_value == (s64)val) - reg1->smax_value--; + /* Complement of the range [val, val] as cnum64. */ + cnum64_intersect_with(&reg1->r64, (struct cnum64){ val + 1, U64_MAX - 1 }); } break; case BPF_JSET: @@ -15961,38 +15586,38 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state break; case BPF_JLE: if (is_jmp32) { - reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value); - reg2->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value); + cnum32_intersect_with_urange(&reg1->r32, 0, reg_u32_max(reg2)); + cnum32_intersect_with_urange(&reg2->r32, reg_u32_min(reg1), U32_MAX); } else { - reg1->umax_value = min(reg1->umax_value, reg2->umax_value); - reg2->umin_value = max(reg1->umin_value, reg2->umin_value); + cnum64_intersect_with_urange(&reg1->r64, 0, reg_umax(reg2)); + cnum64_intersect_with_urange(&reg2->r64, reg_umin(reg1), U64_MAX); } break; case BPF_JLT: if (is_jmp32) { - reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value - 1); - reg2->u32_min_value = max(reg1->u32_min_value + 1, reg2->u32_min_value); + cnum32_intersect_with_urange(&reg1->r32, 0, reg_u32_max(reg2) - 1); + cnum32_intersect_with_urange(&reg2->r32, reg_u32_min(reg1) + 1, U32_MAX); }
else { - reg1->umax_value = min(reg1->umax_value, reg2->umax_value - 1); - reg2->umin_value = max(reg1->umin_value + 1, reg2->umin_value); + cnum64_intersect_with_urange(&reg1->r64, 0, reg_umax(reg2) - 1); + cnum64_intersect_with_urange(&reg2->r64, reg_umin(reg1) + 1, U64_MAX); } break; case BPF_JSLE: if (is_jmp32) { - reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value); - reg2->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value); + cnum32_intersect_with_srange(&reg1->r32, S32_MIN, reg_s32_max(reg2)); + cnum32_intersect_with_srange(&reg2->r32, reg_s32_min(reg1), S32_MAX); } else { - reg1->smax_value = min(reg1->smax_value, reg2->smax_value); - reg2->smin_value = max(reg1->smin_value, reg2->smin_value); + cnum64_intersect_with_srange(&reg1->r64, S64_MIN, reg_smax(reg2)); + cnum64_intersect_with_srange(&reg2->r64, reg_smin(reg1), S64_MAX); } break; case BPF_JSLT: if (is_jmp32) { - reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value - 1); - reg2->s32_min_value = max(reg1->s32_min_value + 1, reg2->s32_min_value); + cnum32_intersect_with_srange(&reg1->r32, S32_MIN, reg_s32_max(reg2) - 1); + cnum32_intersect_with_srange(&reg2->r32, reg_s32_min(reg1) + 1, S32_MAX); } else { - reg1->smax_value = min(reg1->smax_value, reg2->smax_value - 1); - reg2->smin_value = max(reg1->smin_value + 1, reg2->smin_value); + cnum64_intersect_with_srange(&reg1->r64, S64_MIN, reg_smax(reg2) - 1); + cnum64_intersect_with_srange(&reg2->r64, reg_smin(reg1) + 1, S64_MAX); } break; default: @@ -16030,7 +15655,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, WARN_ON_ONCE(!tnum_equals_const(reg->var_off, 0))) return; if (is_null) { - /* We don't need id and ref_obj_id from this point + /* We don't need id from this point * onwards anymore, thus we should better reset it, * so that state pruning has chances to take effect.
*/ @@ -16042,15 +15667,10 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, mark_ptr_not_null_reg(reg); - if (!reg_may_point_to_spin_lock(reg)) { - /* For not-NULL ptr, reg->ref_obj_id will be reset - * in release_reference(). - * - * reg->id is still used by spin_lock ptr. Other - * than spin_lock ptr type, reg->id can be reset. - */ - reg->id = 0; - } + /* + * reg->id is preserved for object relationship tracking + * and spin_lock lock state tracking + */ } } @@ -16062,10 +15682,9 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, { struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs, *reg; - u32 ref_obj_id = regs[regno].ref_obj_id; u32 id = regs[regno].id; - if (ref_obj_id && ref_obj_id == id && is_null) + if (is_null && find_reference_state(vstate, id)) /* regs[regno] is in the " == NULL" branch. * No one could have freed the reference state before * doing the NULL check. @@ -16263,7 +15882,7 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s reg->delta == known_reg->delta) { s32 saved_subreg_def = reg->subreg_def; - copy_register_state(reg, known_reg); + *reg = *known_reg; reg->subreg_def = saved_subreg_def; } else { s32 saved_subreg_def = reg->subreg_def; @@ -16274,7 +15893,7 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s __mark_reg_known(&fake_reg, (s64)reg->delta - (s64)known_reg->delta); /* reg = known_reg; reg += delta */ - copy_register_state(reg, known_reg); + *reg = *known_reg; /* * Must preserve off, id and subreg_def flag, * otherwise another sync_linked_regs() will be incorrect. 
@@ -16371,16 +15990,16 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } if (insn_flags) { - err = bpf_push_jmp_history(env, this_branch, insn_flags, 0); + err = bpf_push_jmp_history(env, this_branch, insn_flags, 0, 0, 0); if (err) return err; } is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; - copy_register_state(&env->false_reg1, dst_reg); - copy_register_state(&env->false_reg2, src_reg); - copy_register_state(&env->true_reg1, dst_reg); - copy_register_state(&env->true_reg2, src_reg); + env->false_reg1 = *dst_reg; + env->false_reg2 = *src_reg; + env->true_reg1 = *dst_reg; + env->true_reg2 = *src_reg; pred = is_branch_taken(env, dst_reg, src_reg, opcode, is_jmp32); if (pred >= 0) { /* If we get here with a dst_reg pointer type it is because @@ -16435,7 +16054,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (dst_reg->type == SCALAR_VALUE && dst_reg->id) collect_linked_regs(env, this_branch, dst_reg->id, &linked_regs); if (linked_regs.cnt > 1) { - err = bpf_push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs)); + err = bpf_push_jmp_history(env, this_branch, 0, 0, 0, linked_regs_pack(&linked_regs)); if (err) return err; } @@ -16449,11 +16068,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (err) return err; - copy_register_state(dst_reg, &env->false_reg1); - copy_register_state(src_reg, &env->false_reg2); - copy_register_state(&other_branch_regs[insn->dst_reg], &env->true_reg1); + *dst_reg = env->false_reg1; + *src_reg = env->false_reg2; + other_branch_regs[insn->dst_reg] = env->true_reg1; if (BPF_SRC(insn->code) == BPF_X) - copy_register_state(&other_branch_regs[insn->src_reg], &env->true_reg2); + other_branch_regs[insn->src_reg] = env->true_reg2; if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id && @@ -16788,6 +16407,9 @@ static bool return_retval_range(struct bpf_verifier_env *env, struct bpf_retval_ case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case 
BPF_TRACE_FSESSION: + case BPF_TRACE_FENTRY_MULTI: + case BPF_TRACE_FEXIT_MULTI: + case BPF_TRACE_FSESSION_MULTI: *range = retval_range(0, 0); break; case BPF_TRACE_RAW_TP: @@ -16904,8 +16526,8 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char ret_type = btf_type_resolve_ptr(prog->aux->attach_btf, prog->aux->attach_func_proto->type, NULL); - if (ret_type && ret_type == reg_type && reg->ref_obj_id) - return __check_ptr_off_reg(env, reg, regno, false); + if (ret_type && ret_type == reg_type && reg_is_referenced(env, reg)) + return __check_ptr_off_reg(env, reg, argno_from_reg(regno), false); } /* eBPF calling convention is such that R0 is used @@ -16977,6 +16599,10 @@ static int check_global_subprog_return_code(struct bpf_verifier_env *env) if (err) return err; + /* Pointers to arena are safe to pass between subprograms. */ + if (is_arena_reg(env, BPF_REG_0)) + return 0; + if (is_pointer_value(env, BPF_REG_0)) { verbose(env, "R%d leaks addr as return value\n", BPF_REG_0); return -EACCES; @@ -17493,16 +17119,16 @@ static int indirect_jump_min_max_index(struct bpf_verifier_env *env, u32 *pmin_index, u32 *pmax_index) { struct bpf_reg_state *reg = reg_state(env, regno); - u64 min_index = reg->umin_value; - u64 max_index = reg->umax_value; + u64 min_index = reg_umin(reg); + u64 max_index = reg_umax(reg); const u32 size = 8; if (min_index > (u64) U32_MAX * size) { - verbose(env, "the sum of R%u umin_value %llu is too big\n", regno, reg->umin_value); + verbose(env, "the sum of R%u umin_value %llu is too big\n", regno, reg_umin(reg)); return -ERANGE; } if (max_index > (u64) U32_MAX * size) { - verbose(env, "the sum of R%u umax_value %llu is too big\n", regno, reg->umax_value); + verbose(env, "the sum of R%u umax_value %llu is too big\n", regno, reg_umax(reg)); return -ERANGE; } @@ -17601,6 +17227,14 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) return check_store_reg(env, insn, false); case BPF_ST: { + /* 
Handle stack arg write (store immediate) */ + if (is_stack_arg_st(insn)) { + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + + return check_stack_arg_write(env, state, insn->off, NULL); + } + enum bpf_reg_type dst_reg_type; err = check_reg_arg(env, insn->dst_reg, SRC_OP); @@ -17609,7 +17243,7 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) dst_reg_type = cur_regs(env)[insn->dst_reg].type; - err = check_mem_access(env, env->insn_idx, insn->dst_reg, + err = check_mem_access(env, env->insn_idx, cur_regs(env) + insn->dst_reg, argno_from_reg(insn->dst_reg), insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, false, false); if (err) @@ -17635,6 +17269,8 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) } } mark_reg_scratched(env, BPF_REG_0); + if (bpf_in_stack_arg_cnt(&env->subprog_info[cur_func(env)->subprogno])) + cur_func(env)->no_stack_arg_load = true; if (insn->src_reg == BPF_PSEUDO_CALL) return check_func_call(env, insn, &env->insn_idx); if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) @@ -17732,7 +17368,7 @@ static int do_check(struct bpf_verifier_env *env) } if (bpf_is_jmp_point(env, env->insn_idx)) { - err = bpf_push_jmp_history(env, state, 0, 0); + err = bpf_push_jmp_history(env, state, 0, 0, 0, 0); if (err) return err; } @@ -18117,11 +17753,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, if (prog->sleepable) switch (map->map_type) { case BPF_MAP_TYPE_HASH: + case BPF_MAP_TYPE_RHASH: case BPF_MAP_TYPE_LRU_HASH: case BPF_MAP_TYPE_ARRAY: case BPF_MAP_TYPE_PERCPU_HASH: case BPF_MAP_TYPE_PERCPU_ARRAY: case BPF_MAP_TYPE_LRU_PERCPU_HASH: + case BPF_MAP_TYPE_LPM_TRIE: case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: case BPF_MAP_TYPE_RINGBUF: @@ -18439,11 +18077,12 @@ static int check_and_resolve_insns(struct bpf_verifier_env *env) return err; for (i = 0; i < insn_cnt; i++, insn++) { - if 
(insn->dst_reg >= MAX_BPF_REG) { + if (insn->dst_reg >= MAX_BPF_REG && + !is_stack_arg_st(insn) && !is_stack_arg_stx(insn)) { verbose(env, "R%d is invalid\n", insn->dst_reg); return -EINVAL; } - if (insn->src_reg >= MAX_BPF_REG) { + if (insn->src_reg >= MAX_BPF_REG && !is_stack_arg_ldx(insn)) { verbose(env, "R%d is invalid\n", insn->src_reg); return -EINVAL; } @@ -18750,7 +18389,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) goto out; } } - for (i = BPF_REG_1; i <= sub->arg_cnt; i++) { + for (i = BPF_REG_1; i <= min_t(u32, sub->arg_cnt, MAX_BPF_FUNC_REG_ARGS); i++) { arg = &sub->args[i - BPF_REG_1]; reg = &regs[i]; @@ -18760,9 +18399,9 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) } else if (arg->arg_type == ARG_ANYTHING) { reg->type = SCALAR_VALUE; mark_reg_unknown(env, regs, i); - } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) { + } else if (arg->arg_type == ARG_PTR_TO_DYNPTR) { /* assume unspecial LOCAL dynptr type */ - __mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen); + __mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen, 0); } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) { reg->type = PTR_TO_MEM; reg->type |= arg->arg_type & @@ -18788,11 +18427,17 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) mark_reg_unknown(env, regs, i); } else { verifier_bug(env, "unhandled arg#%d type %d", - i - BPF_REG_1, arg->arg_type); + i - BPF_REG_1 + 1, arg->arg_type); ret = -EFAULT; goto out; } } + if (env->prog->type == BPF_PROG_TYPE_EXT && sub->arg_cnt > MAX_BPF_FUNC_REG_ARGS) { + verbose(env, "freplace programs with >%d args not supported yet\n", + MAX_BPF_FUNC_REG_ARGS); + ret = -EINVAL; + goto out; + } } else { /* if main BPF program has associated BTF info, validate that * it's matching expected signature, and otherwise mark BTF @@ -18800,8 +18445,11 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) */ if
(env->prog->aux->func_info_aux) { ret = btf_prepare_func_args(env, 0); - if (ret || sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_PTR_TO_CTX) + if (ret || sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_PTR_TO_CTX) { env->prog->aux->func_info_aux[0].unreliable = true; + sub->arg_cnt = 1; + sub->stack_arg_cnt = 0; + } } /* 1st arg to a function */ @@ -18811,9 +18459,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) /* Acquire references for struct_ops program arguments tagged with "__ref" */ if (!subprog && env->prog->type == BPF_PROG_TYPE_STRUCT_OPS) { - for (i = 0; i < aux->ctx_arg_info_size; i++) - aux->ctx_arg_info[i].ref_obj_id = aux->ctx_arg_info[i].refcounted ? - acquire_reference(env, 0) : 0; + for (i = 0; i < aux->ctx_arg_info_size; i++) { + ret = aux->ctx_arg_info[i].refcounted ? acquire_reference(env, 0, 0) : 0; + if (ret < 0) + goto out; + + aux->ctx_arg_info[i].ref_id = ret; + } } ret = do_check(env); @@ -18849,6 +18501,7 @@ static int do_check_subprogs(struct bpf_verifier_env *env) struct bpf_prog_aux *aux = env->prog->aux; struct bpf_func_info_aux *sub_aux; int i, ret, new_cnt; + u32 insn_processed; if (!aux->func_info) return 0; @@ -18863,6 +18516,8 @@ again: if (!bpf_subprog_is_global(env, i)) continue; + insn_processed = env->insn_processed; + sub_aux = subprog_aux(env, i); if (!sub_aux->called || sub_aux->verified) continue; @@ -18870,6 +18525,7 @@ again: env->insn_idx = env->subprog_info[i].start; WARN_ON_ONCE(env->insn_idx == 0); ret = do_check_common(env, i); + env->subprog_info[i].insn_processed = env->insn_processed - insn_processed; if (ret) { return ret; } else if (env->log.level & BPF_LOG_LEVEL) { @@ -18896,10 +18552,12 @@ again: static int do_check_main(struct bpf_verifier_env *env) { + u32 insn_processed = env->insn_processed; int ret; env->insn_idx = 0; ret = do_check_common(env, 0); + env->subprog_info[0].insn_processed = env->insn_processed - insn_processed; if (!ret) env->prog->aux->stack_depth = 
env->subprog_info[0].stack_depth; return ret; @@ -18908,19 +18566,20 @@ static int do_check_main(struct bpf_verifier_env *env) static void print_verification_stats(struct bpf_verifier_env *env) { - int i; + /* Skip over hidden subprogs which are not verified. */ + int i, subprog_cnt = env->subprog_cnt - env->hidden_subprog_cnt; if (env->log.level & BPF_LOG_STATS) { verbose(env, "verification time %lld usec\n", div_u64(env->verification_time, 1000)); - verbose(env, "stack depth "); - for (i = 0; i < env->subprog_cnt; i++) { - u32 depth = env->subprog_info[i].stack_depth; - - verbose(env, "%d", depth); - if (i + 1 < env->subprog_cnt) - verbose(env, "+"); - } + verbose(env, "stack depth %d", env->subprog_info[0].stack_depth); + for (i = 1; i < subprog_cnt; i++) + verbose(env, "+%d", env->subprog_info[i].stack_depth); + verbose(env, " max %d\n", env->max_stack_depth); + verbose(env, "insns processed %d", env->subprog_info[0].insn_processed); + for (i = 1; i < subprog_cnt; i++) + if (bpf_subprog_is_global(env, i)) + verbose(env, "+%d", env->subprog_info[i].insn_processed); verbose(env, "\n"); } verbose(env, "processed %d insns (limit %d) max_states_per_insn %d " @@ -19142,6 +18801,60 @@ static int check_attach_modify_return(unsigned long addr, const char *func_name) #endif /* CONFIG_FUNCTION_ERROR_INJECTION */ +static bool is_tracing_multi_id(const struct bpf_prog *prog, u32 btf_id) +{ + return is_tracing_multi(prog->expected_attach_type) && bpf_multi_func_btf_id[0] == btf_id; +} + +static int btf_id_allow_sleepable(u32 btf_id, unsigned long addr, const struct bpf_prog *prog, + const struct btf *btf) +{ + const struct btf_type *t; + const char *tname; + + switch (prog->type) { + case BPF_PROG_TYPE_TRACING: + t = btf_type_by_id(btf, btf_id); + if (!t) + return -EINVAL; + tname = btf_name_by_offset(btf, t->name_off); + if (!tname) + return -EINVAL; + + /* + * *.multi sleepable programs will pass initial sleepable check, + * the actual attached btf ids are checked later 
during the link + * attachment. + */ + if (is_tracing_multi_id(prog, btf_id)) + return 0; + if (!check_attach_sleepable(btf_id, addr, tname)) + return 0; + /* + * fentry/fexit/fmod_ret progs can also be sleepable if they are + * in the fmodret id set with the KF_SLEEPABLE flag. + */ + else { + u32 *flags = btf_kfunc_is_modify_return(btf, btf_id, prog); + + if (flags && (*flags & KF_SLEEPABLE)) + return 0; + } + break; + case BPF_PROG_TYPE_LSM: + /* + * LSM progs check that they are attached to bpf_lsm_*() funcs. + * Only some of them are sleepable. + */ + if (bpf_lsm_is_sleepable_hook(btf_id)) + return 0; + break; + default: + break; + } + return -EINVAL; +} + int bpf_check_attach_target(struct bpf_verifier_log *log, const struct bpf_prog *prog, const struct bpf_prog *tgt_prog, @@ -19264,7 +18977,10 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, prog_extension && (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY || tgt_prog->expected_attach_type == BPF_TRACE_FEXIT || - tgt_prog->expected_attach_type == BPF_TRACE_FSESSION)) { + tgt_prog->expected_attach_type == BPF_TRACE_FENTRY_MULTI || + tgt_prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI || + tgt_prog->expected_attach_type == BPF_TRACE_FSESSION || + tgt_prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI)) { /* Program extensions can extend all program types * except fentry/fexit. The reason is the following. 
* The fentry/fexit programs are used for performance @@ -19314,6 +19030,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, btp = bpf_get_raw_tracepoint(tname); if (!btp) return -EINVAL; + if (prog->sleepable && !tracepoint_is_faultable(btp->tp)) { + bpf_log(log, "Sleepable program cannot attach to non-faultable tracepoint %s\n", + tname); + bpf_put_raw_tracepoint(btp); + return -EINVAL; + } fname = kallsyms_lookup((unsigned long)btp->bpf_func, NULL, NULL, NULL, trace_symbol); bpf_put_raw_tracepoint(btp); @@ -19364,7 +19086,11 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_TRACE_FSESSION: - if (prog->expected_attach_type == BPF_TRACE_FSESSION && + case BPF_TRACE_FSESSION_MULTI: + case BPF_TRACE_FENTRY_MULTI: + case BPF_TRACE_FEXIT_MULTI: + if ((prog->expected_attach_type == BPF_TRACE_FSESSION || + prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) && !bpf_jit_supports_fsession()) { bpf_log(log, "JIT does not support fsession\n"); return -EOPNOTSUPP; @@ -19393,7 +19119,18 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, if (ret < 0) return ret; - if (tgt_prog) { + /* + * *.multi programs don't need an address during program + * verification, we just take the module ref if needed. + */ + if (is_tracing_multi_id(prog, btf_id)) { + if (btf_is_module(btf)) { + mod = btf_try_get_module(btf); + if (!mod) + return -ENOENT; + } + addr = 0; + } else if (tgt_prog) { if (subprog == 0) addr = (long) tgt_prog->bpf_func; else @@ -19418,32 +19155,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, } if (prog->sleepable) { - ret = -EINVAL; - switch (prog->type) { - case BPF_PROG_TYPE_TRACING: - if (!check_attach_sleepable(btf_id, addr, tname)) - ret = 0; - /* fentry/fexit/fmod_ret progs can also be sleepable if they are - * in the fmodret id set with the KF_SLEEPABLE flag. 
- */ - else { - u32 *flags = btf_kfunc_is_modify_return(btf, btf_id, - prog); - - if (flags && (*flags & KF_SLEEPABLE)) - ret = 0; - } - break; - case BPF_PROG_TYPE_LSM: - /* LSM progs check that they are attached to bpf_lsm_*() funcs. - * Only some of them are sleepable. - */ - if (bpf_lsm_is_sleepable_hook(btf_id)) - ret = 0; - break; - default: - break; - } + ret = btf_id_allow_sleepable(btf_id, addr, prog, btf); if (ret) { module_put(mod); bpf_log(log, "%s is not sleepable\n", tname); @@ -19530,14 +19242,22 @@ static bool can_be_sleepable(struct bpf_prog *prog) case BPF_MODIFY_RETURN: case BPF_TRACE_ITER: case BPF_TRACE_FSESSION: + case BPF_TRACE_RAW_TP: + case BPF_TRACE_FENTRY_MULTI: + case BPF_TRACE_FEXIT_MULTI: + case BPF_TRACE_FSESSION_MULTI: return true; default: return false; } } - return prog->type == BPF_PROG_TYPE_LSM || - prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ || - prog->type == BPF_PROG_TYPE_STRUCT_OPS; + if (prog->type == BPF_PROG_TYPE_LSM) + return prog->expected_attach_type != BPF_LSM_CGROUP; + + return prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ || + prog->type == BPF_PROG_TYPE_STRUCT_OPS || + prog->type == BPF_PROG_TYPE_RAW_TRACEPOINT || + prog->type == BPF_PROG_TYPE_TRACEPOINT; } static int check_attach_btf_id(struct bpf_verifier_env *env) @@ -19559,7 +19279,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) } if (prog->sleepable && !can_be_sleepable(prog)) { - verbose(env, "Only fentry/fexit/fsession/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n"); + verbose(env, "Program of this type cannot be sleepable\n"); return -EINVAL; } @@ -19612,6 +19332,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return -EINVAL; } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT || prog->expected_attach_type == BPF_TRACE_FSESSION || + prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI || prog->expected_attach_type == BPF_MODIFY_RETURN) && 
btf_id_set_contains(&noreturn_deny, btf_id)) { verbose(env, "Attaching fexit/fsession/fmod_ret to __noreturn function '%s' is rejected.\n", @@ -19619,6 +19340,14 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return -EINVAL; } + /* + * We don't get trampoline for tracing_multi programs at this point, + * it's done when tracing_multi link is created. + */ + if (prog->type == BPF_PROG_TYPE_TRACING && + is_tracing_multi(prog->expected_attach_type)) + return 0; + key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id); tr = bpf_trampoline_get(key, &tgt_info); if (!tr) @@ -19631,6 +19360,62 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return 0; } +int bpf_check_attach_btf_id_multi(struct btf *btf, struct bpf_prog *prog, u32 btf_id, + struct bpf_attach_target_info *tgt_info) +{ + const struct btf_type *t; + unsigned long addr; + const char *tname; + int err; + + if (!btf_id || !btf) + return -EINVAL; + + /* Check noreturn attachment. */ + if ((prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI || + prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) && + btf_id_set_contains(&noreturn_deny, btf_id)) + return -EINVAL; + /* Check denied attachment. */ + if (btf_id_set_contains(&btf_id_deny, btf_id)) + return -EINVAL; + + /* Check and get function target data. */ + t = btf_type_by_id(btf, btf_id); + if (!t) + return -EINVAL; + tname = btf_name_by_offset(btf, t->name_off); + if (!tname) + return -EINVAL; + if (!btf_type_is_func(t)) + return -EINVAL; + t = btf_type_by_id(btf, t->type); + if (!btf_type_is_func_proto(t)) + return -EINVAL; + err = btf_distill_func_proto(NULL, btf, t, tname, &tgt_info->fmodel); + if (err < 0) + return err; + if (btf_is_module(btf)) { + /* The bpf program already holds reference to module. 
*/ + if (WARN_ON_ONCE(!prog->aux->mod)) + return -EINVAL; + addr = find_kallsyms_symbol_value(prog->aux->mod, tname); + } else { + addr = kallsyms_lookup_name(tname); + } + if (!addr || !ftrace_location(addr)) + return -ENOENT; + + /* Check sleepable program attachment. */ + if (prog->sleepable) { + err = btf_id_allow_sleepable(btf_id, addr, prog, btf); + if (err) + return err; + } + tgt_info->tgt_addr = addr; + return 0; +} + struct btf *bpf_get_btf_vmlinux(void) { if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { @@ -19849,8 +19634,11 @@ int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int struct_meta_reg = BPF_REG_3; int node_offset_reg = BPF_REG_4; - /* rbtree_add has extra 'less' arg, so args-to-fixup are in diff regs */ - if (is_bpf_rbtree_add_kfunc(desc->func_id)) { + /* list_add/rbtree_add have an extra arg (prev/less), + * so args-to-fixup are in diff regs. + */ + if (desc->func_id == special_kfunc_list[KF_bpf_list_add] || + is_bpf_rbtree_add_kfunc(desc->func_id)) { struct_meta_reg = BPF_REG_4; node_offset_reg = BPF_REG_5; } @@ -19868,7 +19656,9 @@ int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); *cnt = 1; } else if (desc->func_id == special_kfunc_list[KF_bpf_session_is_return] && - env->prog->expected_attach_type == BPF_TRACE_FSESSION) { + (env->prog->expected_attach_type == BPF_TRACE_FSESSION || + env->prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI)) { + /* * inline the bpf_session_is_return() for fsession: * bool bpf_session_is_return(void *ctx) @@ -19881,7 +19671,8 @@ int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1); *cnt = 3; } else if (desc->func_id == special_kfunc_list[KF_bpf_session_cookie] && - env->prog->expected_attach_type == BPF_TRACE_FSESSION) { + (env->prog->expected_attach_type == BPF_TRACE_FSESSION || + env->prog->expected_attach_type == 
BPF_TRACE_FSESSION_MULTI)) { /* * inline bpf_session_cookie() for fsession: * __u64 *bpf_session_cookie(void *ctx) @@ -19912,12 +19703,12 @@ int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return 0; } -int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) +int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, + struct bpf_log_attr *attr_log) { u64 start_time = ktime_get_ns(); struct bpf_verifier_env *env; int i, len, ret = -EINVAL, err; - u32 log_true_size; bool is_priv; BTF_TYPE_EMIT(enum bpf_features); @@ -19964,9 +19755,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 /* user could have requested verbose verifier output * and supplied buffer to store the verification trace */ - ret = bpf_vlog_init(&env->log, attr->log_level, - (char __user *) (unsigned long) attr->log_buf, - attr->log_size); + ret = bpf_vlog_init(&env->log, attr_log->level, attr_log->ubuf, attr_log->size); if (ret) goto err_unlock; @@ -20128,17 +19917,10 @@ skip_full_check: env->prog->aux->verified_insns = env->insn_processed; /* preserve original error even if log finalization is successful */ - err = bpf_vlog_finalize(&env->log, &log_true_size); + err = bpf_log_attr_finalize(attr_log, &env->log); if (err) ret = err; - if (uattr_size >= offsetofend(union bpf_attr, log_true_size) && - copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, log_true_size), - &log_true_size, sizeof(log_true_size))) { - ret = -EFAULT; - goto err_release_maps; - } - if (ret) goto err_release_maps; |
