From cf83b2d2e2b64920bd6999b199dfa271d7e94cf8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 27 Oct 2020 23:10:54 -0700 Subject: bpf: Permit cond_resched for some iterators Commit e679654a704e ("bpf: Fix a rcu_sched stall issue with bpf task/task_file iterator") tries to fix rcu stalls warning which is caused by bpf task_file iterator when running "bpftool prog". rcu: INFO: rcu_sched self-detected stall on CPU rcu: \x097-....: (20999 ticks this GP) idle=302/1/0x4000000000000000 softirq=1508852/1508852 fqs=4913 \x09(t=21031 jiffies g=2534773 q=179750) NMI backtrace for cpu 7 CPU: 7 PID: 184195 Comm: bpftool Kdump: loaded Tainted: G W 5.8.0-00004-g68bfc7f8c1b4 #6 Hardware name: Quanta Twin Lakes MP/Twin Lakes Passive MP, BIOS F09_3A17 05/03/2019 Call Trace: dump_stack+0x57/0x70 nmi_cpu_backtrace.cold+0x14/0x53 ? lapic_can_unplug_cpu.cold+0x39/0x39 nmi_trigger_cpumask_backtrace+0xb7/0xc7 rcu_dump_cpu_stacks+0xa2/0xd0 rcu_sched_clock_irq.cold+0x1ff/0x3d9 ? tick_nohz_handler+0x100/0x100 update_process_times+0x5b/0x90 tick_sched_timer+0x5e/0xf0 __hrtimer_run_queues+0x12a/0x2a0 hrtimer_interrupt+0x10e/0x280 __sysvec_apic_timer_interrupt+0x51/0xe0 asm_call_on_stack+0xf/0x20 sysvec_apic_timer_interrupt+0x6f/0x80 ... task_file_seq_next+0x52/0xa0 bpf_seq_read+0xb9/0x320 vfs_read+0x9d/0x180 ksys_read+0x5f/0xe0 do_syscall_64+0x38/0x60 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The fix is to limit the number of bpf program runs to be one million. This fixed the program in most cases. But we also found under heavy load, which can increase the wallclock time for bpf_seq_read(), the warning may still be possible. For example, calling bpf_delay() in the "while" loop of bpf_seq_read(), which will introduce artificial delay, the warning will show up in my qemu run. static unsigned q; volatile unsigned *p = &q; volatile unsigned long long ll; static void bpf_delay(void) { int i, j; for (i = 0; i < 10000; i++) for (j = 0; j < 10000; j++) ll += *p; } There are two ways to fix this issue. One is to reduce the above one million threshold to say 100,000 and hopefully rcu warning will not show up any more. Another is to introduce a target feature which enables bpf_seq_read() calling cond_resched(). This patch took second approach as the first approach may cause more -EAGAIN failures for read() syscalls. Note that not all bpf_iter targets can permit cond_resched() in bpf_seq_read() as some, e.g., netlink seq iterator, rcu read lock critical section spans through seq_ops->next() -> seq_ops->show() -> seq_ops->next(). For the kernel code with the above hack, "bpftool p" roughly takes 38 seconds to finish on my VM with 184 bpf program runs. Using the following command, I am able to collect the number of context switches: perf stat -e context-switches -- ./bpftool p >& log Without this patch, 69 context-switches With this patch, 75 context-switches This patch added additional 6 context switches, roughly every 6 seconds to reschedule, to avoid lengthy no-rescheduling which may cause the above RCU warnings. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201028061054.1411116-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 14 ++++++++++++++ kernel/bpf/task_iter.c | 2 ++ 2 files changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 8f10e30ea0b0..5454161407f1 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -67,6 +67,15 @@ static void bpf_iter_done_stop(struct seq_file *seq) iter_priv->done_stop = true; } +static bool bpf_iter_support_resched(struct seq_file *seq) +{ + struct bpf_iter_priv_data *iter_priv; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + return iter_priv->tinfo->reg_info->feature & BPF_ITER_RESCHED; +} + /* maximum visited objects before bailing out */ #define MAX_ITER_OBJECTS 1000000 @@ -83,6 +92,7 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, struct seq_file *seq = file->private_data; size_t n, offs, copied = 0; int err = 0, num_objs = 0; + bool can_resched; void *p; mutex_lock(&seq->lock); @@ -135,6 +145,7 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, goto done; } + can_resched = bpf_iter_support_resched(seq); while (1) { loff_t pos = seq->index; @@ -180,6 +191,9 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, } break; } + + if (can_resched) + cond_resched(); } stop: offs = seq->count; diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 5b6af30bfbcd..1fdb2fc196cd 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -337,6 +337,7 @@ static const struct bpf_iter_seq_info task_seq_info = { static struct bpf_iter_reg task_reg_info = { .target = "task", + .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__task, task), @@ -354,6 +355,7 @@ static const struct bpf_iter_seq_info task_file_seq_info = { static struct bpf_iter_reg task_file_reg_info = { .target = "task_file", + .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__task_file, task), -- cgit v1.2.3 From c50eb518e262fa06bd334e6eec172eaf5d7a5bd9 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 29 Oct 2020 00:19:24 -0700 Subject: bpf: Use separate lockdep class for each hashtab If a hashtab is accessed in both NMI and non-NMI contexts, it may cause deadlock in bucket->lock. LOCKDEP NMI warning highlighted this issue: ./test_progs -t stacktrace [ 74.828970] [ 74.828971] ================================ [ 74.828972] WARNING: inconsistent lock state [ 74.828973] 5.9.0-rc8+ #275 Not tainted [ 74.828974] -------------------------------- [ 74.828975] inconsistent {INITIAL USE} -> {IN-NMI} usage. [ 74.828976] taskset/1174 [HC2[2]:SC0[0]:HE0:SE1] takes: [ 74.828977] ffffc90000ee96b0 (&htab->buckets[i].raw_lock){....}-{2:2}, at: htab_map_update_elem+0x271/0x5a0 [ 74.828981] {INITIAL USE} state was registered at: [ 74.828982] lock_acquire+0x137/0x510 [ 74.828983] _raw_spin_lock_irqsave+0x43/0x90 [ 74.828984] htab_map_update_elem+0x271/0x5a0 [ 74.828984] 0xffffffffa0040b34 [ 74.828985] trace_call_bpf+0x159/0x310 [ 74.828986] perf_trace_run_bpf_submit+0x5f/0xd0 [ 74.828987] perf_trace_urandom_read+0x1be/0x220 [ 74.828988] urandom_read_nowarn.isra.0+0x26f/0x380 [ 74.828989] vfs_read+0xf8/0x280 [ 74.828989] ksys_read+0xc9/0x160 [ 74.828990] do_syscall_64+0x33/0x40 [ 74.828991] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 74.828992] irq event stamp: 1766 [ 74.828993] hardirqs last enabled at (1765): [] asm_exc_page_fault+0x1e/0x30 [ 74.828994] hardirqs last disabled at (1766): [] irqentry_enter+0x37/0x60 [ 74.828995] softirqs last enabled at (856): [] fpu__clear+0xac/0x120 [ 74.828996] softirqs last disabled at (854): [] fpu__clear+0x20/0x120 [ 74.828997] [ 74.828998] other info that might help us debug this: [ 74.828999] Possible unsafe locking scenario: [ 74.828999] [ 74.829000] CPU0 [ 74.829001] ---- [ 74.829001] lock(&htab->buckets[i].raw_lock); [ 74.829003] [ 74.829004] lock(&htab->buckets[i].raw_lock); [ 74.829006] [ 74.829006] *** DEADLOCK *** [ 74.829007] [ 74.829008] 1 lock held by taskset/1174: [ 74.829008] #0: ffff8883ec3fd020 (&cpuctx_lock){-...}-{2:2}, at: perf_event_task_tick+0x101/0x650 [ 74.829012] [ 74.829013] stack backtrace: [ 74.829014] CPU: 0 PID: 1174 Comm: taskset Not tainted 5.9.0-rc8+ #275 [ 74.829015] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014 [ 74.829016] Call Trace: [ 74.829016] [ 74.829017] dump_stack+0x9a/0xd0 [ 74.829018] lock_acquire+0x461/0x510 [ 74.829019] ? lock_release+0x6b0/0x6b0 [ 74.829020] ? stack_map_get_build_id_offset+0x45e/0x800 [ 74.829021] ? htab_map_update_elem+0x271/0x5a0 [ 74.829022] ? rcu_read_lock_held_common+0x1a/0x50 [ 74.829022] ? rcu_read_lock_held+0x5f/0xb0 [ 74.829023] _raw_spin_lock_irqsave+0x43/0x90 [ 74.829024] ? htab_map_update_elem+0x271/0x5a0 [ 74.829025] htab_map_update_elem+0x271/0x5a0 [ 74.829026] bpf_prog_1fd9e30e1438d3c5_oncpu+0x9c/0xe88 [ 74.829027] bpf_overflow_handler+0x127/0x320 [ 74.829028] ? perf_event_text_poke_output+0x4d0/0x4d0 [ 74.829029] ? sched_clock_cpu+0x18/0x130 [ 74.829030] __perf_event_overflow+0xae/0x190 [ 74.829030] handle_pmi_common+0x34c/0x470 [ 74.829031] ? intel_pmu_save_and_restart+0x90/0x90 [ 74.829032] ? lock_acquire+0x3f8/0x510 [ 74.829033] ? lock_release+0x6b0/0x6b0 [ 74.829034] intel_pmu_handle_irq+0x11e/0x240 [ 74.829034] perf_event_nmi_handler+0x40/0x60 [ 74.829035] nmi_handle+0x110/0x360 [ 74.829036] ? __intel_pmu_enable_all.constprop.0+0x72/0xf0 [ 74.829037] default_do_nmi+0x6b/0x170 [ 74.829038] exc_nmi+0x106/0x130 [ 74.829038] end_repeat_nmi+0x16/0x55 [ 74.829039] RIP: 0010:__intel_pmu_enable_all.constprop.0+0x72/0xf0 [ 74.829042] Code: 2f 1f 03 48 8d bb b8 0c 00 00 e8 29 09 41 00 48 ... [ 74.829043] RSP: 0000:ffff8880a604fc90 EFLAGS: 00000002 [ 74.829044] RAX: 000000070000000f RBX: ffff8883ec2195a0 RCX: 000000000000038f [ 74.829045] RDX: 0000000000000007 RSI: ffffffff82e72c20 RDI: ffff8883ec21a258 [ 74.829046] RBP: 000000070000000f R08: ffffffff8101b013 R09: fffffbfff0a7982d [ 74.829047] R10: ffffffff853cc167 R11: fffffbfff0a7982c R12: 0000000000000000 [ 74.829049] R13: ffff8883ec3f0af0 R14: ffff8883ec3fd120 R15: ffff8883e9c92098 [ 74.829049] ? intel_pmu_lbr_enable_all+0x43/0x240 [ 74.829050] ? __intel_pmu_enable_all.constprop.0+0x72/0xf0 [ 74.829051] ? __intel_pmu_enable_all.constprop.0+0x72/0xf0 [ 74.829052] [ 74.829053] perf_event_task_tick+0x48d/0x650 [ 74.829054] scheduler_tick+0x129/0x210 [ 74.829054] update_process_times+0x37/0x70 [ 74.829055] tick_sched_handle.isra.0+0x35/0x90 [ 74.829056] tick_sched_timer+0x8f/0xb0 [ 74.829057] __hrtimer_run_queues+0x364/0x7d0 [ 74.829058] ? tick_sched_do_timer+0xa0/0xa0 [ 74.829058] ? enqueue_hrtimer+0x1e0/0x1e0 [ 74.829059] ? recalibrate_cpu_khz+0x10/0x10 [ 74.829060] ? ktime_get_update_offsets_now+0x1a3/0x360 [ 74.829061] hrtimer_interrupt+0x1bb/0x360 [ 74.829062] ? rcu_read_lock_sched_held+0xa1/0xd0 [ 74.829063] __sysvec_apic_timer_interrupt+0xed/0x3d0 [ 74.829064] sysvec_apic_timer_interrupt+0x3f/0xd0 [ 74.829064] ? asm_sysvec_apic_timer_interrupt+0xa/0x20 [ 74.829065] asm_sysvec_apic_timer_interrupt+0x12/0x20 [ 74.829066] RIP: 0033:0x7fba18d579b4 [ 74.829068] Code: 74 54 44 0f b6 4a 04 41 83 e1 0f 41 80 f9 ... [ 74.829069] RSP: 002b:00007ffc9ba69570 EFLAGS: 00000206 [ 74.829071] RAX: 00007fba192084c0 RBX: 00007fba18c24d28 RCX: 00000000000007a4 [ 74.829072] RDX: 00007fba18c30488 RSI: 0000000000000000 RDI: 000000000000037b [ 74.829073] RBP: 00007fba18ca5760 R08: 00007fba18c248fc R09: 00007fba18c94c30 [ 74.829074] R10: 000000000000002f R11: 0000000000073c30 R12: 00007ffc9ba695e0 [ 74.829075] R13: 00000000000003f3 R14: 00007fba18c21ac8 R15: 00000000000058d6 However, such warning should not apply across multiple hashtabs. The system will not deadlock if one hashtab is used in NMI, while another hashtab is used in non-NMI. Use separate lockdep class for each hashtab, so that we don't get this false alert. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201029071925.3103400-2-songliubraving@fb.com --- kernel/bpf/hashtab.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 1815e97d4c9c..278da031c91a 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -99,6 +99,7 @@ struct bpf_htab { u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ u32 hashrnd; + struct lock_class_key lockdep_key; }; /* each htab element is struct htab_elem + key + value */ @@ -136,12 +137,18 @@ static void htab_init_buckets(struct bpf_htab *htab) { unsigned i; + lockdep_register_key(&htab->lockdep_key); for (i = 0; i < htab->n_buckets; i++) { INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); - if (htab_use_raw_lock(htab)) + if (htab_use_raw_lock(htab)) { raw_spin_lock_init(&htab->buckets[i].raw_lock); - else + lockdep_set_class(&htab->buckets[i].raw_lock, + &htab->lockdep_key); + } else { spin_lock_init(&htab->buckets[i].lock); + lockdep_set_class(&htab->buckets[i].lock, + &htab->lockdep_key); + } } } @@ -1312,6 +1319,7 @@ static void htab_map_free(struct bpf_map *map) free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); + lockdep_unregister_key(&htab->lockdep_key); kfree(htab); } -- cgit v1.2.3 From 20b6cc34ea74b6a84599c1f8a70f3315b56a1883 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 29 Oct 2020 00:19:25 -0700 Subject: bpf: Avoid hashtab deadlock with map_locked If a hashtab is accessed in both non-NMI and NMI context, the system may deadlock on bucket->lock. Fix this issue with percpu counter map_locked. map_locked rejects concurrent access to the same bucket from the same CPU. To reduce memory overhead, map_locked is not added per bucket. Instead, 8 percpu counters are added to each hashtab. buckets are assigned to these counters based on the lower bits of its hash. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201029071925.3103400-3-songliubraving@fb.com --- kernel/bpf/hashtab.c | 114 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 82 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 278da031c91a..da59ba978d17 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -86,6 +86,9 @@ struct bucket { }; }; +#define HASHTAB_MAP_LOCK_COUNT 8 +#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1) + struct bpf_htab { struct bpf_map map; struct bucket *buckets; @@ -100,6 +103,7 @@ struct bpf_htab { u32 elem_size; /* size of each element in bytes */ u32 hashrnd; struct lock_class_key lockdep_key; + int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT]; }; /* each htab element is struct htab_elem + key + value */ @@ -152,26 +156,41 @@ static void htab_init_buckets(struct bpf_htab *htab) } } -static inline unsigned long htab_lock_bucket(const struct bpf_htab *htab, - struct bucket *b) +static inline int htab_lock_bucket(const struct bpf_htab *htab, + struct bucket *b, u32 hash, + unsigned long *pflags) { unsigned long flags; + hash = hash & HASHTAB_MAP_LOCK_MASK; + + migrate_disable(); + if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { + __this_cpu_dec(*(htab->map_locked[hash])); + migrate_enable(); + return -EBUSY; + } + if (htab_use_raw_lock(htab)) raw_spin_lock_irqsave(&b->raw_lock, flags); else spin_lock_irqsave(&b->lock, flags); - return flags; + *pflags = flags; + + return 0; } static inline void htab_unlock_bucket(const struct bpf_htab *htab, - struct bucket *b, + struct bucket *b, u32 hash, unsigned long flags) { + hash = hash & HASHTAB_MAP_LOCK_MASK; if (htab_use_raw_lock(htab)) raw_spin_unlock_irqrestore(&b->raw_lock, flags); else spin_unlock_irqrestore(&b->lock, flags); + __this_cpu_dec(*(htab->map_locked[hash])); + migrate_enable(); } static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); @@ -429,8 +448,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); struct bpf_htab *htab; + int err, i; u64 cost; - int err; htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) @@ -487,6 +506,13 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (!htab->buckets) goto free_charge; + for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) { + htab->map_locked[i] = __alloc_percpu_gfp(sizeof(int), + sizeof(int), GFP_USER); + if (!htab->map_locked[i]) + goto free_map_locked; + } + if (htab->map.map_flags & BPF_F_ZERO_SEED) htab->hashrnd = 0; else @@ -497,7 +523,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (prealloc) { err = prealloc_init(htab); if (err) - goto free_buckets; + goto free_map_locked; if (!percpu && !lru) { /* lru itself can remove the least used element, so @@ -513,7 +539,9 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) free_prealloc: prealloc_destroy(htab); -free_buckets: +free_map_locked: + for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) + free_percpu(htab->map_locked[i]); bpf_map_area_free(htab->buckets); free_charge: bpf_map_charge_finish(&htab->map.memory); @@ -694,12 +722,15 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) struct hlist_nulls_node *n; unsigned long flags; struct bucket *b; + int ret; tgt_l = container_of(node, struct htab_elem, lru_node); b = __select_bucket(htab, tgt_l->hash); head = &b->head; - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags); + if (ret) + return false; hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l == tgt_l) { @@ -707,7 +738,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) break; } - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, tgt_l->hash, flags); return l == tgt_l; } @@ -979,7 +1010,9 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, */ } - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l_old = lookup_elem_raw(head, hash, key, key_size); @@ -1020,7 +1053,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, } ret = 0; err: - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); return ret; } @@ -1058,7 +1091,9 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, return -ENOMEM; memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size); - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l_old = lookup_elem_raw(head, hash, key, key_size); @@ -1077,7 +1112,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, ret = 0; err: - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); if (ret) bpf_lru_push_free(&htab->lru, &l_new->lru_node); @@ -1112,7 +1147,9 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, b = __select_bucket(htab, hash); head = &b->head; - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l_old = lookup_elem_raw(head, hash, key, key_size); @@ -1135,7 +1172,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, } ret = 0; err: - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); return ret; } @@ -1175,7 +1212,9 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, return -ENOMEM; } - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l_old = lookup_elem_raw(head, hash, key, key_size); @@ -1197,7 +1236,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, } ret = 0; err: - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); if (l_new) bpf_lru_push_free(&htab->lru, &l_new->lru_node); return ret; @@ -1225,7 +1264,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) struct htab_elem *l; unsigned long flags; u32 hash, key_size; - int ret = -ENOENT; + int ret; WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); @@ -1235,17 +1274,20 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) b = __select_bucket(htab, hash); head = &b->head; - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l = lookup_elem_raw(head, hash, key, key_size); if (l) { hlist_nulls_del_rcu(&l->hash_node); free_htab_elem(htab, l); - ret = 0; + } else { + ret = -ENOENT; } - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); return ret; } @@ -1257,7 +1299,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) struct htab_elem *l; unsigned long flags; u32 hash, key_size; - int ret = -ENOENT; + int ret; WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); @@ -1267,16 +1309,18 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) b = __select_bucket(htab, hash); head = &b->head; - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l = lookup_elem_raw(head, hash, key, key_size); - if (l) { + if (l) hlist_nulls_del_rcu(&l->hash_node); - ret = 0; - } + else + ret = -ENOENT; - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); if (l) bpf_lru_push_free(&htab->lru, &l->lru_node); return ret; @@ -1302,6 +1346,7 @@ static void delete_all_elements(struct bpf_htab *htab) static void htab_map_free(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + int i; /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback. * bpf_free_used_maps() is called after bpf prog is no longer executing. @@ -1320,6 +1365,8 @@ static void htab_map_free(struct bpf_map *map) free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); lockdep_unregister_key(&htab->lockdep_key); + for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) + free_percpu(htab->map_locked[i]); kfree(htab); } @@ -1423,8 +1470,11 @@ again_nocopy: b = &htab->buckets[batch]; head = &b->head; /* do not grab the lock unless need it (bucket_cnt > 0). */ - if (locked) - flags = htab_lock_bucket(htab, b); + if (locked) { + ret = htab_lock_bucket(htab, b, batch, &flags); + if (ret) + goto next_batch; + } bucket_cnt = 0; hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) @@ -1441,7 +1491,7 @@ again_nocopy: /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it. */ - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, batch, flags); rcu_read_unlock(); bpf_enable_instrumentation(); goto after_loop; @@ -1452,7 +1502,7 @@ again_nocopy: /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it. */ - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, batch, flags); rcu_read_unlock(); bpf_enable_instrumentation(); kvfree(keys); @@ -1505,7 +1555,7 @@ again_nocopy: dst_val += value_size; } - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, batch, flags); locked = false; while (node_to_free) { -- cgit v1.2.3 From 8aaeed81fcb917b5cf4976932c5baefa1471128b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 2 Nov 2020 03:41:00 -0800 Subject: bpf: Fix error path in htab_map_alloc() syzbot was able to trigger a use-after-free in htab_map_alloc() [1] htab_map_alloc() lacks a call to lockdep_unregister_key() in its error path. lockdep_register_key() and lockdep_unregister_key() can not fail, it seems better to use them right after htab allocation and before htab freeing, avoiding more goto/labels in htab_map_alloc() [1] BUG: KASAN: use-after-free in lockdep_register_key+0x356/0x3e0 kernel/locking/lockdep.c:1182 Read of size 8 at addr ffff88805fa67ad8 by task syz-executor.3/2356 CPU: 1 PID: 2356 Comm: syz-executor.3 Not tainted 5.9.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x107/0x163 lib/dump_stack.c:118 print_address_description.constprop.0.cold+0xae/0x4c8 mm/kasan/report.c:385 __kasan_report mm/kasan/report.c:545 [inline] kasan_report.cold+0x1f/0x37 mm/kasan/report.c:562 lockdep_register_key+0x356/0x3e0 kernel/locking/lockdep.c:1182 htab_init_buckets kernel/bpf/hashtab.c:144 [inline] htab_map_alloc+0x6c5/0x14a0 kernel/bpf/hashtab.c:521 find_and_alloc_map kernel/bpf/syscall.c:122 [inline] map_create kernel/bpf/syscall.c:825 [inline] __do_sys_bpf+0xa80/0x5180 kernel/bpf/syscall.c:4381 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x45deb9 Code: 0d b4 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 db b3 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f0eafee1c78 EFLAGS: 00000246 ORIG_RAX: 0000000000000141 RAX: ffffffffffffffda RBX: 0000000000001a00 RCX: 000000000045deb9 RDX: 0000000000000040 RSI: 0000000020000040 RDI: 405a020000000000 RBP: 000000000118bf60 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 000000000118bf2c R13: 00007ffd3cf9eabf R14: 00007f0eafee29c0 R15: 000000000118bf2c Allocated by task 2053: kasan_save_stack+0x1b/0x40 mm/kasan/common.c:48 kasan_set_track mm/kasan/common.c:56 [inline] __kasan_kmalloc.constprop.0+0xc2/0xd0 mm/kasan/common.c:461 kmalloc include/linux/slab.h:554 [inline] kzalloc include/linux/slab.h:666 [inline] htab_map_alloc+0xdf/0x14a0 kernel/bpf/hashtab.c:454 find_and_alloc_map kernel/bpf/syscall.c:122 [inline] map_create kernel/bpf/syscall.c:825 [inline] __do_sys_bpf+0xa80/0x5180 kernel/bpf/syscall.c:4381 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Freed by task 2053: kasan_save_stack+0x1b/0x40 mm/kasan/common.c:48 kasan_set_track+0x1c/0x30 mm/kasan/common.c:56 kasan_set_free_info+0x1b/0x30 mm/kasan/generic.c:355 __kasan_slab_free+0x102/0x140 mm/kasan/common.c:422 slab_free_hook mm/slub.c:1544 [inline] slab_free_freelist_hook+0x5d/0x150 mm/slub.c:1577 slab_free mm/slub.c:3142 [inline] kfree+0xdb/0x360 mm/slub.c:4124 htab_map_alloc+0x3f9/0x14a0 kernel/bpf/hashtab.c:549 find_and_alloc_map kernel/bpf/syscall.c:122 [inline] map_create kernel/bpf/syscall.c:825 [inline] __do_sys_bpf+0xa80/0x5180 kernel/bpf/syscall.c:4381 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The buggy address belongs to the object at ffff88805fa67800 which belongs to the cache kmalloc-1k of size 1024 The buggy address is located 728 bytes inside of 1024-byte region [ffff88805fa67800, ffff88805fa67c00) The buggy address belongs to the page: page:000000003c5582c4 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x5fa60 head:000000003c5582c4 order:3 compound_mapcount:0 compound_pincount:0 flags: 0xfff00000010200(slab|head) raw: 00fff00000010200 ffffea0000bc1200 0000000200000002 ffff888010041140 raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88805fa67980: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88805fa67a00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88805fa67b00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88805fa67b80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Fixes: c50eb518e262 ("bpf: Use separate lockdep class for each hashtab") Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20201102114100.3103180-1-eric.dumazet@gmail.com --- kernel/bpf/hashtab.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index da59ba978d17..23f73d4649c9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -141,7 +141,6 @@ static void htab_init_buckets(struct bpf_htab *htab) { unsigned i; - lockdep_register_key(&htab->lockdep_key); for (i = 0; i < htab->n_buckets; i++) { INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); if (htab_use_raw_lock(htab)) { @@ -455,6 +454,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (!htab) return ERR_PTR(-ENOMEM); + lockdep_register_key(&htab->lockdep_key); + bpf_map_init_from_attr(&htab->map, attr); if (percpu_lru) { @@ -546,6 +547,7 @@ free_map_locked: free_charge: bpf_map_charge_finish(&htab->map.memory); free_htab: + lockdep_unregister_key(&htab->lockdep_key); kfree(htab); return ERR_PTR(err); } @@ -1364,9 +1366,9 @@ static void htab_map_free(struct bpf_map *map) free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); - lockdep_unregister_key(&htab->lockdep_key); for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) free_percpu(htab->map_locked[i]); + lockdep_unregister_key(&htab->lockdep_key); kfree(htab); } -- cgit v1.2.3 From c6bde958a62b8ca5ee8d2c1fe429aec4ad54efad Mon Sep 17 00:00:00 2001 From: Florian Lehner Date: Thu, 29 Oct 2020 21:14:42 +0100 Subject: bpf: Lift hashtab key_size limit Currently key_size of hashtab is limited to MAX_BPF_STACK. As the key of hashtab can also be a value from a per cpu map it can be larger than MAX_BPF_STACK. The use-case for this patch originates to implement allow/disallow lists for files and file paths. The maximum length of file paths is defined by PATH_MAX with 4096 chars including nul. This limit exceeds MAX_BPF_STACK. Changelog: v5: - Fix cast overflow v4: - Utilize BPF skeleton in tests - Rebase v3: - Rebase v2: - Add a test for bpf side Signed-off-by: Florian Lehner Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201029201442.596690-1-dev@der-flo.net --- kernel/bpf/hashtab.c | 16 +++----- .../selftests/bpf/prog_tests/hash_large_key.c | 43 +++++++++++++++++++++ .../selftests/bpf/progs/test_hash_large_key.c | 44 ++++++++++++++++++++++ tools/testing/selftests/bpf/test_maps.c | 3 +- 4 files changed, 94 insertions(+), 12 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/hash_large_key.c create mode 100644 tools/testing/selftests/bpf/progs/test_hash_large_key.c (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 23f73d4649c9..7bf18d92af41 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -415,17 +415,11 @@ static int htab_map_alloc_check(union bpf_attr *attr) attr->value_size == 0) return -EINVAL; - if (attr->key_size > MAX_BPF_STACK) - /* eBPF programs initialize keys on stack, so they cannot be - * larger than max stack size - */ - return -E2BIG; - - if (attr->value_size >= KMALLOC_MAX_SIZE - - MAX_BPF_STACK - sizeof(struct htab_elem)) - /* if value_size is bigger, the user space won't be able to - * access the elements via bpf syscall. This check also makes - * sure that the elem_size doesn't overflow and it's + if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE - + sizeof(struct htab_elem)) + /* if key_size + value_size is bigger, the user space won't be + * able to access the elements via bpf syscall. This check + * also makes sure that the elem_size doesn't overflow and it's * kmalloc-able later in htab_map_update_elem() */ return -E2BIG; diff --git a/tools/testing/selftests/bpf/prog_tests/hash_large_key.c b/tools/testing/selftests/bpf/prog_tests/hash_large_key.c new file mode 100644 index 000000000000..34684c0fc76d --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/hash_large_key.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "test_hash_large_key.skel.h" + +void test_hash_large_key(void) +{ + int err, value = 21, duration = 0, hash_map_fd; + struct test_hash_large_key *skel; + + struct bigelement { + int a; + char b[4096]; + long long c; + } key; + bzero(&key, sizeof(key)); + + skel = test_hash_large_key__open_and_load(); + if (CHECK(!skel, "skel_open_and_load", "skeleton open/load failed\n")) + return; + + hash_map_fd = bpf_map__fd(skel->maps.hash_map); + if (CHECK(hash_map_fd < 0, "bpf_map__fd", "failed\n")) + goto cleanup; + + err = test_hash_large_key__attach(skel); + if (CHECK(err, "attach_raw_tp", "err %d\n", err)) + goto cleanup; + + err = bpf_map_update_elem(hash_map_fd, &key, &value, BPF_ANY); + if (CHECK(err, "bpf_map_update_elem", "errno=%d\n", errno)) + goto cleanup; + + key.c = 1; + err = bpf_map_lookup_elem(hash_map_fd, &key, &value); + if (CHECK(err, "bpf_map_lookup_elem", "errno=%d\n", errno)) + goto cleanup; + + CHECK_FAIL(value != 42); + +cleanup: + test_hash_large_key__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_hash_large_key.c b/tools/testing/selftests/bpf/progs/test_hash_large_key.c new file mode 100644 index 000000000000..473a22794a62 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_hash_large_key.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 2); + __type(key, struct bigelement); + __type(value, __u32); +} hash_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, struct bigelement); +} key_map SEC(".maps"); + +struct bigelement { + int a; + char b[4096]; + long long c; +}; + +SEC("raw_tracepoint/sys_enter") +int bpf_hash_large_key_test(void *ctx) +{ + int zero = 0, err = 1, value = 42; + struct bigelement *key; + + key = bpf_map_lookup_elem(&key_map, &zero); + if (!key) + return 0; + + key->c = 1; + if (bpf_map_update_elem(&hash_map, key, &value, BPF_ANY)) + return 0; + + return 0; +} + diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 0d92ebcb335d..0ad3e6305ff0 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -1223,9 +1223,10 @@ out_map_in_map: static void test_map_large(void) { + struct bigkey { int a; - char b[116]; + char b[4096]; long long c; } key; int fd, i, value; -- cgit v1.2.3 From 9e7a4d9831e836eb03dedab89902277ee94eb7a6 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 6 Nov 2020 10:37:39 +0000 Subject: bpf: Allow LSM programs to use bpf spin locks Usage of spin locks was not allowed for tracing programs due to insufficient preemption checks. The verifier does not currently prevent LSM programs from using spin locks, but the helpers are not exposed via bpf_lsm_func_proto. Based on the discussion in [1], non-sleepable LSM programs should be able to use bpf_spin_{lock, unlock}. Sleepable LSM programs can be preempted which means that allowng spin locks will need more work (disabling preemption and the verifier ensuring that no sleepable helpers are called when a spin lock is held). [1]: https://lore.kernel.org/bpf/20201103153132.2717326-1-kpsingh@chromium.org/T/#md601a053229287659071600d3483523f752cd2fb Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201106103747.2780972-2-kpsingh@chromium.org --- kernel/bpf/bpf_lsm.c | 4 ++++ kernel/bpf/verifier.c | 20 +++++++++++++++----- 2 files changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 78ea8a7bd27f..cd8a617f2109 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -59,6 +59,10 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_spin_lock: + return &bpf_spin_lock_proto; + case BPF_FUNC_spin_unlock: + return &bpf_spin_unlock_proto; default: return tracing_prog_func_proto(func_id, prog); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6200519582a6..f863aa84d0a2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9719,11 +9719,21 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, verbose(env, "trace type programs with run-time allocated hash maps are unsafe. Switch to preallocated hash maps.\n"); } - if ((is_tracing_prog_type(prog_type) || - prog_type == BPF_PROG_TYPE_SOCKET_FILTER) && - map_value_has_spin_lock(map)) { - verbose(env, "tracing progs cannot use bpf_spin_lock yet\n"); - return -EINVAL; + if (map_value_has_spin_lock(map)) { + if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { + verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n"); + return -EINVAL; + } + + if (is_tracing_prog_type(prog_type)) { + verbose(env, "tracing progs cannot use bpf_spin_lock yet\n"); + return -EINVAL; + } + + if (prog->aux->sleepable) { + verbose(env, "sleepable progs cannot use bpf_spin_lock yet\n"); + return -EINVAL; + } } if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && -- cgit v1.2.3 From 4cf1bc1f10452065a29d576fc5693fc4fab5b919 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 6 Nov 2020 10:37:40 +0000 Subject: bpf: Implement task local storage Similar to bpf_local_storage for sockets and inodes add local storage for task_struct. The life-cycle of storage is managed with the life-cycle of the task_struct. i.e. the storage is destroyed along with the owning task with a callback to the bpf_task_storage_free from the task_free LSM hook. The BPF LSM allocates an __rcu pointer to the bpf_local_storage in the security blob which are now stackable and can co-exist with other LSMs. The userspace map operations can be done by using a pid fd as a key passed to the lookup, update and delete operations. Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201106103747.2780972-3-kpsingh@chromium.org --- include/linux/bpf_lsm.h | 23 +++ include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 39 +++++ kernel/bpf/Makefile | 1 + kernel/bpf/bpf_lsm.c | 4 + kernel/bpf/bpf_task_storage.c | 315 +++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 3 +- kernel/bpf/verifier.c | 10 ++ security/bpf/hooks.c | 2 + tools/include/uapi/linux/bpf.h | 39 +++++ 10 files changed, 436 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/bpf_task_storage.c (limited to 'kernel') diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index aaacb6aafc87..73226181b744 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -7,6 +7,7 @@ #ifndef _LINUX_BPF_LSM_H #define _LINUX_BPF_LSM_H +#include #include #include @@ -35,9 +36,21 @@ static inline struct bpf_storage_blob *bpf_inode( return inode->i_security + bpf_lsm_blob_sizes.lbs_inode; } +static inline struct bpf_storage_blob *bpf_task( + const struct task_struct *task) +{ + if (unlikely(!task->security)) + return NULL; + + return task->security + bpf_lsm_blob_sizes.lbs_task; +} + extern const struct bpf_func_proto bpf_inode_storage_get_proto; extern const struct bpf_func_proto bpf_inode_storage_delete_proto; +extern const struct bpf_func_proto bpf_task_storage_get_proto; +extern const struct bpf_func_proto bpf_task_storage_delete_proto; void bpf_inode_storage_free(struct inode *inode); +void bpf_task_storage_free(struct task_struct *task); #else /* !CONFIG_BPF_LSM */ @@ -53,10 +66,20 @@ static inline struct bpf_storage_blob *bpf_inode( return NULL; } +static inline struct bpf_storage_blob *bpf_task( + const struct task_struct *task) +{ + return NULL; +} + static inline void bpf_inode_storage_free(struct inode *inode) { } +static inline void bpf_task_storage_free(struct task_struct *task) +{ +} + #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 2e6f568377f1..99f7fd657d87 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -109,6 +109,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) #endif #ifdef CONFIG_BPF_LSM BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e6ceac3f7d62..f4037b2161a6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -157,6 +157,7 @@ enum bpf_map_type { BPF_MAP_TYPE_STRUCT_OPS, BPF_MAP_TYPE_RINGBUF, BPF_MAP_TYPE_INODE_STORAGE, + BPF_MAP_TYPE_TASK_STORAGE, }; /* Note that tracing related programs such as @@ -3742,6 +3743,42 @@ union bpf_attr { * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. + * + * void *bpf_task_storage_get(struct bpf_map *map, struct task_struct *task, void *value, u64 flags) + * Description + * Get a bpf_local_storage from the *task*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *task* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this + * helper enforces the key must be an task_struct and the map must also + * be a **BPF_MAP_TYPE_TASK_STORAGE**. + * + * Underneath, the value is stored locally at *task* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *task*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * long bpf_task_storage_delete(struct bpf_map *map, struct task_struct *task) + * Description + * Delete a bpf_local_storage from a *task*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3900,6 +3937,8 @@ union bpf_attr { FN(bpf_per_cpu_ptr), \ FN(bpf_this_cpu_ptr), \ FN(redirect_peer), \ + FN(task_storage_get), \ + FN(task_storage_delete), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index bdc8cd1b6767..f0b93ced5a7f 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,6 +6,7 @@ obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_i obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o +obj-${CONFIG_BPF_LSM} += bpf_task_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index cd8a617f2109..e92c51bebb47 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -63,6 +63,10 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_spin_lock_proto; case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; + case BPF_FUNC_task_storage_get: + return &bpf_task_storage_get_proto; + case BPF_FUNC_task_storage_delete: + return &bpf_task_storage_delete_proto; default: return tracing_prog_func_proto(func_id, prog); } diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c new file mode 100644 index 000000000000..39a45fba4fb0 --- /dev/null +++ b/kernel/bpf/bpf_task_storage.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020 Facebook + * Copyright 2020 Google LLC. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_BPF_STORAGE_CACHE(task_cache); + +static struct bpf_local_storage __rcu **task_storage_ptr(void *owner) +{ + struct task_struct *task = owner; + struct bpf_storage_blob *bsb; + + bsb = bpf_task(task); + if (!bsb) + return NULL; + return &bsb->storage; +} + +static struct bpf_local_storage_data * +task_storage_lookup(struct task_struct *task, struct bpf_map *map, + bool cacheit_lockit) +{ + struct bpf_local_storage *task_storage; + struct bpf_local_storage_map *smap; + struct bpf_storage_blob *bsb; + + bsb = bpf_task(task); + if (!bsb) + return NULL; + + task_storage = rcu_dereference(bsb->storage); + if (!task_storage) + return NULL; + + smap = (struct bpf_local_storage_map *)map; + return bpf_local_storage_lookup(task_storage, smap, cacheit_lockit); +} + +void bpf_task_storage_free(struct task_struct *task) +{ + struct bpf_local_storage_elem *selem; + struct bpf_local_storage *local_storage; + bool free_task_storage = false; + struct bpf_storage_blob *bsb; + struct hlist_node *n; + + bsb = bpf_task(task); + if (!bsb) + return; + + rcu_read_lock(); + + local_storage = rcu_dereference(bsb->storage); + if (!local_storage) { + rcu_read_unlock(); + return; + } + + /* Neither the bpf_prog nor the bpf-map's syscall + * could be modifying the local_storage->list now. + * Thus, no elem can be added-to or deleted-from the + * local_storage->list by the bpf_prog or by the bpf-map's syscall. + * + * It is racing with bpf_local_storage_map_free() alone + * when unlinking elem from the local_storage->list and + * the map's bucket->list. + */ + raw_spin_lock_bh(&local_storage->lock); + hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { + /* Always unlink from map before unlinking from + * local_storage. + */ + bpf_selem_unlink_map(selem); + free_task_storage = bpf_selem_unlink_storage_nolock( + local_storage, selem, false); + } + raw_spin_unlock_bh(&local_storage->lock); + rcu_read_unlock(); + + /* free_task_storage should always be true as long as + * local_storage->list was non-empty. + */ + if (free_task_storage) + kfree_rcu(local_storage, rcu); +} + +static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_local_storage_data *sdata; + struct task_struct *task; + unsigned int f_flags; + struct pid *pid; + int fd, err; + + fd = *(int *)key; + pid = pidfd_get_pid(fd, &f_flags); + if (IS_ERR(pid)) + return ERR_CAST(pid); + + /* We should be in an RCU read side critical section, it should be safe + * to call pid_task. + */ + WARN_ON_ONCE(!rcu_read_lock_held()); + task = pid_task(pid, PIDTYPE_PID); + if (!task) { + err = -ENOENT; + goto out; + } + + sdata = task_storage_lookup(task, map, true); + put_pid(pid); + return sdata ? sdata->data : NULL; +out: + put_pid(pid); + return ERR_PTR(err); +} + +static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct bpf_local_storage_data *sdata; + struct task_struct *task; + unsigned int f_flags; + struct pid *pid; + int fd, err; + + fd = *(int *)key; + pid = pidfd_get_pid(fd, &f_flags); + if (IS_ERR(pid)) + return PTR_ERR(pid); + + /* We should be in an RCU read side critical section, it should be safe + * to call pid_task. + */ + WARN_ON_ONCE(!rcu_read_lock_held()); + task = pid_task(pid, PIDTYPE_PID); + if (!task) { + err = -ENOENT; + goto out; + } + + sdata = bpf_local_storage_update( + task, (struct bpf_local_storage_map *)map, value, map_flags); + + err = PTR_ERR_OR_ZERO(sdata); +out: + put_pid(pid); + return err; +} + +static int task_storage_delete(struct task_struct *task, struct bpf_map *map) +{ + struct bpf_local_storage_data *sdata; + + sdata = task_storage_lookup(task, map, false); + if (!sdata) + return -ENOENT; + + bpf_selem_unlink(SELEM(sdata)); + + return 0; +} + +static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) +{ + struct task_struct *task; + unsigned int f_flags; + struct pid *pid; + int fd, err; + + fd = *(int *)key; + pid = pidfd_get_pid(fd, &f_flags); + if (IS_ERR(pid)) + return PTR_ERR(pid); + + /* We should be in an RCU read side critical section, it should be safe + * to call pid_task. + */ + WARN_ON_ONCE(!rcu_read_lock_held()); + task = pid_task(pid, PIDTYPE_PID); + if (!task) { + err = -ENOENT; + goto out; + } + + err = task_storage_delete(task, map); +out: + put_pid(pid); + return err; +} + +BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, + task, void *, value, u64, flags) +{ + struct bpf_local_storage_data *sdata; + + if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) + return (unsigned long)NULL; + + /* explicitly check that the task_storage_ptr is not + * NULL as task_storage_lookup returns NULL in this case and + * bpf_local_storage_update expects the owner to have a + * valid storage pointer. + */ + if (!task_storage_ptr(task)) + return (unsigned long)NULL; + + sdata = task_storage_lookup(task, map, true); + if (sdata) + return (unsigned long)sdata->data; + + /* This helper must only be called from places where the lifetime of the task + * is guaranteed. Either by being refcounted or by being protected + * by an RCU read-side critical section. + */ + if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { + sdata = bpf_local_storage_update( + task, (struct bpf_local_storage_map *)map, value, + BPF_NOEXIST); + return IS_ERR(sdata) ? (unsigned long)NULL : + (unsigned long)sdata->data; + } + + return (unsigned long)NULL; +} + +BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *, + task) +{ + /* This helper must only be called from places where the lifetime of the task + * is guaranteed. Either by being refcounted or by being protected + * by an RCU read-side critical section. + */ + return task_storage_delete(task, map); +} + +static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -ENOTSUPP; +} + +static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr) +{ + struct bpf_local_storage_map *smap; + + smap = bpf_local_storage_map_alloc(attr); + if (IS_ERR(smap)) + return ERR_CAST(smap); + + smap->cache_idx = bpf_local_storage_cache_idx_get(&task_cache); + return &smap->map; +} + +static void task_storage_map_free(struct bpf_map *map) +{ + struct bpf_local_storage_map *smap; + + smap = (struct bpf_local_storage_map *)map; + bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx); + bpf_local_storage_map_free(smap); +} + +static int task_storage_map_btf_id; +const struct bpf_map_ops task_storage_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc_check = bpf_local_storage_map_alloc_check, + .map_alloc = task_storage_map_alloc, + .map_free = task_storage_map_free, + .map_get_next_key = notsupp_get_next_key, + .map_lookup_elem = bpf_pid_task_storage_lookup_elem, + .map_update_elem = bpf_pid_task_storage_update_elem, + .map_delete_elem = bpf_pid_task_storage_delete_elem, + .map_check_btf = bpf_local_storage_map_check_btf, + .map_btf_name = "bpf_local_storage_map", + .map_btf_id = &task_storage_map_btf_id, + .map_owner_storage_ptr = task_storage_ptr, +}; + +BTF_ID_LIST_SINGLE(bpf_task_storage_btf_ids, struct, task_struct) + +const struct bpf_func_proto bpf_task_storage_get_proto = { + .func = bpf_task_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &bpf_task_storage_btf_ids[0], + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; + +const struct bpf_func_proto bpf_task_storage_delete_proto = { + .func = bpf_task_storage_delete, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &bpf_task_storage_btf_ids[0], +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8f50c9c19f1b..f3fe9f53f93c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -773,7 +773,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && map->map_type != BPF_MAP_TYPE_SK_STORAGE && - map->map_type != BPF_MAP_TYPE_INODE_STORAGE) + map->map_type != BPF_MAP_TYPE_INODE_STORAGE && + map->map_type != BPF_MAP_TYPE_TASK_STORAGE) return -ENOTSUPP; if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > map->value_size) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f863aa84d0a2..00960f6a83ec 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4469,6 +4469,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_inode_storage_delete) goto error; break; + case BPF_MAP_TYPE_TASK_STORAGE: + if (func_id != BPF_FUNC_task_storage_get && + func_id != BPF_FUNC_task_storage_delete) + goto error; + break; default: break; } @@ -4547,6 +4552,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_INODE_STORAGE) goto error; break; + case BPF_FUNC_task_storage_get: + case BPF_FUNC_task_storage_delete: + if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) + goto error; + break; default: break; } diff --git a/security/bpf/hooks.c b/security/bpf/hooks.c index 788667d582ae..e5971fa74fd7 100644 --- a/security/bpf/hooks.c +++ b/security/bpf/hooks.c @@ -12,6 +12,7 @@ static struct security_hook_list bpf_lsm_hooks[] __lsm_ro_after_init = { #include #undef LSM_HOOK LSM_HOOK_INIT(inode_free_security, bpf_inode_storage_free), + LSM_HOOK_INIT(task_free, bpf_task_storage_free), }; static int __init bpf_lsm_init(void) @@ -23,6 +24,7 @@ static int __init bpf_lsm_init(void) struct lsm_blob_sizes bpf_lsm_blob_sizes __lsm_ro_after_init = { .lbs_inode = sizeof(struct bpf_storage_blob), + .lbs_task = sizeof(struct bpf_storage_blob), }; DEFINE_LSM(bpf) = { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e6ceac3f7d62..f4037b2161a6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -157,6 +157,7 @@ enum bpf_map_type { BPF_MAP_TYPE_STRUCT_OPS, BPF_MAP_TYPE_RINGBUF, BPF_MAP_TYPE_INODE_STORAGE, + BPF_MAP_TYPE_TASK_STORAGE, }; /* Note that tracing related programs such as @@ -3742,6 +3743,42 @@ union bpf_attr { * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. + * + * void *bpf_task_storage_get(struct bpf_map *map, struct task_struct *task, void *value, u64 flags) + * Description + * Get a bpf_local_storage from the *task*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *task* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this + * helper enforces the key must be an task_struct and the map must also + * be a **BPF_MAP_TYPE_TASK_STORAGE**. + * + * Underneath, the value is stored locally at *task* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *task*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * long bpf_task_storage_delete(struct bpf_map *map, struct task_struct *task) + * Description + * Delete a bpf_local_storage from a *task*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3900,6 +3937,8 @@ union bpf_attr { FN(bpf_per_cpu_ptr), \ FN(bpf_this_cpu_ptr), \ FN(redirect_peer), \ + FN(task_storage_get), \ + FN(task_storage_delete), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 3ca1032ab7ab010eccb107aa515598788f7d93bb Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 6 Nov 2020 10:37:43 +0000 Subject: bpf: Implement get_current_task_btf and RET_PTR_TO_BTF_ID The currently available bpf_get_current_task returns an unsigned integer which can be used along with BPF_CORE_READ to read data from the task_struct but still cannot be used as an input argument to a helper that accepts an ARG_PTR_TO_BTF_ID of type task_struct. In order to implement this helper a new return type, RET_PTR_TO_BTF_ID, is added. This is similar to RET_PTR_TO_BTF_ID_OR_NULL but does not require checking the nullness of returned pointer. Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201106103747.2780972-6-kpsingh@chromium.org --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 9 +++++++++ kernel/bpf/verifier.c | 7 +++++-- kernel/trace/bpf_trace.c | 16 ++++++++++++++++ tools/include/uapi/linux/bpf.h | 9 +++++++++ 5 files changed, 40 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2fffd30e13ac..73d5381a5d5c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -310,6 +310,7 @@ enum bpf_return_type { RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ + RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f4037b2161a6..9879d6793e90 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3779,6 +3779,14 @@ union bpf_attr { * 0 on success. * * **-ENOENT** if the bpf_local_storage cannot be found. + * + * struct task_struct *bpf_get_current_task_btf(void) + * Description + * Return a BTF pointer to the "current" task. + * This pointer can also be used in helpers that accept an + * *ARG_PTR_TO_BTF_ID* of type *task_struct*. + * Return + * Pointer to the current task. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3939,6 +3947,7 @@ union bpf_attr { FN(redirect_peer), \ FN(task_storage_get), \ FN(task_storage_delete), \ + FN(get_current_task_btf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 00960f6a83ec..10da26e55130 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5186,11 +5186,14 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } - } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) { + } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL || + fn->ret_type == RET_PTR_TO_BTF_ID) { int ret_btf_id; mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL; + regs[BPF_REG_0].type = fn->ret_type == RET_PTR_TO_BTF_ID ? + PTR_TO_BTF_ID : + PTR_TO_BTF_ID_OR_NULL; ret_btf_id = *fn->ret_btf_id; if (ret_btf_id == 0) { verbose(env, "invalid return type %d of func %s#%d\n", diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4517c8b66518..e4515b0f62a8 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1022,6 +1022,20 @@ const struct bpf_func_proto bpf_get_current_task_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_0(bpf_get_current_task_btf) +{ + return (unsigned long) current; +} + +BTF_ID_LIST_SINGLE(bpf_get_current_btf_ids, struct, task_struct) + +static const struct bpf_func_proto bpf_get_current_task_btf_proto = { + .func = bpf_get_current_task_btf, + .gpl_only = true, + .ret_type = RET_PTR_TO_BTF_ID, + .ret_btf_id = &bpf_get_current_btf_ids[0], +}; + BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) { struct bpf_array *array = container_of(map, struct bpf_array, map); @@ -1265,6 +1279,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_get_current_task: return &bpf_get_current_task_proto; + case BPF_FUNC_get_current_task_btf: + return &bpf_get_current_task_btf_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_current_comm: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f4037b2161a6..9879d6793e90 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3779,6 +3779,14 @@ union bpf_attr { * 0 on success. * * **-ENOENT** if the bpf_local_storage cannot be found. + * + * struct task_struct *bpf_get_current_task_btf(void) + * Description + * Return a BTF pointer to the "current" task. + * This pointer can also be used in helpers that accept an + * *ARG_PTR_TO_BTF_ID* of type *task_struct*. + * Return + * Pointer to the current task. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3939,6 +3947,7 @@ union bpf_attr { FN(redirect_peer), \ FN(task_storage_get), \ FN(task_storage_delete), \ + FN(get_current_task_btf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 666475ccbf1dc99c1e61e47975d5fbf86d6236aa Mon Sep 17 00:00:00 2001 From: Wang Qing Date: Sat, 7 Nov 2020 16:10:50 +0800 Subject: bpf, btf: Remove the duplicate btf_ids.h include Remove duplicate btf_ids.h header which is included twice. Signed-off-by: Wang Qing Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/1604736650-11197-1-git-send-email-wangqing@vivo.com --- kernel/bpf/btf.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ed7d02e8bc93..6324de8c59f7 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -22,7 +22,6 @@ #include #include #include -#include #include /* BTF (BPF Type Format) is the meta data format which describes -- cgit v1.2.3 From 951bb64621b8139c0cd99dcadc13e6510c08aa73 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 9 Nov 2020 17:19:28 -0800 Subject: bpf: Add in-kernel split BTF support Adjust in-kernel BTF implementation to support a split BTF mode of operation. Changes are mostly mirroring libbpf split BTF changes, with the exception of start_id being 0 for in-kernel implementation due to simpler read-only mode. Otherwise, for split BTF logic, most of the logic of jumping to base BTF, where necessary, is encapsulated in few helper functions. Type numbering and string offset in a split BTF are logically continuing where base BTF ends, so most of the high-level logic is kept without changes. Type verification and size resolution is only doing an added resolution of new split BTF types and relies on already cached size and type resolution results in the base BTF. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201110011932.3201430-2-andrii@kernel.org --- kernel/bpf/btf.c | 171 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 119 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6324de8c59f7..727c1c27053f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -203,12 +203,17 @@ struct btf { const char *strings; void *nohdr_data; struct btf_header hdr; - u32 nr_types; + u32 nr_types; /* includes VOID for base BTF */ u32 types_size; u32 data_size; refcount_t refcnt; u32 id; struct rcu_head rcu; + + /* split BTF support */ + struct btf *base_btf; + u32 start_id; /* first type ID in this BTF (0 for base BTF) */ + u32 start_str_off; /* first string offset (0 for base BTF) */ }; enum verifier_phase { @@ -449,14 +454,27 @@ static bool btf_type_is_datasec(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; } +static u32 btf_nr_types_total(const struct btf *btf) +{ + u32 total = 0; + + while (btf) { + total += btf->nr_types; + btf = btf->base_btf; + } + + return total; +} + s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind) { const struct btf_type *t; const char *tname; - u32 i; + u32 i, total; - for (i = 1; i <= btf->nr_types; i++) { - t = btf->types[i]; + total = btf_nr_types_total(btf); + for (i = 1; i < total; i++) { + t = btf_type_by_id(btf, i); if (BTF_INFO_KIND(t->info) != kind) continue; @@ -599,8 +617,14 @@ static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) static bool btf_name_offset_valid(const struct btf *btf, u32 offset) { - return BTF_STR_OFFSET_VALID(offset) && - offset < btf->hdr.str_len; + if (!BTF_STR_OFFSET_VALID(offset)) + return false; + + while (offset < btf->start_str_off) + btf = btf->base_btf; + + offset -= btf->start_str_off; + return offset < btf->hdr.str_len; } static bool __btf_name_char_ok(char c, bool first, bool dot_ok) @@ -614,10 +638,22 @@ static bool __btf_name_char_ok(char c, bool first, bool dot_ok) return true; } +static const char *btf_str_by_offset(const struct btf *btf, u32 offset) +{ + while (offset < btf->start_str_off) + btf = btf->base_btf; + + offset -= btf->start_str_off; + if (offset < btf->hdr.str_len) + return &btf->strings[offset]; + + return NULL; +} + static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok) { /* offset must be valid */ - const char *src = &btf->strings[offset]; + const char *src = btf_str_by_offset(btf, offset); const char *src_limit; if (!__btf_name_char_ok(*src, true, dot_ok)) @@ -650,27 +686,28 @@ static bool btf_name_valid_section(const struct btf *btf, u32 offset) static const char *__btf_name_by_offset(const struct btf *btf, u32 offset) { + const char *name; + if (!offset) return "(anon)"; - else if (offset < btf->hdr.str_len) - return &btf->strings[offset]; - else - return "(invalid-name-offset)"; + + name = btf_str_by_offset(btf, offset); + return name ?: "(invalid-name-offset)"; } const char *btf_name_by_offset(const struct btf *btf, u32 offset) { - if (offset < btf->hdr.str_len) - return &btf->strings[offset]; - - return NULL; + return btf_str_by_offset(btf, offset); } const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) { - if (type_id > btf->nr_types) - return NULL; + while (type_id < btf->start_id) + btf = btf->base_btf; + type_id -= btf->start_id; + if (type_id >= btf->nr_types) + return NULL; return btf->types[type_id]; } @@ -1390,17 +1427,13 @@ static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t) { struct btf *btf = env->btf; - /* < 2 because +1 for btf_void which is always in btf->types[0]. - * btf_void is not accounted in btf->nr_types because btf_void - * does not come from the BTF file. - */ - if (btf->types_size - btf->nr_types < 2) { + if (btf->types_size == btf->nr_types) { /* Expand 'types' array */ struct btf_type **new_types; u32 expand_by, new_size; - if (btf->types_size == BTF_MAX_TYPE) { + if (btf->start_id + btf->types_size == BTF_MAX_TYPE) { btf_verifier_log(env, "Exceeded max num of types"); return -E2BIG; } @@ -1414,18 +1447,23 @@ static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t) if (!new_types) return -ENOMEM; - if (btf->nr_types == 0) - new_types[0] = &btf_void; - else + if (btf->nr_types == 0) { + if (!btf->base_btf) { + /* lazily init VOID type */ + new_types[0] = &btf_void; + btf->nr_types++; + } + } else { memcpy(new_types, btf->types, - sizeof(*btf->types) * (btf->nr_types + 1)); + sizeof(*btf->types) * btf->nr_types); + } kvfree(btf->types); btf->types = new_types; btf->types_size = new_size; } - btf->types[++(btf->nr_types)] = t; + btf->types[btf->nr_types++] = t; return 0; } @@ -1498,18 +1536,17 @@ static int env_resolve_init(struct btf_verifier_env *env) u32 *resolved_ids = NULL; u8 *visit_states = NULL; - /* +1 for btf_void */ - resolved_sizes = kvcalloc(nr_types + 1, sizeof(*resolved_sizes), + resolved_sizes = kvcalloc(nr_types, sizeof(*resolved_sizes), GFP_KERNEL | __GFP_NOWARN); if (!resolved_sizes) goto nomem; - resolved_ids = kvcalloc(nr_types + 1, sizeof(*resolved_ids), + resolved_ids = kvcalloc(nr_types, sizeof(*resolved_ids), GFP_KERNEL | __GFP_NOWARN); if (!resolved_ids) goto nomem; - visit_states = kvcalloc(nr_types + 1, sizeof(*visit_states), + visit_states = kvcalloc(nr_types, sizeof(*visit_states), GFP_KERNEL | __GFP_NOWARN); if (!visit_states) goto nomem; @@ -1561,21 +1598,27 @@ static bool env_type_is_resolve_sink(const struct btf_verifier_env *env, static bool env_type_is_resolved(const struct btf_verifier_env *env, u32 type_id) { - return env->visit_states[type_id] == RESOLVED; + /* base BTF types should be resolved by now */ + if (type_id < env->btf->start_id) + return true; + + return env->visit_states[type_id - env->btf->start_id] == RESOLVED; } static int env_stack_push(struct btf_verifier_env *env, const struct btf_type *t, u32 type_id) { + const struct btf *btf = env->btf; struct resolve_vertex *v; if (env->top_stack == MAX_RESOLVE_DEPTH) return -E2BIG; - if (env->visit_states[type_id] != NOT_VISITED) + if (type_id < btf->start_id + || env->visit_states[type_id - btf->start_id] != NOT_VISITED) return -EEXIST; - env->visit_states[type_id] = VISITED; + env->visit_states[type_id - btf->start_id] = VISITED; v = &env->stack[env->top_stack++]; v->t = t; @@ -1605,6 +1648,7 @@ static void env_stack_pop_resolved(struct btf_verifier_env *env, u32 type_id = env->stack[--(env->top_stack)].type_id; struct btf *btf = env->btf; + type_id -= btf->start_id; /* adjust to local type id */ btf->resolved_sizes[type_id] = resolved_size; btf->resolved_ids[type_id] = resolved_type_id; env->visit_states[type_id] = RESOLVED; @@ -1709,14 +1753,30 @@ btf_resolve_size(const struct btf *btf, const struct btf_type *type, return __btf_resolve_size(btf, type, type_size, NULL, NULL, NULL, NULL); } +static u32 btf_resolved_type_id(const struct btf *btf, u32 type_id) +{ + while (type_id < btf->start_id) + btf = btf->base_btf; + + return btf->resolved_ids[type_id - btf->start_id]; +} + /* The input param "type_id" must point to a needs_resolve type */ static const struct btf_type *btf_type_id_resolve(const struct btf *btf, u32 *type_id) { - *type_id = btf->resolved_ids[*type_id]; + *type_id = btf_resolved_type_id(btf, *type_id); return btf_type_by_id(btf, *type_id); } +static u32 btf_resolved_type_size(const struct btf *btf, u32 type_id) +{ + while (type_id < btf->start_id) + btf = btf->base_btf; + + return btf->resolved_sizes[type_id - btf->start_id]; +} + const struct btf_type *btf_type_id_size(const struct btf *btf, u32 *type_id, u32 *ret_size) { @@ -1731,7 +1791,7 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, if (btf_type_has_size(size_type)) { size = size_type->size; } else if (btf_type_is_array(size_type)) { - size = btf->resolved_sizes[size_type_id]; + size = btf_resolved_type_size(btf, size_type_id); } else if (btf_type_is_ptr(size_type)) { size = sizeof(void *); } else { @@ -1739,14 +1799,14 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, !btf_type_is_var(size_type))) return NULL; - size_type_id = btf->resolved_ids[size_type_id]; + size_type_id = btf_resolved_type_id(btf, size_type_id); size_type = btf_type_by_id(btf, size_type_id); if (btf_type_nosize_or_null(size_type)) return NULL; else if (btf_type_has_size(size_type)) size = size_type->size; else if (btf_type_is_array(size_type)) - size = btf->resolved_sizes[size_type_id]; + size = btf_resolved_type_size(btf, size_type_id); else if (btf_type_is_ptr(size_type)) size = sizeof(void *); else @@ -3798,7 +3858,7 @@ static int btf_check_all_metas(struct btf_verifier_env *env) cur = btf->nohdr_data + hdr->type_off; end = cur + hdr->type_len; - env->log_type_id = 1; + env->log_type_id = btf->base_btf ? btf->start_id : 1; while (cur < end) { struct btf_type *t = cur; s32 meta_size; @@ -3825,8 +3885,8 @@ static bool btf_resolve_valid(struct btf_verifier_env *env, return false; if (btf_type_is_struct(t) || btf_type_is_datasec(t)) - return !btf->resolved_ids[type_id] && - !btf->resolved_sizes[type_id]; + return !btf_resolved_type_id(btf, type_id) && + !btf_resolved_type_size(btf, type_id); if (btf_type_is_modifier(t) || btf_type_is_ptr(t) || btf_type_is_var(t)) { @@ -3846,7 +3906,7 @@ static bool btf_resolve_valid(struct btf_verifier_env *env, elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); return elem_type && !btf_type_is_modifier(elem_type) && (array->nelems * elem_size == - btf->resolved_sizes[type_id]); + btf_resolved_type_size(btf, type_id)); } return false; @@ -3888,7 +3948,8 @@ static int btf_resolve(struct btf_verifier_env *env, static int btf_check_all_types(struct btf_verifier_env *env) { struct btf *btf = env->btf; - u32 type_id; + const struct btf_type *t; + u32 type_id, i; int err; err = env_resolve_init(env); @@ -3896,8 +3957,9 @@ static int btf_check_all_types(struct btf_verifier_env *env) return err; env->phase++; - for (type_id = 1; type_id <= btf->nr_types; type_id++) { - const struct btf_type *t = btf_type_by_id(btf, type_id); + for (i = btf->base_btf ? 0 : 1; i < btf->nr_types; i++) { + type_id = btf->start_id + i; + t = btf_type_by_id(btf, type_id); env->log_type_id = type_id; if (btf_type_needs_resolve(t) && @@ -3934,7 +3996,7 @@ static int btf_parse_type_sec(struct btf_verifier_env *env) return -EINVAL; } - if (!hdr->type_len) { + if (!env->btf->base_btf && !hdr->type_len) { btf_verifier_log(env, "No type found"); return -EINVAL; } @@ -3961,13 +4023,18 @@ static int btf_parse_str_sec(struct btf_verifier_env *env) return -EINVAL; } - if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET || - start[0] || end[-1]) { + btf->strings = start; + + if (btf->base_btf && !hdr->str_len) + return 0; + if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET || end[-1]) { + btf_verifier_log(env, "Invalid string section"); + return -EINVAL; + } + if (!btf->base_btf && start[0]) { btf_verifier_log(env, "Invalid string section"); return -EINVAL; } - - btf->strings = start; return 0; } @@ -4908,7 +4975,7 @@ static int __get_type_size(struct btf *btf, u32 btf_id, while (t && btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (!t) { - *bad_type = btf->types[0]; + *bad_type = btf_type_by_id(btf, 0); return -EINVAL; } if (btf_type_is_ptr(t)) -- cgit v1.2.3 From 5329722057d41aebc31e391907a501feaa42f7d9 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 9 Nov 2020 17:19:29 -0800 Subject: bpf: Assign ID to vmlinux BTF and return extra info for BTF in GET_OBJ_INFO Allocate ID for vmlinux BTF. This makes it visible when iterating over all BTF objects in the system. To allow distinguishing vmlinux BTF (and later kernel module BTF) from user-provided BTFs, expose extra kernel_btf flag, as well as BTF name ("vmlinux" for vmlinux BTF, will equal to module's name for module BTF). We might want to later allow specifying BTF name for user-provided BTFs as well, if that makes sense. But currently this is reserved only for in-kernel BTFs. Having in-kernel BTFs exposed IDs will allow to extend BPF APIs that require in-kernel BTF type with ability to specify BTF types from kernel modules, not just vmlinux BTF. This will be implemented in a follow up patch set for fentry/fexit/fmod_ret/lsm/etc. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201110011932.3201430-3-andrii@kernel.org --- include/uapi/linux/bpf.h | 3 +++ kernel/bpf/btf.c | 43 +++++++++++++++++++++++++++++++++++++++--- tools/include/uapi/linux/bpf.h | 3 +++ 3 files changed, 46 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9879d6793e90..162999b12790 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4466,6 +4466,9 @@ struct bpf_btf_info { __aligned_u64 btf; __u32 btf_size; __u32 id; + __aligned_u64 name; + __u32 name_len; + __u32 kernel_btf; } __attribute__((aligned(8))); struct bpf_link_info { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 727c1c27053f..856585db7aa7 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -214,6 +214,8 @@ struct btf { struct btf *base_btf; u32 start_id; /* first type ID in this BTF (0 for base BTF) */ u32 start_str_off; /* first string offset (0 for base BTF) */ + char name[MODULE_NAME_LEN]; + bool kernel_btf; }; enum verifier_phase { @@ -4429,6 +4431,8 @@ struct btf *btf_parse_vmlinux(void) btf->data = __start_BTF; btf->data_size = __stop_BTF - __start_BTF; + btf->kernel_btf = true; + snprintf(btf->name, sizeof(btf->name), "vmlinux"); err = btf_parse_hdr(env); if (err) @@ -4454,8 +4458,13 @@ struct btf *btf_parse_vmlinux(void) bpf_struct_ops_init(btf, log); - btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); + + err = btf_alloc_id(btf); + if (err) + goto errout; + + btf_verifier_env_free(env); return btf; errout: @@ -5553,7 +5562,9 @@ int btf_get_info_by_fd(const struct btf *btf, struct bpf_btf_info info; u32 info_copy, btf_copy; void __user *ubtf; - u32 uinfo_len; + char __user *uname; + u32 uinfo_len, uname_len, name_len; + int ret = 0; uinfo = u64_to_user_ptr(attr->info.info); uinfo_len = attr->info.info_len; @@ -5570,11 +5581,37 @@ int btf_get_info_by_fd(const struct btf *btf, return -EFAULT; info.btf_size = btf->data_size; + info.kernel_btf = btf->kernel_btf; + + uname = u64_to_user_ptr(info.name); + uname_len = info.name_len; + if (!uname ^ !uname_len) + return -EINVAL; + + name_len = strlen(btf->name); + info.name_len = name_len; + + if (uname) { + if (uname_len >= name_len + 1) { + if (copy_to_user(uname, btf->name, name_len + 1)) + return -EFAULT; + } else { + char zero = '\0'; + + if (copy_to_user(uname, btf->name, uname_len - 1)) + return -EFAULT; + if (put_user(zero, uname + uname_len - 1)) + return -EFAULT; + /* let user-space know about too short buffer */ + ret = -ENOSPC; + } + } + if (copy_to_user(uinfo, &info, info_copy) || put_user(info_copy, &uattr->info.info_len)) return -EFAULT; - return 0; + return ret; } int btf_get_fd_by_id(u32 id) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 9879d6793e90..162999b12790 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4466,6 +4466,9 @@ struct bpf_btf_info { __aligned_u64 btf; __u32 btf_size; __u32 id; + __aligned_u64 name; + __u32 name_len; + __u32 kernel_btf; } __attribute__((aligned(8))); struct bpf_link_info { -- cgit v1.2.3 From 36e68442d1afd4f720704ee1ea8486331507e834 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 9 Nov 2020 17:19:31 -0800 Subject: bpf: Load and verify kernel module BTFs Add kernel module listener that will load/validate and unload module BTF. Module BTFs gets ID generated for them, which makes it possible to iterate them with existing BTF iteration API. They are given their respective module's names, which will get reported through GET_OBJ_INFO API. They are also marked as in-kernel BTFs for tooling to distinguish them from user-provided BTFs. Also, similarly to vmlinux BTF, kernel module BTFs are exposed through sysfs as /sys/kernel/btf/. This is convenient for user-space tools to inspect module BTF contents and dump their types with existing tools: [vmuser@archvm bpf]$ ls -la /sys/kernel/btf total 0 drwxr-xr-x 2 root root 0 Nov 4 19:46 . drwxr-xr-x 13 root root 0 Nov 4 19:46 .. ... -r--r--r-- 1 root root 888 Nov 4 19:46 irqbypass -r--r--r-- 1 root root 100225 Nov 4 19:46 kvm -r--r--r-- 1 root root 35401 Nov 4 19:46 kvm_intel -r--r--r-- 1 root root 120 Nov 4 19:46 pcspkr -r--r--r-- 1 root root 399 Nov 4 19:46 serio_raw -r--r--r-- 1 root root 4094095 Nov 4 19:46 vmlinux Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/bpf/20201110011932.3201430-5-andrii@kernel.org --- Documentation/ABI/testing/sysfs-kernel-btf | 8 ++ include/linux/bpf.h | 2 + include/linux/module.h | 4 + kernel/bpf/btf.c | 194 +++++++++++++++++++++++++++++ kernel/bpf/sysfs_btf.c | 2 +- kernel/module.c | 32 +++++ 6 files changed, 241 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/ABI/testing/sysfs-kernel-btf b/Documentation/ABI/testing/sysfs-kernel-btf index 2c9744b2cd59..fe96efdc9b6c 100644 --- a/Documentation/ABI/testing/sysfs-kernel-btf +++ b/Documentation/ABI/testing/sysfs-kernel-btf @@ -15,3 +15,11 @@ Description: information with description of all internal kernel types. See Documentation/bpf/btf.rst for detailed description of format itself. + +What: /sys/kernel/btf/ +Date: Nov 2020 +KernelVersion: 5.11 +Contact: bpf@vger.kernel.org +Description: + Read-only binary attribute exposing kernel module's BTF type + information as an add-on to the kernel's BTF (/sys/kernel/btf/vmlinux). diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 73d5381a5d5c..581b2a2e78eb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -36,9 +36,11 @@ struct seq_operations; struct bpf_iter_aux_info; struct bpf_local_storage; struct bpf_local_storage_map; +struct kobject; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; +extern struct kobject *btf_kobj; typedef int (*bpf_iter_init_seq_priv_t)(void *private_data, struct bpf_iter_aux_info *aux); diff --git a/include/linux/module.h b/include/linux/module.h index a29187f7c360..20fce258ffba 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -475,6 +475,10 @@ struct module { unsigned int num_bpf_raw_events; struct bpf_raw_event_map *bpf_raw_events; #endif +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + unsigned int btf_data_size; + void *btf_data; +#endif #ifdef CONFIG_JUMP_LABEL struct jump_entry *jump_entries; unsigned int num_jump_entries; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 856585db7aa7..0f1fd2669d69 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include /* BTF (BPF Type Format) is the meta data format which describes @@ -4476,6 +4478,75 @@ errout: return ERR_PTR(err); } +static struct btf *btf_parse_module(const char *module_name, const void *data, unsigned int data_size) +{ + struct btf_verifier_env *env = NULL; + struct bpf_verifier_log *log; + struct btf *btf = NULL, *base_btf; + int err; + + base_btf = bpf_get_btf_vmlinux(); + if (IS_ERR(base_btf)) + return base_btf; + if (!base_btf) + return ERR_PTR(-EINVAL); + + env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); + if (!env) + return ERR_PTR(-ENOMEM); + + log = &env->log; + log->level = BPF_LOG_KERNEL; + + btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN); + if (!btf) { + err = -ENOMEM; + goto errout; + } + env->btf = btf; + + btf->base_btf = base_btf; + btf->start_id = base_btf->nr_types; + btf->start_str_off = base_btf->hdr.str_len; + btf->kernel_btf = true; + snprintf(btf->name, sizeof(btf->name), "%s", module_name); + + btf->data = kvmalloc(data_size, GFP_KERNEL | __GFP_NOWARN); + if (!btf->data) { + err = -ENOMEM; + goto errout; + } + memcpy(btf->data, data, data_size); + btf->data_size = data_size; + + err = btf_parse_hdr(env); + if (err) + goto errout; + + btf->nohdr_data = btf->data + btf->hdr.hdr_len; + + err = btf_parse_str_sec(env); + if (err) + goto errout; + + err = btf_check_all_metas(env); + if (err) + goto errout; + + btf_verifier_env_free(env); + refcount_set(&btf->refcnt, 1); + return btf; + +errout: + btf_verifier_env_free(env); + if (btf) { + kvfree(btf->data); + kvfree(btf->types); + kfree(btf); + } + return ERR_PTR(err); +} + struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) { struct bpf_prog *tgt_prog = prog->aux->dst_prog; @@ -5651,3 +5722,126 @@ bool btf_id_set_contains(const struct btf_id_set *set, u32 id) { return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; } + +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES +struct btf_module { + struct list_head list; + struct module *module; + struct btf *btf; + struct bin_attribute *sysfs_attr; +}; + +static LIST_HEAD(btf_modules); +static DEFINE_MUTEX(btf_module_mutex); + +static ssize_t +btf_module_read(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t len) +{ + const struct btf *btf = bin_attr->private; + + memcpy(buf, btf->data + off, len); + return len; +} + +static int btf_module_notify(struct notifier_block *nb, unsigned long op, + void *module) +{ + struct btf_module *btf_mod, *tmp; + struct module *mod = module; + struct btf *btf; + int err = 0; + + if (mod->btf_data_size == 0 || + (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING)) + goto out; + + switch (op) { + case MODULE_STATE_COMING: + btf_mod = kzalloc(sizeof(*btf_mod), GFP_KERNEL); + if (!btf_mod) { + err = -ENOMEM; + goto out; + } + btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size); + if (IS_ERR(btf)) { + pr_warn("failed to validate module [%s] BTF: %ld\n", + mod->name, PTR_ERR(btf)); + kfree(btf_mod); + err = PTR_ERR(btf); + goto out; + } + err = btf_alloc_id(btf); + if (err) { + btf_free(btf); + kfree(btf_mod); + goto out; + } + + mutex_lock(&btf_module_mutex); + btf_mod->module = module; + btf_mod->btf = btf; + list_add(&btf_mod->list, &btf_modules); + mutex_unlock(&btf_module_mutex); + + if (IS_ENABLED(CONFIG_SYSFS)) { + struct bin_attribute *attr; + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + if (!attr) + goto out; + + sysfs_bin_attr_init(attr); + attr->attr.name = btf->name; + attr->attr.mode = 0444; + attr->size = btf->data_size; + attr->private = btf; + attr->read = btf_module_read; + + err = sysfs_create_bin_file(btf_kobj, attr); + if (err) { + pr_warn("failed to register module [%s] BTF in sysfs: %d\n", + mod->name, err); + kfree(attr); + err = 0; + goto out; + } + + btf_mod->sysfs_attr = attr; + } + + break; + case MODULE_STATE_GOING: + mutex_lock(&btf_module_mutex); + list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { + if (btf_mod->module != module) + continue; + + list_del(&btf_mod->list); + if (btf_mod->sysfs_attr) + sysfs_remove_bin_file(btf_kobj, btf_mod->sysfs_attr); + btf_put(btf_mod->btf); + kfree(btf_mod->sysfs_attr); + kfree(btf_mod); + break; + } + mutex_unlock(&btf_module_mutex); + break; + } +out: + return notifier_from_errno(err); +} + +static struct notifier_block btf_module_nb = { + .notifier_call = btf_module_notify, +}; + +static int __init btf_module_init(void) +{ + register_module_notifier(&btf_module_nb); + return 0; +} + +fs_initcall(btf_module_init); +#endif /* CONFIG_DEBUG_INFO_BTF_MODULES */ diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 11b3380887fa..ef6911aee3bb 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -26,7 +26,7 @@ static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = { .read = btf_vmlinux_read, }; -static struct kobject *btf_kobj; +struct kobject *btf_kobj; static int __init btf_vmlinux_init(void) { diff --git a/kernel/module.c b/kernel/module.c index a4fa44a652a7..f2996b02ab2e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -380,6 +380,35 @@ static void *section_objs(const struct load_info *info, return (void *)info->sechdrs[sec].sh_addr; } +/* Find a module section: 0 means not found. Ignores SHF_ALLOC flag. */ +static unsigned int find_any_sec(const struct load_info *info, const char *name) +{ + unsigned int i; + + for (i = 1; i < info->hdr->e_shnum; i++) { + Elf_Shdr *shdr = &info->sechdrs[i]; + if (strcmp(info->secstrings + shdr->sh_name, name) == 0) + return i; + } + return 0; +} + +/* + * Find a module section, or NULL. Fill in number of "objects" in section. + * Ignores SHF_ALLOC flag. + */ +static __maybe_unused void *any_section_objs(const struct load_info *info, + const char *name, + size_t object_size, + unsigned int *num) +{ + unsigned int sec = find_any_sec(info, name); + + /* Section 0 has sh_addr 0 and sh_size 0. */ + *num = info->sechdrs[sec].sh_size / object_size; + return (void *)info->sechdrs[sec].sh_addr; +} + /* Provided by the linker */ extern const struct kernel_symbol __start___ksymtab[]; extern const struct kernel_symbol __stop___ksymtab[]; @@ -3250,6 +3279,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->bpf_raw_events), &mod->num_bpf_raw_events); #endif +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + mod->btf_data = any_section_objs(info, ".BTF", 1, &mod->btf_data_size); +#endif #ifdef CONFIG_JUMP_LABEL mod->jump_entries = section_objs(info, "__jump_table", sizeof(*mod->jump_entries), -- cgit v1.2.3 From 7112d127984bd7b0c8ded7973b358829f16735f5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 10 Nov 2020 20:06:45 -0800 Subject: bpf: Compile out btf_parse_module() if module BTF is not enabled Make sure btf_parse_module() is compiled out if module BTFs are not enabled. Fixes: 36e68442d1af ("bpf: Load and verify kernel module BTFs") Reported-by: Stephen Rothwell Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201111040645.903494-1-andrii@kernel.org --- kernel/bpf/btf.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 0f1fd2669d69..6b2d508b33d4 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4478,6 +4478,8 @@ errout: return ERR_PTR(err); } +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + static struct btf *btf_parse_module(const char *module_name, const void *data, unsigned int data_size) { struct btf_verifier_env *env = NULL; @@ -4547,6 +4549,8 @@ errout: return ERR_PTR(err); } +#endif /* CONFIG_DEBUG_INFO_BTF_MODULES */ + struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) { struct bpf_prog *tgt_prog = prog->aux->dst_prog; -- cgit v1.2.3 From 09a3dac7b579e57e7ef2d875b9216c845ae8a0e5 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 11 Nov 2020 16:19:19 -0800 Subject: bpf: Fix NULL dereference in bpf_task_storage In bpf_pid_task_storage_update_elem(), it missed to test the !task_storage_ptr(task) which then could trigger a NULL pointer exception in bpf_local_storage_update(). Fixes: 4cf1bc1f1045 ("bpf: Implement task local storage") Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Tested-by: Roman Gushchin Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20201112001919.2028357-1-kafai@fb.com --- kernel/bpf/bpf_task_storage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 39a45fba4fb0..4ef1959a78f2 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -150,7 +150,7 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, */ WARN_ON_ONCE(!rcu_read_lock_held()); task = pid_task(pid, PIDTYPE_PID); - if (!task) { + if (!task || !task_storage_ptr(task)) { err = -ENOENT; goto out; } -- cgit v1.2.3 From 6d94e741a8ff818e5518da8257f5ca0aaed1f269 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 10 Nov 2020 19:12:11 -0800 Subject: bpf: Support for pointers beyond pkt_end. This patch adds the verifier support to recognize inlined branch conditions. The LLVM knows that the branch evaluates to the same value, but the verifier couldn't track it. Hence causing valid programs to be rejected. The potential LLVM workaround: https://reviews.llvm.org/D87428 can have undesired side effects, since LLVM doesn't know that skb->data/data_end are being compared. LLVM has to introduce extra boolean variable and use inline_asm trick to force easier for the verifier assembly. Instead teach the verifier to recognize that r1 = skb->data; r1 += 10; r2 = skb->data_end; if (r1 > r2) { here r1 points beyond packet_end and subsequent if (r1 > r2) // always evaluates to "true". } Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Tested-by: Jiri Olsa Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20201111031213.25109-2-alexei.starovoitov@gmail.com --- include/linux/bpf_verifier.h | 2 +- kernel/bpf/verifier.c | 129 +++++++++++++++++++++++++++++++++++-------- 2 files changed, 108 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e83ef6f6bf43..306869d4743b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -45,7 +45,7 @@ struct bpf_reg_state { enum bpf_reg_type type; union { /* valid when type == PTR_TO_PACKET */ - u16 range; + int range; /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | * PTR_TO_MAP_VALUE_OR_NULL diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 10da26e55130..7b1f85aa9741 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2739,7 +2739,9 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, regno); return -EACCES; } - err = __check_mem_access(env, regno, off, size, reg->range, + + err = reg->range < 0 ? -EINVAL : + __check_mem_access(env, regno, off, size, reg->range, zero_size_allowed); if (err) { verbose(env, "R%d offset is outside of the packet\n", regno); @@ -4697,6 +4699,32 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) __clear_all_pkt_pointers(env, vstate->frame[i]); } +enum { + AT_PKT_END = -1, + BEYOND_PKT_END = -2, +}; + +static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range_open) +{ + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *reg = &state->regs[regn]; + + if (reg->type != PTR_TO_PACKET) + /* PTR_TO_PACKET_META is not supported yet */ + return; + + /* The 'reg' is pkt > pkt_end or pkt >= pkt_end. + * How far beyond pkt_end it goes is unknown. + * if (!range_open) it's the case of pkt >= pkt_end + * if (range_open) it's the case of pkt > pkt_end + * hence this pointer is at least 1 byte bigger than pkt_end + */ + if (range_open) + reg->range = BEYOND_PKT_END; + else + reg->range = AT_PKT_END; +} + static void release_reg_references(struct bpf_verifier_env *env, struct bpf_func_state *state, int ref_obj_id) @@ -6708,7 +6736,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) static void __find_good_pkt_pointers(struct bpf_func_state *state, struct bpf_reg_state *dst_reg, - enum bpf_reg_type type, u16 new_range) + enum bpf_reg_type type, int new_range) { struct bpf_reg_state *reg; int i; @@ -6733,8 +6761,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, enum bpf_reg_type type, bool range_right_open) { - u16 new_range; - int i; + int new_range, i; if (dst_reg->off < 0 || (dst_reg->off == 0 && range_right_open)) @@ -6985,6 +7012,67 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, return is_branch64_taken(reg, val, opcode); } +static int flip_opcode(u32 opcode) +{ + /* How can we transform "a b" into "b a"? */ + static const u8 opcode_flip[16] = { + /* these stay the same */ + [BPF_JEQ >> 4] = BPF_JEQ, + [BPF_JNE >> 4] = BPF_JNE, + [BPF_JSET >> 4] = BPF_JSET, + /* these swap "lesser" and "greater" (L and G in the opcodes) */ + [BPF_JGE >> 4] = BPF_JLE, + [BPF_JGT >> 4] = BPF_JLT, + [BPF_JLE >> 4] = BPF_JGE, + [BPF_JLT >> 4] = BPF_JGT, + [BPF_JSGE >> 4] = BPF_JSLE, + [BPF_JSGT >> 4] = BPF_JSLT, + [BPF_JSLE >> 4] = BPF_JSGE, + [BPF_JSLT >> 4] = BPF_JSGT + }; + return opcode_flip[opcode >> 4]; +} + +static int is_pkt_ptr_branch_taken(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg, + u8 opcode) +{ + struct bpf_reg_state *pkt; + + if (src_reg->type == PTR_TO_PACKET_END) { + pkt = dst_reg; + } else if (dst_reg->type == PTR_TO_PACKET_END) { + pkt = src_reg; + opcode = flip_opcode(opcode); + } else { + return -1; + } + + if (pkt->range >= 0) + return -1; + + switch (opcode) { + case BPF_JLE: + /* pkt <= pkt_end */ + fallthrough; + case BPF_JGT: + /* pkt > pkt_end */ + if (pkt->range == BEYOND_PKT_END) + /* pkt has at last one extra byte beyond pkt_end */ + return opcode == BPF_JGT; + break; + case BPF_JLT: + /* pkt < pkt_end */ + fallthrough; + case BPF_JGE: + /* pkt >= pkt_end */ + if (pkt->range == BEYOND_PKT_END || pkt->range == AT_PKT_END) + return opcode == BPF_JGE; + break; + } + return -1; +} + /* Adjusts the register min/max values in the case that the dst_reg is the * variable register that we are working on, and src_reg is a constant or we're * simply doing a BPF_K check. @@ -7148,23 +7236,7 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, u64 val, u32 val32, u8 opcode, bool is_jmp32) { - /* How can we transform "a b" into "b a"? */ - static const u8 opcode_flip[16] = { - /* these stay the same */ - [BPF_JEQ >> 4] = BPF_JEQ, - [BPF_JNE >> 4] = BPF_JNE, - [BPF_JSET >> 4] = BPF_JSET, - /* these swap "lesser" and "greater" (L and G in the opcodes) */ - [BPF_JGE >> 4] = BPF_JLE, - [BPF_JGT >> 4] = BPF_JLT, - [BPF_JLE >> 4] = BPF_JGE, - [BPF_JLT >> 4] = BPF_JGT, - [BPF_JSGE >> 4] = BPF_JSLE, - [BPF_JSGT >> 4] = BPF_JSLT, - [BPF_JSLE >> 4] = BPF_JSGE, - [BPF_JSLT >> 4] = BPF_JSGT - }; - opcode = opcode_flip[opcode >> 4]; + opcode = flip_opcode(opcode); /* This uses zero as "not present in table"; luckily the zero opcode, * BPF_JA, can't get here. */ @@ -7346,6 +7418,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_data' > pkt_end, pkt_meta' > pkt_data */ find_good_pkt_pointers(this_branch, dst_reg, dst_reg->type, false); + mark_pkt_end(other_branch, insn->dst_reg, true); } else if ((dst_reg->type == PTR_TO_PACKET_END && src_reg->type == PTR_TO_PACKET) || (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && @@ -7353,6 +7426,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_end > pkt_data', pkt_data > pkt_meta' */ find_good_pkt_pointers(other_branch, src_reg, src_reg->type, true); + mark_pkt_end(this_branch, insn->src_reg, false); } else { return false; } @@ -7365,6 +7439,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_data' < pkt_end, pkt_meta' < pkt_data */ find_good_pkt_pointers(other_branch, dst_reg, dst_reg->type, true); + mark_pkt_end(this_branch, insn->dst_reg, false); } else if ((dst_reg->type == PTR_TO_PACKET_END && src_reg->type == PTR_TO_PACKET) || (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && @@ -7372,6 +7447,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_end < pkt_data', pkt_data > pkt_meta' */ find_good_pkt_pointers(this_branch, src_reg, src_reg->type, false); + mark_pkt_end(other_branch, insn->src_reg, true); } else { return false; } @@ -7384,6 +7460,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_data' >= pkt_end, pkt_meta' >= pkt_data */ find_good_pkt_pointers(this_branch, dst_reg, dst_reg->type, true); + mark_pkt_end(other_branch, insn->dst_reg, false); } else if ((dst_reg->type == PTR_TO_PACKET_END && src_reg->type == PTR_TO_PACKET) || (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && @@ -7391,6 +7468,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_end >= pkt_data', pkt_data >= pkt_meta' */ find_good_pkt_pointers(other_branch, src_reg, src_reg->type, false); + mark_pkt_end(this_branch, insn->src_reg, true); } else { return false; } @@ -7403,6 +7481,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_data' <= pkt_end, pkt_meta' <= pkt_data */ find_good_pkt_pointers(other_branch, dst_reg, dst_reg->type, false); + mark_pkt_end(this_branch, insn->dst_reg, true); } else if ((dst_reg->type == PTR_TO_PACKET_END && src_reg->type == PTR_TO_PACKET) || (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && @@ -7410,6 +7489,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_end <= pkt_data', pkt_data <= pkt_meta' */ find_good_pkt_pointers(this_branch, src_reg, src_reg->type, true); + mark_pkt_end(other_branch, insn->src_reg, false); } else { return false; } @@ -7509,6 +7589,10 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, src_reg->var_off.value, opcode, is_jmp32); + } else if (reg_is_pkt_pointer_any(dst_reg) && + reg_is_pkt_pointer_any(src_reg) && + !is_jmp32) { + pred = is_pkt_ptr_branch_taken(dst_reg, src_reg, opcode); } if (pred >= 0) { @@ -7517,7 +7601,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, */ if (!__is_pointer_value(false, dst_reg)) err = mark_chain_precision(env, insn->dst_reg); - if (BPF_SRC(insn->code) == BPF_X && !err) + if (BPF_SRC(insn->code) == BPF_X && !err && + !__is_pointer_value(false, src_reg)) err = mark_chain_precision(env, insn->src_reg); if (err) return err; -- cgit v1.2.3 From 8e4597c627fb48f361e2a5b012202cb1b6cbcd5e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 12 Nov 2020 13:13:13 -0800 Subject: bpf: Allow using bpf_sk_storage in FENTRY/FEXIT/RAW_TP This patch enables the FENTRY/FEXIT/RAW_TP tracing program to use the bpf_sk_storage_(get|delete) helper, so those tracing programs can access the sk's bpf_local_storage and the later selftest will show some examples. The bpf_sk_storage is currently used in bpf-tcp-cc, tc, cg sockops...etc which is running either in softirq or task context. This patch adds bpf_sk_storage_get_tracing_proto and bpf_sk_storage_delete_tracing_proto. They will check in runtime that the helpers can only be called when serving softirq or running in a task context. That should enable most common tracing use cases on sk. During the load time, the new tracing_allowed() function will ensure the tracing prog using the bpf_sk_storage_(get|delete) helper is not tracing any bpf_sk_storage*() function itself. The sk is passed as "void *" when calling into bpf_local_storage. This patch only allows tracing a kernel function. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201112211313.2587383-1-kafai@fb.com --- include/net/bpf_sk_storage.h | 2 ++ kernel/trace/bpf_trace.c | 5 +++ net/core/bpf_sk_storage.c | 74 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+) (limited to 'kernel') diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h index 3c516dd07caf..0e85713f56df 100644 --- a/include/net/bpf_sk_storage.h +++ b/include/net/bpf_sk_storage.h @@ -20,6 +20,8 @@ void bpf_sk_storage_free(struct sock *sk); extern const struct bpf_func_proto bpf_sk_storage_get_proto; extern const struct bpf_func_proto bpf_sk_storage_delete_proto; +extern const struct bpf_func_proto bpf_sk_storage_get_tracing_proto; +extern const struct bpf_func_proto bpf_sk_storage_delete_tracing_proto; struct bpf_local_storage_elem; struct bpf_sk_storage_diag; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e4515b0f62a8..cfce60ad1cb5 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -1735,6 +1736,10 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skc_to_tcp_request_sock_proto; case BPF_FUNC_skc_to_udp6_sock: return &bpf_skc_to_udp6_sock_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_tracing_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_tracing_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index fd416678f236..359908a7d3c1 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -378,6 +379,79 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = { .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, }; +static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) +{ + const struct btf *btf_vmlinux; + const struct btf_type *t; + const char *tname; + u32 btf_id; + + if (prog->aux->dst_prog) + return false; + + /* Ensure the tracing program is not tracing + * any bpf_sk_storage*() function and also + * use the bpf_sk_storage_(get|delete) helper. + */ + switch (prog->expected_attach_type) { + case BPF_TRACE_RAW_TP: + /* bpf_sk_storage has no trace point */ + return true; + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + btf_vmlinux = bpf_get_btf_vmlinux(); + btf_id = prog->aux->attach_btf_id; + t = btf_type_by_id(btf_vmlinux, btf_id); + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + return !!strncmp(tname, "bpf_sk_storage", + strlen("bpf_sk_storage")); + default: + return false; + } + + return false; +} + +BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, + void *, value, u64, flags) +{ + if (!in_serving_softirq() && !in_task()) + return (unsigned long)NULL; + + return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags); +} + +BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map, + struct sock *, sk) +{ + if (!in_serving_softirq() && !in_task()) + return -EPERM; + + return ____bpf_sk_storage_delete(map, sk); +} + +const struct bpf_func_proto bpf_sk_storage_get_tracing_proto = { + .func = bpf_sk_storage_get_tracing, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, + .allowed = bpf_sk_storage_tracing_allowed, +}; + +const struct bpf_func_proto bpf_sk_storage_delete_tracing_proto = { + .func = bpf_sk_storage_delete_tracing, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], + .allowed = bpf_sk_storage_tracing_allowed, +}; + struct bpf_sk_storage_diag { u32 nr_maps; struct bpf_map *maps[]; -- cgit v1.2.3 From 423f16108c9d832bd96059d5c882c8ef6d76eb96 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 13 Nov 2020 00:59:29 +0000 Subject: bpf: Augment the set of sleepable LSM hooks Update the set of sleepable hooks with the ones that do not trigger a warning with might_fault() when exercised with the correct kernel config options enabled, i.e. DEBUG_ATOMIC_SLEEP=y LOCKDEP=y PROVE_LOCKING=y This means that a sleepable LSM eBPF program can be attached to these LSM hooks. A new helper method bpf_lsm_is_sleepable_hook is added and the set is maintained locally in bpf_lsm.c Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20201113005930.541956-2-kpsingh@chromium.org --- include/linux/bpf_lsm.h | 7 +++++ kernel/bpf/bpf_lsm.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 16 +--------- 3 files changed, 89 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index 73226181b744..0d1c33ace398 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -27,6 +27,8 @@ extern struct lsm_blob_sizes bpf_lsm_blob_sizes; int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog); +bool bpf_lsm_is_sleepable_hook(u32 btf_id); + static inline struct bpf_storage_blob *bpf_inode( const struct inode *inode) { @@ -54,6 +56,11 @@ void bpf_task_storage_free(struct task_struct *task); #else /* !CONFIG_BPF_LSM */ +static inline bool bpf_lsm_is_sleepable_hook(u32 btf_id) +{ + return false; +} + static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index e92c51bebb47..aed74b853415 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -13,6 +13,7 @@ #include #include #include +#include /* For every LSM hook that allows attachment of BPF programs, declare a nop * function where a BPF program can be attached. @@ -72,6 +73,86 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +/* The set of hooks which are called without pagefaults disabled and are allowed + * to "sleep" and thus can be used for sleeable BPF programs. + */ +BTF_SET_START(sleepable_lsm_hooks) +BTF_ID(func, bpf_lsm_bpf) +BTF_ID(func, bpf_lsm_bpf_map) +BTF_ID(func, bpf_lsm_bpf_map_alloc_security) +BTF_ID(func, bpf_lsm_bpf_map_free_security) +BTF_ID(func, bpf_lsm_bpf_prog) +BTF_ID(func, bpf_lsm_bprm_check_security) +BTF_ID(func, bpf_lsm_bprm_committed_creds) +BTF_ID(func, bpf_lsm_bprm_committing_creds) +BTF_ID(func, bpf_lsm_bprm_creds_for_exec) +BTF_ID(func, bpf_lsm_bprm_creds_from_file) +BTF_ID(func, bpf_lsm_capget) +BTF_ID(func, bpf_lsm_capset) +BTF_ID(func, bpf_lsm_cred_prepare) +BTF_ID(func, bpf_lsm_file_ioctl) +BTF_ID(func, bpf_lsm_file_lock) +BTF_ID(func, bpf_lsm_file_open) +BTF_ID(func, bpf_lsm_file_receive) +BTF_ID(func, bpf_lsm_inet_conn_established) +BTF_ID(func, bpf_lsm_inode_create) +BTF_ID(func, bpf_lsm_inode_free_security) +BTF_ID(func, bpf_lsm_inode_getattr) +BTF_ID(func, bpf_lsm_inode_getxattr) +BTF_ID(func, bpf_lsm_inode_mknod) +BTF_ID(func, bpf_lsm_inode_need_killpriv) +BTF_ID(func, bpf_lsm_inode_post_setxattr) +BTF_ID(func, bpf_lsm_inode_readlink) +BTF_ID(func, bpf_lsm_inode_rename) +BTF_ID(func, bpf_lsm_inode_rmdir) +BTF_ID(func, bpf_lsm_inode_setattr) +BTF_ID(func, bpf_lsm_inode_setxattr) +BTF_ID(func, bpf_lsm_inode_symlink) +BTF_ID(func, bpf_lsm_inode_unlink) +BTF_ID(func, bpf_lsm_kernel_module_request) +BTF_ID(func, bpf_lsm_kernfs_init_security) +BTF_ID(func, bpf_lsm_key_free) +BTF_ID(func, bpf_lsm_mmap_file) +BTF_ID(func, bpf_lsm_netlink_send) +BTF_ID(func, bpf_lsm_path_notify) +BTF_ID(func, bpf_lsm_release_secctx) +BTF_ID(func, bpf_lsm_sb_alloc_security) +BTF_ID(func, bpf_lsm_sb_eat_lsm_opts) +BTF_ID(func, bpf_lsm_sb_kern_mount) +BTF_ID(func, bpf_lsm_sb_mount) +BTF_ID(func, bpf_lsm_sb_remount) +BTF_ID(func, bpf_lsm_sb_set_mnt_opts) +BTF_ID(func, bpf_lsm_sb_show_options) +BTF_ID(func, bpf_lsm_sb_statfs) +BTF_ID(func, bpf_lsm_sb_umount) +BTF_ID(func, bpf_lsm_settime) +BTF_ID(func, bpf_lsm_socket_accept) +BTF_ID(func, bpf_lsm_socket_bind) +BTF_ID(func, bpf_lsm_socket_connect) +BTF_ID(func, bpf_lsm_socket_create) +BTF_ID(func, bpf_lsm_socket_getpeername) +BTF_ID(func, bpf_lsm_socket_getpeersec_dgram) +BTF_ID(func, bpf_lsm_socket_getsockname) +BTF_ID(func, bpf_lsm_socket_getsockopt) +BTF_ID(func, bpf_lsm_socket_listen) +BTF_ID(func, bpf_lsm_socket_post_create) +BTF_ID(func, bpf_lsm_socket_recvmsg) +BTF_ID(func, bpf_lsm_socket_sendmsg) +BTF_ID(func, bpf_lsm_socket_shutdown) +BTF_ID(func, bpf_lsm_socket_socketpair) +BTF_ID(func, bpf_lsm_syslog) +BTF_ID(func, bpf_lsm_task_alloc) +BTF_ID(func, bpf_lsm_task_getsecid) +BTF_ID(func, bpf_lsm_task_prctl) +BTF_ID(func, bpf_lsm_task_setscheduler) +BTF_ID(func, bpf_lsm_task_to_inode) +BTF_SET_END(sleepable_lsm_hooks) + +bool bpf_lsm_is_sleepable_hook(u32 btf_id) +{ + return btf_id_set_contains(&sleepable_lsm_hooks, btf_id); +} + const struct bpf_prog_ops lsm_prog_ops = { }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7b1f85aa9741..fb2943ea715d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11562,20 +11562,6 @@ static int check_attach_modify_return(unsigned long addr, const char *func_name) return -EINVAL; } -/* non exhaustive list of sleepable bpf_lsm_*() functions */ -BTF_SET_START(btf_sleepable_lsm_hooks) -#ifdef CONFIG_BPF_LSM -BTF_ID(func, bpf_lsm_bprm_committed_creds) -#else -BTF_ID_UNUSED -#endif -BTF_SET_END(btf_sleepable_lsm_hooks) - -static int check_sleepable_lsm_hook(u32 btf_id) -{ - return btf_id_set_contains(&btf_sleepable_lsm_hooks, btf_id); -} - /* list of non-sleepable functions that are otherwise on * ALLOW_ERROR_INJECTION list */ @@ -11797,7 +11783,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, /* LSM progs check that they are attached to bpf_lsm_*() funcs. * Only some of them are sleepable. */ - if (check_sleepable_lsm_hook(btf_id)) + if (bpf_lsm_is_sleepable_hook(btf_id)) ret = 0; break; default: -- cgit v1.2.3 From 6f100640ca5b2a2ff67b001c9fd3de21f7b12cf2 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 13 Nov 2020 00:59:30 +0000 Subject: bpf: Expose bpf_d_path helper to sleepable LSM hooks Sleepable hooks are never called from an NMI/interrupt context, so it is safe to use the bpf_d_path helper in LSM programs attaching to these hooks. The helper is not restricted to sleepable programs and merely uses the list of sleepable hooks as the initial subset of LSM hooks where it can be used. Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201113005930.541956-3-kpsingh@chromium.org --- kernel/trace/bpf_trace.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index cfce60ad1cb5..02986c7b90eb 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -16,6 +16,8 @@ #include #include #include +#include + #include #include @@ -1179,7 +1181,11 @@ BTF_SET_END(btf_allowlist_d_path) static bool bpf_d_path_allowed(const struct bpf_prog *prog) { - return btf_id_set_contains(&btf_allowlist_d_path, prog->aux->attach_btf_id); + if (prog->type == BPF_PROG_TYPE_LSM) + return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id); + + return btf_id_set_contains(&btf_allowlist_d_path, + prog->aux->attach_btf_id); } BTF_ID_LIST_SINGLE(bpf_d_path_btf_ids, struct, path) -- cgit v1.2.3