Diffstat (limited to 'kernel/bpf/cpumap.c')
-rw-r--r-- | kernel/bpf/cpumap.c | 148
1 file changed, 94 insertions, 54 deletions
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index a2f46785ac3b..67e8a2fc1a99 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -33,8 +33,8 @@
 #include <trace/events/xdp.h>
 #include <linux/btf_ids.h>
 
-#include <linux/netdevice.h> /* netif_receive_skb_list */
-#include <linux/etherdevice.h> /* eth_type_trans */
+#include <linux/netdevice.h>
+#include <net/gro.h>
 
 /* General idea: XDP packets getting XDP redirected to another CPU,
  * will maximum be stored/queued for one driver ->poll() call. It is
@@ -68,6 +68,7 @@ struct bpf_cpu_map_entry {
 
 	struct bpf_cpumap_val value;
 	struct bpf_prog *prog;
+	struct gro_node gro;
 
 	struct completion kthread_running;
 	struct rcu_work free_work;
@@ -133,22 +134,23 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
 	}
 }
 
-static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
-				     struct list_head *listp,
-				     struct xdp_cpumap_stats *stats)
+static u32 cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
+				    void **skbs, u32 skb_n,
+				    struct xdp_cpumap_stats *stats)
 {
-	struct sk_buff *skb, *tmp;
 	struct xdp_buff xdp;
-	u32 act;
+	u32 act, pass = 0;
 	int err;
 
-	list_for_each_entry_safe(skb, tmp, listp, list) {
+	for (u32 i = 0; i < skb_n; i++) {
+		struct sk_buff *skb = skbs[i];
+
 		act = bpf_prog_run_generic_xdp(skb, &xdp, rcpu->prog);
 		switch (act) {
 		case XDP_PASS:
+			skbs[pass++] = skb;
 			break;
 		case XDP_REDIRECT:
-			skb_list_del_init(skb);
 			err = xdp_do_generic_redirect(skb->dev, skb, &xdp,
 						      rcpu->prog);
 			if (unlikely(err)) {
@@ -157,7 +159,7 @@ static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
 			} else {
 				stats->redirect++;
 			}
-			return;
+			break;
 		default:
 			bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
 			fallthrough;
@@ -165,12 +167,15 @@ static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
 			trace_xdp_exception(skb->dev, rcpu->prog, act);
 			fallthrough;
 		case XDP_DROP:
-			skb_list_del_init(skb);
-			kfree_skb(skb);
+			napi_consume_skb(skb, true);
 			stats->drop++;
-			return;
+			break;
 		}
 	}
+
+	stats->pass += pass;
+
+	return pass;
 }
 
 static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
@@ -190,7 +195,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
 		int err;
 
 		rxq.dev = xdpf->dev_rx;
-		rxq.mem = xdpf->mem;
+		rxq.mem.type = xdpf->mem_type;
 		/* TODO: report queue_index to xdp_rxq_info */
 
 		xdp_convert_frame_to_buff(xdpf, &xdp);
@@ -204,7 +209,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
 				stats->drop++;
 			} else {
 				frames[nframes++] = xdpf;
-				stats->pass++;
 			}
 			break;
 		case XDP_REDIRECT:
@@ -228,43 +232,65 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
 	}
 
 	xdp_clear_return_frame_no_direct();
+	stats->pass += nframes;
 
 	return nframes;
 }
 
 #define CPUMAP_BATCH 8
 
-static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
-				int xdp_n, struct xdp_cpumap_stats *stats,
-				struct list_head *list)
+struct cpu_map_ret {
+	u32 xdp_n;
+	u32 skb_n;
+};
+
+static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
+				 void **skbs, struct cpu_map_ret *ret,
+				 struct xdp_cpumap_stats *stats)
 {
 	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
-	int nframes;
 
 	if (!rcpu->prog)
-		return xdp_n;
+		goto out;
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
 
-	nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats);
+	ret->xdp_n = cpu_map_bpf_prog_run_xdp(rcpu, frames, ret->xdp_n, stats);
+	if (unlikely(ret->skb_n))
+		ret->skb_n = cpu_map_bpf_prog_run_skb(rcpu, skbs, ret->skb_n,
+						      stats);
 
 	if (stats->redirect)
 		xdp_do_flush();
 
-	if (unlikely(!list_empty(list)))
-		cpu_map_bpf_prog_run_skb(rcpu, list, stats);
-
 	bpf_net_ctx_clear(bpf_net_ctx);
-	rcu_read_unlock_bh(); /* resched point, may call do_softirq() */
+	rcu_read_unlock();
 
-	return nframes;
+out:
+	if (unlikely(ret->skb_n) && ret->xdp_n)
+		memmove(&skbs[ret->xdp_n], skbs, ret->skb_n * sizeof(*skbs));
+}
+
+static void cpu_map_gro_flush(struct bpf_cpu_map_entry *rcpu, bool empty)
+{
+	/*
+	 * If the ring is not empty, there'll be a new iteration soon, and we
+	 * only need to do a full flush if a tick is long (> 1 ms).
+	 * If the ring is empty, to not hold GRO packets in the stack for too
	 * long, do a full flush.
+	 * This is equivalent to how NAPI decides whether to perform a full
+	 * flush.
+	 */
+	gro_flush(&rcpu->gro, !empty && HZ >= 1000);
+	gro_normal_list(&rcpu->gro);
 }
 
 static int cpu_map_kthread_run(void *data)
 {
 	struct bpf_cpu_map_entry *rcpu = data;
 	unsigned long last_qs = jiffies;
+	u32 packets = 0;
 
 	complete(&rcpu->kthread_running);
 	set_current_state(TASK_INTERRUPTIBLE);
@@ -277,11 +303,11 @@ static int cpu_map_kthread_run(void *data)
 	while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
 		struct xdp_cpumap_stats stats = {}; /* zero stats */
 		unsigned int kmem_alloc_drops = 0, sched = 0;
-		gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
-		int i, n, m, nframes, xdp_n;
+		struct cpu_map_ret ret = { };
 		void *frames[CPUMAP_BATCH];
 		void *skbs[CPUMAP_BATCH];
-		LIST_HEAD(list);
+		u32 i, n, m;
+		bool empty;
 
 		/* Release CPU reschedule checks */
 		if (__ptr_ring_empty(rcpu->queue)) {
@@ -306,7 +332,7 @@ static int cpu_map_kthread_run(void *data)
 		 */
 		n = __ptr_ring_consume_batched(rcpu->queue, frames,
 					       CPUMAP_BATCH);
-		for (i = 0, xdp_n = 0; i < n; i++) {
+		for (i = 0; i < n; i++) {
 			void *f = frames[i];
 			struct page *page;
 
@@ -314,11 +340,11 @@ static int cpu_map_kthread_run(void *data)
 				struct sk_buff *skb = f;
 
 				__ptr_clear_bit(0, &skb);
-				list_add_tail(&skb->list, &list);
+				skbs[ret.skb_n++] = skb;
 				continue;
 			}
 
-			frames[xdp_n++] = f;
+			frames[ret.xdp_n++] = f;
 			page = virt_to_page(f);
 
 			/* Bring struct page memory area to curr CPU. Read by
@@ -328,40 +354,51 @@ static int cpu_map_kthread_run(void *data)
 			prefetchw(page);
 		}
 
+		local_bh_disable();
+
 		/* Support running another XDP prog on this CPU */
-		nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list);
-		if (nframes) {
-			m = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
-						  gfp, nframes, skbs);
-			if (unlikely(m == 0)) {
-				for (i = 0; i < nframes; i++)
-					skbs[i] = NULL; /* effect: xdp_return_frame */
-				kmem_alloc_drops += nframes;
-			}
+		cpu_map_bpf_prog_run(rcpu, frames, skbs, &ret, &stats);
+		if (!ret.xdp_n)
+			goto stats;
+
+		m = napi_skb_cache_get_bulk(skbs, ret.xdp_n);
+		if (unlikely(m < ret.xdp_n)) {
+			for (i = m; i < ret.xdp_n; i++)
+				xdp_return_frame(frames[i]);
+
+			if (ret.skb_n)
+				memmove(&skbs[m], &skbs[ret.xdp_n],
+					ret.skb_n * sizeof(*skbs));
+
+			kmem_alloc_drops += ret.xdp_n - m;
+			ret.xdp_n = m;
 		}
 
-		local_bh_disable();
-		for (i = 0; i < nframes; i++) {
+		for (i = 0; i < ret.xdp_n; i++) {
 			struct xdp_frame *xdpf = frames[i];
-			struct sk_buff *skb = skbs[i];
 
-			skb = __xdp_build_skb_from_frame(xdpf, skb,
-							 xdpf->dev_rx);
-			if (!skb) {
-				xdp_return_frame(xdpf);
-				continue;
-			}
-
-			list_add_tail(&skb->list, &list);
+			/* Can fail only when !skb -- already handled above */
+			__xdp_build_skb_from_frame(xdpf, skbs[i], xdpf->dev_rx);
 		}
 
+stats:
 		/* Feedback loop via tracepoint.
 		 * NB: keep before recv to allow measuring enqueue/dequeue latency.
 		 */
		trace_xdp_cpumap_kthread(rcpu->map_id, n, kmem_alloc_drops,
 					 sched, &stats);
 
-		netif_receive_skb_list(&list);
+		for (i = 0; i < ret.xdp_n + ret.skb_n; i++)
+			gro_receive_skb(&rcpu->gro, skbs[i]);
+
+		/* Flush either every 64 packets or in case of empty ring */
+		packets += n;
+		empty = __ptr_ring_empty(rcpu->queue);
+		if (packets >= NAPI_POLL_WEIGHT || empty) {
+			cpu_map_gro_flush(rcpu, empty);
+			packets = 0;
+		}
+
 		local_bh_enable(); /* resched point, may call do_softirq() */
 	}
 	__set_current_state(TASK_RUNNING);
@@ -430,6 +467,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
 	rcpu->cpu = cpu;
 	rcpu->map_id = map->id;
 	rcpu->value.qsize = value->qsize;
+	gro_init(&rcpu->gro);
 
 	if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd))
 		goto free_ptr_ring;
@@ -458,6 +496,7 @@ free_prog:
 	if (rcpu->prog)
 		bpf_prog_put(rcpu->prog);
 free_ptr_ring:
+	gro_cleanup(&rcpu->gro);
 	ptr_ring_cleanup(rcpu->queue, NULL);
 free_queue:
 	kfree(rcpu->queue);
@@ -487,6 +526,7 @@ static void __cpu_map_entry_free(struct work_struct *work)
 
 	if (rcpu->prog)
 		bpf_prog_put(rcpu->prog);
+	gro_cleanup(&rcpu->gro);
 	/* The queue should be empty at this point */
 	__cpu_map_ring_cleanup(rcpu->queue);
 	ptr_ring_cleanup(rcpu->queue, NULL);
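As a rough illustration of the flushing policy introduced above (cpu_map_gro_flush() plus the per-batch check in cpu_map_kthread_run()), the user-space sketch below mimics the two decisions: flush after every NAPI_POLL_WEIGHT packets or whenever the ring runs empty, and only do a partial (old-packets-only) flush when more work is queued and the tick is 1 ms or shorter. This is not kernel code: do_partial_flush() and do_full_flush() are hypothetical stand-ins for gro_flush(&rcpu->gro, true/false) followed by gro_normal_list(), and the HZ and NAPI_POLL_WEIGHT values are assumed to mirror the kernel constants.

/*
 * User-space sketch of the cpumap GRO flush heuristic (illustration only).
 */
#include <stdbool.h>
#include <stdio.h>

#define HZ               1000	/* assumed CONFIG_HZ for the example */
#define NAPI_POLL_WEIGHT 64

/* Hypothetical stand-ins for gro_flush(..., flush_old) + gro_normal_list() */
static void do_partial_flush(void) { puts("flush only packets older than one tick"); }
static void do_full_flush(void)    { puts("flush every held GRO packet"); }

/* Mirrors: gro_flush(&rcpu->gro, !empty && HZ >= 1000); gro_normal_list(&rcpu->gro); */
static void gro_flush_decision(bool ring_empty)
{
	if (!ring_empty && HZ >= 1000)
		do_partial_flush();	/* more work coming soon and the tick is short */
	else
		do_full_flush();	/* ring drained or coarse tick: don't hold skbs */
}

int main(void)
{
	unsigned int packets = 0;

	/* The kthread loop flushes every NAPI_POLL_WEIGHT packets or on an empty ring. */
	for (int batch = 0; batch < 9; batch++) {
		unsigned int n = 8;		/* pretend CPUMAP_BATCH frames were consumed */
		bool empty = (batch == 8);	/* last iteration drains the ring */

		packets += n;
		if (packets >= NAPI_POLL_WEIGHT || empty) {
			gro_flush_decision(empty);
			packets = 0;
		}
	}
	return 0;
}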