From 7b4858df3bf7a8d43ed6b58f411543a040c56f10 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 29 May 2023 14:48:28 +0300 Subject: skbuff: bridge: Add layer 2 miss indication For EVPN non-DF (Designated Forwarder) filtering we need to be able to prevent decapsulated traffic from being flooded to a multi-homed host. Filtering of multicast and broadcast traffic can be achieved using the following flower filter: # tc filter add dev bond0 egress pref 1 proto all flower indev vxlan0 dst_mac 01:00:00:00:00:00/01:00:00:00:00:00 action drop Unlike broadcast and multicast traffic, it is not currently possible to filter unknown unicast traffic. The classification into unknown unicast is performed by the bridge driver, but is not visible to other layers such as tc. Solve this by adding a new 'l2_miss' bit to the tc skb extension. Clear the bit whenever a packet enters the bridge (received from a bridge port or transmitted via the bridge) and set it if the packet did not match an FDB or MDB entry. If there is no skb extension and the bit needs to be cleared, then do not allocate one as no extension is equivalent to the bit being cleared. The bit is not set for broadcast packets as they never perform a lookup and therefore never incur a miss. A bit that is set for every flooded packet would also work for the current use case, but it would not allow us to differentiate between registered and unregistered multicast traffic, which might be useful in the future. To keep the performance impact to a minimum, the marking of packets is guarded by the 'tc_skb_ext_tc' static key. When 'false', the skb is not touched and an skb extension is not allocated. Instead, only a 5-byte nop is executed, as demonstrated below for the call site in br_handle_frame(). 
Before the patch: ``` memset(skb->cb, 0, sizeof(struct br_input_skb_cb)); c37b09: 49 c7 44 24 28 00 00 movq $0x0,0x28(%r12) c37b10: 00 00 p = br_port_get_rcu(skb->dev); c37b12: 49 8b 44 24 10 mov 0x10(%r12),%rax memset(skb->cb, 0, sizeof(struct br_input_skb_cb)); c37b17: 49 c7 44 24 30 00 00 movq $0x0,0x30(%r12) c37b1e: 00 00 c37b20: 49 c7 44 24 38 00 00 movq $0x0,0x38(%r12) c37b27: 00 00 ``` After the patch (when static key is disabled): ``` memset(skb->cb, 0, sizeof(struct br_input_skb_cb)); c37c29: 49 c7 44 24 28 00 00 movq $0x0,0x28(%r12) c37c30: 00 00 c37c32: 49 8d 44 24 28 lea 0x28(%r12),%rax c37c37: 48 c7 40 08 00 00 00 movq $0x0,0x8(%rax) c37c3e: 00 c37c3f: 48 c7 40 10 00 00 00 movq $0x0,0x10(%rax) c37c46: 00 #ifdef CONFIG_HAVE_JUMP_LABEL_HACK static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { asm_volatile_goto("1:" c37c47: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) br_tc_skb_miss_set(skb, false); p = br_port_get_rcu(skb->dev); c37c4c: 49 8b 44 24 10 mov 0x10(%r12),%rax ``` Subsequent patches will extend the flower classifier to be able to match on the new 'l2_miss' bit and enable / disable the static key when filters that match on it are added / deleted. 
Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Acked-by: Jakub Kicinski Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5951904413ab..e2f48ddb2f7c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -330,6 +330,7 @@ struct tc_skb_ext { u8 post_ct_snat:1; u8 post_ct_dnat:1; u8 act_miss:1; /* Set if act_miss_cookie is used */ + u8 l2_miss:1; /* Set by bridge upon FDB or MDB miss */ }; #endif -- cgit v1.2.3 From d5ccfd90df7fd0a50038a68634c131b8fd081bac Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 29 May 2023 14:48:29 +0300 Subject: flow_dissector: Dissect layer 2 miss from tc skb extension Extend the 'FLOW_DISSECTOR_KEY_META' key with a new 'l2_miss' field and populate it from a field with the same name in the tc skb extension. This field is set by the bridge driver for packets that incur an FDB or MDB miss. The next patch will extend the flower classifier to be able to match on layer 2 misses. 
Signed-off-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/net/flow_dissector.h | 2 ++ net/core/flow_dissector.c | 10 ++++++++++ 2 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 85b2281576ed..8b41668c77fc 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -243,10 +243,12 @@ struct flow_dissector_key_ip { * struct flow_dissector_key_meta: * @ingress_ifindex: ingress ifindex * @ingress_iftype: ingress interface type + * @l2_miss: packet did not match an L2 entry during forwarding */ struct flow_dissector_key_meta { int ingress_ifindex; u16 ingress_iftype; + u8 l2_miss; }; /** diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 25fb0bbc310f..481ca4080cbd 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -241,6 +242,15 @@ void skb_flow_dissect_meta(const struct sk_buff *skb, FLOW_DISSECTOR_KEY_META, target_container); meta->ingress_ifindex = skb->skb_iif; +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + if (tc_skb_ext_tc_enabled()) { + struct tc_skb_ext *ext; + + ext = skb_ext_find(skb, TC_SKB_EXT); + if (ext) + meta->l2_miss = ext->l2_miss; + } +#endif } EXPORT_SYMBOL(skb_flow_dissect_meta); -- cgit v1.2.3 From 1a432018c0cdf51a77a2e134b19ba6cab4c29c89 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 29 May 2023 14:48:30 +0300 Subject: net/sched: flower: Allow matching on layer 2 miss Add the 'TCA_FLOWER_L2_MISS' netlink attribute that allows user space to match on packets that encountered a layer 2 miss. The miss indication is set as metadata in the tc skb extension by the bridge driver upon FDB or MDB lookup miss and dissected by the flow dissector to the 'FLOW_DISSECTOR_KEY_META' key. The use of this skb extension is guarded by the 'tc_skb_ext_tc' static key. 
As such, enable / disable this key when filters that match on layer 2 miss are added / deleted. Tested: # cat tc_skb_ext_tc.py #!/usr/bin/env -S drgn -s vmlinux refcount = prog["tc_skb_ext_tc"].key.enabled.counter.value_() print(f"tc_skb_ext_tc reference count is {refcount}") # ./tc_skb_ext_tc.py tc_skb_ext_tc reference count is 0 # tc filter add dev swp1 egress proto all handle 101 pref 1 flower src_mac 00:11:22:33:44:55 action drop # tc filter add dev swp1 egress proto all handle 102 pref 2 flower src_mac 00:11:22:33:44:55 l2_miss true action drop # tc filter add dev swp1 egress proto all handle 103 pref 3 flower src_mac 00:11:22:33:44:55 l2_miss false action drop # ./tc_skb_ext_tc.py tc_skb_ext_tc reference count is 2 # tc filter replace dev swp1 egress proto all handle 102 pref 2 flower src_mac 00:01:02:03:04:05 l2_miss false action drop # ./tc_skb_ext_tc.py tc_skb_ext_tc reference count is 2 # tc filter del dev swp1 egress proto all handle 103 pref 3 flower # tc filter del dev swp1 egress proto all handle 102 pref 2 flower # tc filter del dev swp1 egress proto all handle 101 pref 1 flower # ./tc_skb_ext_tc.py tc_skb_ext_tc reference count is 0 Signed-off-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/uapi/linux/pkt_cls.h | 2 ++ net/sched/cls_flower.c | 30 ++++++++++++++++++++++++++++-- 2 files changed, 30 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 648a82f32666..00933dda7b10 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -594,6 +594,8 @@ enum { TCA_FLOWER_KEY_L2TPV3_SID, /* be32 */ + TCA_FLOWER_L2_MISS, /* u8 */ + __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 9dbc43388e57..04adcde9eb81 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -120,6 +120,7 @@ struct cls_fl_filter { u32 handle; u32 flags; u32 in_hw_count; + u8 needs_tc_skb_ext:1; 
struct rcu_work rwork; struct net_device *hw_dev; /* Flower classifier is unlocked, which means that its reference counter @@ -415,6 +416,8 @@ static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp) static void __fl_destroy_filter(struct cls_fl_filter *f) { + if (f->needs_tc_skb_ext) + tc_skb_ext_tc_disable(); tcf_exts_destroy(&f->exts); tcf_exts_put_net(&f->exts); kfree(f); @@ -615,7 +618,8 @@ static void *fl_get(struct tcf_proto *tp, u32 handle) } static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { - [TCA_FLOWER_UNSPEC] = { .type = NLA_UNSPEC }, + [TCA_FLOWER_UNSPEC] = { .strict_start_type = + TCA_FLOWER_L2_MISS }, [TCA_FLOWER_CLASSID] = { .type = NLA_U32 }, [TCA_FLOWER_INDEV] = { .type = NLA_STRING, .len = IFNAMSIZ }, @@ -720,7 +724,7 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_PPPOE_SID] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_PPP_PROTO] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_L2TPV3_SID] = { .type = NLA_U32 }, - + [TCA_FLOWER_L2_MISS] = NLA_POLICY_MAX(NLA_U8, 1), }; static const struct nla_policy @@ -1668,6 +1672,10 @@ static int fl_set_key(struct net *net, struct nlattr **tb, mask->meta.ingress_ifindex = 0xffffffff; } + fl_set_key_val(tb, &key->meta.l2_miss, TCA_FLOWER_L2_MISS, + &mask->meta.l2_miss, TCA_FLOWER_UNSPEC, + sizeof(key->meta.l2_miss)); + fl_set_key_val(tb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, sizeof(key->eth.dst)); @@ -2085,6 +2093,11 @@ errout_cleanup: return ret; } +static bool fl_needs_tc_skb_ext(const struct fl_flow_key *mask) +{ + return mask->meta.l2_miss; +} + static int fl_set_parms(struct net *net, struct tcf_proto *tp, struct cls_fl_filter *f, struct fl_flow_mask *mask, unsigned long base, struct nlattr **tb, @@ -2121,6 +2134,14 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp, return -EINVAL; } + /* Enable tc skb extension if filter matches on data extracted from + * this extension. 
+ */ + if (fl_needs_tc_skb_ext(&mask->key)) { + f->needs_tc_skb_ext = 1; + tc_skb_ext_tc_enable(); + } + return 0; } @@ -3074,6 +3095,11 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, goto nla_put_failure; } + if (fl_dump_key_val(skb, &key->meta.l2_miss, + TCA_FLOWER_L2_MISS, &mask->meta.l2_miss, + TCA_FLOWER_UNSPEC, sizeof(key->meta.l2_miss))) + goto nla_put_failure; + if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, sizeof(key->eth.dst)) || -- cgit v1.2.3