From d92191aa84e5f187d543867c3d54b38f294833fa Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 21 Mar 2018 13:55:42 +0100 Subject: netfilter: nf_tables: cache device name in flowtable object Devices going away have to grab the nfnl_lock from the netdev event path to avoid races with control plane updates. However, netlink dumps in netfilter do not hold nfnl_lock mutex. Cache the device name into the objects to avoid an use-after-free situation for a device that is going away. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 663b015dace5..30eb0652b025 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1068,6 +1068,8 @@ struct nft_object_ops { int nft_register_obj(struct nft_object_type *obj_type); void nft_unregister_obj(struct nft_object_type *obj_type); +#define NFT_FLOWTABLE_DEVICE_MAX 8 + /** * struct nft_flowtable - nf_tables flow table * @@ -1080,6 +1082,7 @@ void nft_unregister_obj(struct nft_object_type *obj_type); * @genmask: generation mask * @use: number of references to this flow table * @handle: unique object handle + * @dev_name: array of device names * @data: rhashtable and garbage collector * @ops: array of hooks */ @@ -1093,6 +1096,7 @@ struct nft_flowtable { u32 genmask:2, use:30; u64 handle; + char *dev_name[NFT_FLOWTABLE_DEVICE_MAX]; /* runtime data below here */ struct nf_hook_ops *ops ____cacheline_aligned; struct nf_flowtable data; -- cgit v1.2.3 From eb82a994479245a79647d302f9b4eb8e7c9d7ca6 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sat, 24 Mar 2018 22:25:06 -0700 Subject: net: sched, fix OOO packets with pfifo_fast After the qdisc lock was dropped in pfifo_fast we allow multiple enqueue threads and dequeue threads to run in parallel. On the enqueue side the skb bit ooo_okay is used to ensure all related skbs are enqueued in-order. On the dequeue side though there is no similar logic. What we observe is with fewer queues than CPUs it is possible to re-order packets when two instances of __qdisc_run() are running in parallel. Each thread will dequeue a skb and then whichever thread calls the ndo op first will be sent on the wire. This doesn't typically happen because qdisc_run() is usually triggered by the same core that did the enqueue. However, drivers will trigger __netif_schedule() when queues are transitioning from stopped to awake using the netif_tx_wake_* APIs. When this happens netif_schedule() calls qdisc_run() on the same CPU that did the netif_tx_wake_* which is usually done in the interrupt completion context. This CPU is selected with the irq affinity which is unrelated to the enqueue operations. To resolve this we add a RUNNING bit to the qdisc to ensure only a single dequeue per qdisc is running. Enqueue and dequeue operations can still run in parallel and also on multi queue NICs we can still have a dequeue in-flight per qdisc, which is typically per CPU. Fixes: c5ad119fb6c0 ("net: sched: pfifo_fast use skb_array") Reported-by: Jakob Unterwurzacher Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/net/sch_generic.h | 1 + net/sched/sch_generic.c | 17 +++++++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 2092d33194dd..8da32678ce18 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -30,6 +30,7 @@ struct qdisc_rate_table { enum qdisc_state_t { __QDISC_STATE_SCHED, __QDISC_STATE_DEACTIVATED, + __QDISC_STATE_RUNNING, }; struct qdisc_size_table { diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 7e3fbe9cc936..39c144b6ff98 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -373,24 +373,33 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, */ static inline bool qdisc_restart(struct Qdisc *q, int *packets) { + bool more, validate, nolock = q->flags & TCQ_F_NOLOCK; spinlock_t *root_lock = NULL; struct netdev_queue *txq; struct net_device *dev; struct sk_buff *skb; - bool validate; /* Dequeue packet */ + if (nolock && test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) + return false; + skb = dequeue_skb(q, &validate, packets); - if (unlikely(!skb)) + if (unlikely(!skb)) { + if (nolock) + clear_bit(__QDISC_STATE_RUNNING, &q->state); return false; + } - if (!(q->flags & TCQ_F_NOLOCK)) + if (!nolock) root_lock = qdisc_lock(q); dev = qdisc_dev(q); txq = skb_get_tx_queue(dev, skb); - return sch_direct_xmit(skb, q, dev, txq, root_lock, validate); + more = sch_direct_xmit(skb, q, dev, txq, root_lock, validate); + if (nolock) + clear_bit(__QDISC_STATE_RUNNING, &q->state); + return more; } void __qdisc_run(struct Qdisc *q) -- cgit v1.2.3 From b85ab56c3f81c5a24b5a5213374f549df06430da Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 26 Mar 2018 15:08:33 -0700 Subject: llc: properly handle dev_queue_xmit() return value llc_conn_send_pdu() pushes the skb into write queue and calls llc_conn_send_pdus() to flush them out. However, the status of dev_queue_xmit() is not returned to caller, in this case, llc_conn_state_process(). llc_conn_state_process() needs hold the skb no matter success or failure, because it still uses it after that, therefore we should hold skb before dev_queue_xmit() when that skb is the one being processed by llc_conn_state_process(). For other callers, they can just pass NULL and ignore the return value as they are. Reported-by: Noam Rathaus Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/llc_conn.h | 2 +- net/llc/llc_c_ac.c | 15 +++++++++------ net/llc/llc_conn.c | 32 +++++++++++++++++++++++--------- 3 files changed, 33 insertions(+), 16 deletions(-) (limited to 'include/net') diff --git a/include/net/llc_conn.h b/include/net/llc_conn.h index fe994d2e5286..5c40f118c0fa 100644 --- a/include/net/llc_conn.h +++ b/include/net/llc_conn.h @@ -103,7 +103,7 @@ void llc_sk_reset(struct sock *sk); /* Access to a connection */ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb); -void llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb); +int llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb); void llc_conn_rtn_pdu(struct sock *sk, struct sk_buff *skb); void llc_conn_resend_i_pdu_as_cmd(struct sock *sk, u8 nr, u8 first_p_bit); void llc_conn_resend_i_pdu_as_rsp(struct sock *sk, u8 nr, u8 first_f_bit); diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c index f59648018060..163121192aca 100644 --- a/net/llc/llc_c_ac.c +++ b/net/llc/llc_c_ac.c @@ -389,7 +389,7 @@ static int llc_conn_ac_send_i_cmd_p_set_0(struct sock *sk, struct sk_buff *skb) llc_pdu_init_as_i_cmd(skb, 0, llc->vS, llc->vR); rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); if (likely(!rc)) { - llc_conn_send_pdu(sk, skb); + rc = llc_conn_send_pdu(sk, skb); llc_conn_ac_inc_vs_by_1(sk, skb); } return rc; @@ -916,7 +916,7 @@ static int llc_conn_ac_send_i_rsp_f_set_ackpf(struct sock *sk, llc_pdu_init_as_i_cmd(skb, llc->ack_pf, llc->vS, llc->vR); rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); if (likely(!rc)) { - llc_conn_send_pdu(sk, skb); + rc = llc_conn_send_pdu(sk, skb); llc_conn_ac_inc_vs_by_1(sk, skb); } return rc; @@ -935,14 +935,17 @@ static int llc_conn_ac_send_i_rsp_f_set_ackpf(struct sock *sk, int llc_conn_ac_send_i_as_ack(struct sock *sk, struct sk_buff *skb) { struct llc_sock *llc = llc_sk(sk); + int ret; if (llc->ack_must_be_send) { - llc_conn_ac_send_i_rsp_f_set_ackpf(sk, skb); + ret = llc_conn_ac_send_i_rsp_f_set_ackpf(sk, skb); llc->ack_must_be_send = 0 ; llc->ack_pf = 0; - } else - llc_conn_ac_send_i_cmd_p_set_0(sk, skb); - return 0; + } else { + ret = llc_conn_ac_send_i_cmd_p_set_0(sk, skb); + } + + return ret; } /** diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 9177dbb16dce..110e32bcb399 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -30,7 +30,7 @@ #endif static int llc_find_offset(int state, int ev_type); -static void llc_conn_send_pdus(struct sock *sk); +static int llc_conn_send_pdus(struct sock *sk, struct sk_buff *skb); static int llc_conn_service(struct sock *sk, struct sk_buff *skb); static int llc_exec_conn_trans_actions(struct sock *sk, struct llc_conn_state_trans *trans, @@ -193,11 +193,11 @@ out_skb_put: return rc; } -void llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb) +int llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb) { /* queue PDU to send to MAC layer */ skb_queue_tail(&sk->sk_write_queue, skb); - llc_conn_send_pdus(sk); + return llc_conn_send_pdus(sk, skb); } /** @@ -255,7 +255,7 @@ void llc_conn_resend_i_pdu_as_cmd(struct sock *sk, u8 nr, u8 first_p_bit) if (howmany_resend > 0) llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO; /* any PDUs to re-send are queued up; start sending to MAC */ - llc_conn_send_pdus(sk); + llc_conn_send_pdus(sk, NULL); out:; } @@ -296,7 +296,7 @@ void llc_conn_resend_i_pdu_as_rsp(struct sock *sk, u8 nr, u8 first_f_bit) if (howmany_resend > 0) llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO; /* any PDUs to re-send are queued up; start sending to MAC */ - llc_conn_send_pdus(sk); + llc_conn_send_pdus(sk, NULL); out:; } @@ -340,12 +340,16 @@ out: /** * llc_conn_send_pdus - Sends queued PDUs * @sk: active connection + * @hold_skb: the skb held by caller, or NULL if does not care * - * Sends queued pdus to MAC layer for transmission. + * Sends queued pdus to MAC layer for transmission. When @hold_skb is + * NULL, always return 0. Otherwise, return 0 if @hold_skb is sent + * successfully, or 1 for failure. */ -static void llc_conn_send_pdus(struct sock *sk) +static int llc_conn_send_pdus(struct sock *sk, struct sk_buff *hold_skb) { struct sk_buff *skb; + int ret = 0; while ((skb = skb_dequeue(&sk->sk_write_queue)) != NULL) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); @@ -357,10 +361,20 @@ static void llc_conn_send_pdus(struct sock *sk) skb_queue_tail(&llc_sk(sk)->pdu_unack_q, skb); if (!skb2) break; - skb = skb2; + dev_queue_xmit(skb2); + } else { + bool is_target = skb == hold_skb; + int rc; + + if (is_target) + skb_get(skb); + rc = dev_queue_xmit(skb); + if (is_target) + ret = rc; } - dev_queue_xmit(skb); } + + return ret; } /** -- cgit v1.2.3