From d92191aa84e5f187d543867c3d54b38f294833fa Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Wed, 21 Mar 2018 13:55:42 +0100
Subject: netfilter: nf_tables: cache device name in flowtable object

Devices going away have to grab the nfnl_lock from the netdev event
path to avoid races with control plane updates. However, netlink dumps
in netfilter do not hold the nfnl_lock mutex. Cache the device name
into the objects to avoid a use-after-free situation for a device that
is going away.

Signed-off-by: Pablo Neira Ayuso
---
 include/net/netfilter/nf_tables.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 663b015dace5..30eb0652b025 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1068,6 +1068,8 @@ struct nft_object_ops {
 int nft_register_obj(struct nft_object_type *obj_type);
 void nft_unregister_obj(struct nft_object_type *obj_type);
 
+#define NFT_FLOWTABLE_DEVICE_MAX	8
+
 /**
  *	struct nft_flowtable - nf_tables flow table
  *
@@ -1080,6 +1082,7 @@ void nft_unregister_obj(struct nft_object_type *obj_type);
  *	@genmask: generation mask
  *	@use: number of references to this flow table
  *	@handle: unique object handle
+ *	@dev_name: array of device names
  *	@data: rhashtable and garbage collector
  *	@ops: array of hooks
  */
@@ -1093,6 +1096,7 @@ struct nft_flowtable {
 	u32				genmask:2,
 					use:30;
 	u64				handle;
+	char				*dev_name[NFT_FLOWTABLE_DEVICE_MAX];
 	/* runtime data below here */
 	struct nf_hook_ops		*ops ____cacheline_aligned;
 	struct nf_flowtable		data;
--
cgit v1.2.3
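The pattern behind this patch is worth spelling out: a reader that cannot
take the lock protecting an object must not keep pointers into it, so the
control plane duplicates the one field dumps need. Below is a minimal,
self-contained userspace sketch of that idea; strdup() stands in for
kstrdup(), and all names here are invented for illustration -- this is not
the nf_tables_api.c code that accompanies this header change.

/* Sketch: caching a copy of the name keeps it valid for lock-free
 * readers even after the originating object is freed. Hypothetical. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NFT_FLOWTABLE_DEVICE_MAX 8

struct flowtable_sketch {
	/* owned copies of hook device names */
	char *dev_name[NFT_FLOWTABLE_DEVICE_MAX];
	int n;
};

static int flowtable_cache_name(struct flowtable_sketch *ft, const char *devname)
{
	if (ft->n >= NFT_FLOWTABLE_DEVICE_MAX)
		return -1;
	ft->dev_name[ft->n] = strdup(devname);	/* kernel would use kstrdup() */
	if (!ft->dev_name[ft->n])
		return -1;
	ft->n++;
	return 0;
}

int main(void)
{
	struct flowtable_sketch ft = { .n = 0 };
	char *dev = strdup("eth0");	/* stand-in for a struct net_device */

	flowtable_cache_name(&ft, dev);
	free(dev);			/* the "device goes away" */
	/* a dump can still print the name without touching the device */
	printf("hook device: %s\n", ft.dev_name[0]);
	free(ft.dev_name[0]);
	return 0;
}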
From eb82a994479245a79647d302f9b4eb8e7c9d7ca6 Mon Sep 17 00:00:00 2001
From: John Fastabend
Date: Sat, 24 Mar 2018 22:25:06 -0700
Subject: net: sched, fix OOO packets with pfifo_fast

After the qdisc lock was dropped in pfifo_fast we allow multiple
enqueue threads and dequeue threads to run in parallel. On the enqueue
side the skb bit ooo_okay is used to ensure all related skbs are
enqueued in-order. On the dequeue side though there is no similar
logic.

What we observe is with fewer queues than CPUs it is possible to
re-order packets when two instances of __qdisc_run() are running in
parallel. Each thread will dequeue a skb, and whichever thread calls
the ndo op first will have its packet sent on the wire first.

This doesn't typically happen because qdisc_run() is usually triggered
by the same core that did the enqueue. However, drivers will trigger
__netif_schedule() when queues are transitioning from stopped to awake
using the netif_tx_wake_* APIs. When this happens, netif_schedule()
calls qdisc_run() on the same CPU that did the netif_tx_wake_*, which
is usually done in the interrupt completion context. This CPU is
selected with the irq affinity, which is unrelated to the enqueue
operations.

To resolve this we add a RUNNING bit to the qdisc to ensure only a
single dequeue per qdisc is running. Enqueue and dequeue operations
can still run in parallel, and on multi queue NICs we can still have
one dequeue in-flight per qdisc, which is typically per CPU.

Fixes: c5ad119fb6c0 ("net: sched: pfifo_fast use skb_array")
Reported-by: Jakob Unterwurzacher
Signed-off-by: John Fastabend
Signed-off-by: David S. Miller
---
 include/net/sch_generic.h |  1 +
 net/sched/sch_generic.c   | 17 +++++++++++++----
 2 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 2092d33194dd..8da32678ce18 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -30,6 +30,7 @@ struct qdisc_rate_table {
 enum qdisc_state_t {
 	__QDISC_STATE_SCHED,
 	__QDISC_STATE_DEACTIVATED,
+	__QDISC_STATE_RUNNING,
 };
 
 struct qdisc_size_table {
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 7e3fbe9cc936..39c144b6ff98 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -373,24 +373,33 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
  */
 static inline bool qdisc_restart(struct Qdisc *q, int *packets)
 {
+	bool more, validate, nolock = q->flags & TCQ_F_NOLOCK;
 	spinlock_t *root_lock = NULL;
 	struct netdev_queue *txq;
 	struct net_device *dev;
 	struct sk_buff *skb;
-	bool validate;
 
 	/* Dequeue packet */
+	if (nolock && test_and_set_bit(__QDISC_STATE_RUNNING, &q->state))
+		return false;
+
 	skb = dequeue_skb(q, &validate, packets);
-	if (unlikely(!skb))
+	if (unlikely(!skb)) {
+		if (nolock)
+			clear_bit(__QDISC_STATE_RUNNING, &q->state);
 		return false;
+	}
 
-	if (!(q->flags & TCQ_F_NOLOCK))
+	if (!nolock)
 		root_lock = qdisc_lock(q);
 
 	dev = qdisc_dev(q);
 	txq = skb_get_tx_queue(dev, skb);
 
-	return sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
+	more = sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
+	if (nolock)
+		clear_bit(__QDISC_STATE_RUNNING, &q->state);
+	return more;
 }
 
 void __qdisc_run(struct Qdisc *q)
--
cgit v1.2.3
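The serialization added above is essentially a per-qdisc try-lock taken
around the dequeue-plus-transmit step, so at most one CPU can be pulling
packets from a given qdisc at a time. The following compilable userspace
sketch shows the same shape with C11 atomics standing in for
test_and_set_bit()/clear_bit(); the single-slot "queue" and all names are
illustrative assumptions, not kernel API.

/* Try-lock around dequeue+xmit: the pattern qdisc_restart() now uses.
 * C11 atomic_flag plays the role of __QDISC_STATE_RUNNING. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct qdisc_sketch {
	atomic_flag running;	/* __QDISC_STATE_RUNNING stand-in */
	bool nolock;		/* TCQ_F_NOLOCK stand-in */
	const char *pending;	/* one queued "packet" for the demo */
};

static bool qdisc_restart_sketch(struct qdisc_sketch *q)
{
	const char *pkt;
	bool more;

	/* try-lock: only one dequeuer per qdisc may proceed */
	if (q->nolock && atomic_flag_test_and_set(&q->running))
		return false;

	pkt = q->pending;	/* stands in for dequeue_skb() */
	q->pending = NULL;
	if (!pkt) {
		if (q->nolock)
			atomic_flag_clear(&q->running);
		return false;
	}

	printf("xmit %s\n", pkt);	/* stands in for sch_direct_xmit() */
	more = false;			/* nothing left in our one-slot queue */

	if (q->nolock)
		atomic_flag_clear(&q->running);	/* release after the xmit */
	return more;
}

int main(void)
{
	struct qdisc_sketch q = {
		.running = ATOMIC_FLAG_INIT,
		.nolock = true,
		.pending = "skb-0",
	};

	qdisc_restart_sketch(&q);	/* sends skb-0 */
	qdisc_restart_sketch(&q);	/* empty: clears flag, returns false */
	return 0;
}

Because the flag is cleared only after sch_direct_xmit() returns, a second
CPU that races in while a packet is in flight backs off instead of
dequeuing the next packet and potentially putting it on the wire first.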
From b85ab56c3f81c5a24b5a5213374f549df06430da Mon Sep 17 00:00:00 2001
From: Cong Wang
Date: Mon, 26 Mar 2018 15:08:33 -0700
Subject: llc: properly handle dev_queue_xmit() return value

llc_conn_send_pdu() pushes the skb into the write queue and calls
llc_conn_send_pdus() to flush them out. However, the status of
dev_queue_xmit() is not returned to the caller, in this case
llc_conn_state_process().

llc_conn_state_process() needs to hold the skb no matter success or
failure, because it still uses it afterwards; therefore we should hold
the skb before dev_queue_xmit() when that skb is the one being
processed by llc_conn_state_process().

Other callers can just pass NULL and ignore the return value, as they
do now.

Reported-by: Noam Rathaus
Signed-off-by: Cong Wang
Signed-off-by: David S. Miller
---
 include/net/llc_conn.h |  2 +-
 net/llc/llc_c_ac.c     | 15 +++++++++------
 net/llc/llc_conn.c     | 32 +++++++++++++++++++++++---------
 3 files changed, 33 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/llc_conn.h b/include/net/llc_conn.h
index fe994d2e5286..5c40f118c0fa 100644
--- a/include/net/llc_conn.h
+++ b/include/net/llc_conn.h
@@ -103,7 +103,7 @@ void llc_sk_reset(struct sock *sk);
 
 /* Access to a connection */
 int llc_conn_state_process(struct sock *sk, struct sk_buff *skb);
-void llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb);
+int llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb);
 void llc_conn_rtn_pdu(struct sock *sk, struct sk_buff *skb);
 void llc_conn_resend_i_pdu_as_cmd(struct sock *sk, u8 nr, u8 first_p_bit);
 void llc_conn_resend_i_pdu_as_rsp(struct sock *sk, u8 nr, u8 first_f_bit);
diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c
index f59648018060..163121192aca 100644
--- a/net/llc/llc_c_ac.c
+++ b/net/llc/llc_c_ac.c
@@ -389,7 +389,7 @@ static int llc_conn_ac_send_i_cmd_p_set_0(struct sock *sk, struct sk_buff *skb)
 	llc_pdu_init_as_i_cmd(skb, 0, llc->vS, llc->vR);
 	rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac);
 	if (likely(!rc)) {
-		llc_conn_send_pdu(sk, skb);
+		rc = llc_conn_send_pdu(sk, skb);
 		llc_conn_ac_inc_vs_by_1(sk, skb);
 	}
 	return rc;
@@ -916,7 +916,7 @@ static int llc_conn_ac_send_i_rsp_f_set_ackpf(struct sock *sk,
 	llc_pdu_init_as_i_cmd(skb, llc->ack_pf, llc->vS, llc->vR);
 	rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac);
 	if (likely(!rc)) {
-		llc_conn_send_pdu(sk, skb);
+		rc = llc_conn_send_pdu(sk, skb);
 		llc_conn_ac_inc_vs_by_1(sk, skb);
 	}
 	return rc;
@@ -935,14 +935,17 @@ static int llc_conn_ac_send_i_rsp_f_set_ackpf(struct sock *sk,
 int llc_conn_ac_send_i_as_ack(struct sock *sk, struct sk_buff *skb)
 {
 	struct llc_sock *llc = llc_sk(sk);
+	int ret;
 
 	if (llc->ack_must_be_send) {
-		llc_conn_ac_send_i_rsp_f_set_ackpf(sk, skb);
+		ret = llc_conn_ac_send_i_rsp_f_set_ackpf(sk, skb);
 		llc->ack_must_be_send = 0;
 		llc->ack_pf = 0;
-	} else
-		llc_conn_ac_send_i_cmd_p_set_0(sk, skb);
-	return 0;
+	} else {
+		ret = llc_conn_ac_send_i_cmd_p_set_0(sk, skb);
+	}
+
+	return ret;
 }
 
 /**
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
index 9177dbb16dce..110e32bcb399 100644
--- a/net/llc/llc_conn.c
+++ b/net/llc/llc_conn.c
@@ -30,7 +30,7 @@
 #endif
 
 static int llc_find_offset(int state, int ev_type);
-static void llc_conn_send_pdus(struct sock *sk);
+static int llc_conn_send_pdus(struct sock *sk, struct sk_buff *skb);
 static int llc_conn_service(struct sock *sk, struct sk_buff *skb);
 static int llc_exec_conn_trans_actions(struct sock *sk,
 				       struct llc_conn_state_trans *trans,
@@ -193,11 +193,11 @@ out_skb_put:
 	return rc;
 }
 
-void llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb)
+int llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb)
 {
 	/* queue PDU to send to MAC layer */
 	skb_queue_tail(&sk->sk_write_queue, skb);
-	llc_conn_send_pdus(sk);
+	return llc_conn_send_pdus(sk, skb);
 }
 
 /**
@@ -255,7 +255,7 @@ void llc_conn_resend_i_pdu_as_cmd(struct sock *sk, u8 nr, u8 first_p_bit)
 	if (howmany_resend > 0)
 		llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO;
 	/* any PDUs to re-send are queued up; start sending to MAC */
-	llc_conn_send_pdus(sk);
+	llc_conn_send_pdus(sk, NULL);
 out:;
 }
 
@@ -296,7 +296,7 @@ void llc_conn_resend_i_pdu_as_rsp(struct sock *sk, u8 nr, u8 first_f_bit)
 	if (howmany_resend > 0)
 		llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO;
 	/* any PDUs to re-send are queued up; start sending to MAC */
-	llc_conn_send_pdus(sk);
+	llc_conn_send_pdus(sk, NULL);
 out:;
 }
 
@@ -340,12 +340,16 @@ out:
 /**
  *	llc_conn_send_pdus - Sends queued PDUs
  *	@sk: active connection
+ *	@hold_skb: the skb held by caller, or NULL if does not care
  *
- *	Sends queued pdus to MAC layer for transmission.
+ *	Sends queued pdus to MAC layer for transmission. When @hold_skb is
+ *	NULL, always return 0. Otherwise, return 0 if @hold_skb is sent
+ *	successfully, or 1 for failure.
  */
-static void llc_conn_send_pdus(struct sock *sk)
+static int llc_conn_send_pdus(struct sock *sk, struct sk_buff *hold_skb)
 {
 	struct sk_buff *skb;
+	int ret = 0;
 
 	while ((skb = skb_dequeue(&sk->sk_write_queue)) != NULL) {
 		struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
@@ -357,10 +361,20 @@ static void llc_conn_send_pdus(struct sock *sk)
 			skb_queue_tail(&llc_sk(sk)->pdu_unack_q, skb);
 			if (!skb2)
 				break;
-			skb = skb2;
+			dev_queue_xmit(skb2);
+		} else {
+			bool is_target = skb == hold_skb;
+			int rc;
+
+			if (is_target)
+				skb_get(skb);
+			rc = dev_queue_xmit(skb);
+			if (is_target)
+				ret = rc;
 		}
-		dev_queue_xmit(skb);
 	}
+
+	return ret;
 }
 
 /**
--
cgit v1.2.3
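The subtle point in the hunk above is that dev_queue_xmit() consumes a
reference to the skb it is handed, so a caller that reads the skb
afterwards must take its own reference with skb_get() first -- exactly
what the is_target branch does for hold_skb. Here is a self-contained
userspace analogue with a manual refcount; every type and helper is
invented for illustration and only mirrors the shape of the kernel calls.

/* "Hold before handing off": keep the buffer alive across a call
 * that consumes one reference, so the caller can still inspect it. */
#include <stdio.h>
#include <stdlib.h>

struct buf {
	int refcnt;
	const char *data;
};

static struct buf *buf_get(struct buf *b)	/* skb_get() analogue */
{
	b->refcnt++;
	return b;
}

static void buf_put(struct buf *b)		/* kfree_skb() analogue */
{
	if (--b->refcnt == 0)
		free(b);
}

static int xmit(struct buf *b)			/* dev_queue_xmit() analogue */
{
	printf("xmit: %s\n", b->data);
	buf_put(b);	/* the xmit path consumes one reference */
	return 0;	/* 0 on success, like dev_queue_xmit() */
}

int main(void)
{
	struct buf *b = malloc(sizeof(*b));

	b->refcnt = 1;
	b->data = "pdu";

	buf_get(b);		/* hold: we still use b after xmit */
	int rc = xmit(b);	/* consumes the extra reference */
	printf("rc=%d, data still valid: %s\n", rc, b->data);
	buf_put(b);		/* drop our own reference */
	return 0;
}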
From c769accdf3d8a103940bea2979b65556718567e9 Mon Sep 17 00:00:00 2001
From: Toshiaki Makita
Date: Thu, 29 Mar 2018 19:05:30 +0900
Subject: vlan: Fix vlan insertion for packets without ethernet header

In some situations vlan packets do not have ethernet headers. One
example is packets from tun devices. Users can specify a vlan protocol
in the tun_pi field instead of an IP protocol. When we have a vlan
device with reorder_hdr disabled on top of the tun device, such packets
from tun devices are untagged in skb_vlan_untag() and vlan headers will
be inserted back in vlan_insert_inner_tag().

vlan_insert_inner_tag() however did not expect packets without ethernet
headers, so in such a case the size argument for memmove() underflowed.

We don't need to copy headers for packets which have no headers
preceding the vlan header, so skip memmove() in that case. Also don't
write the vlan protocol to skb->data when there is not enough room for
it.

Fixes: cbe7128c4b92 ("vlan: Fix out of order vlan headers with reorder header off")
Signed-off-by: Toshiaki Makita
Signed-off-by: David S. Miller
---
 include/linux/if_vlan.h | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index c4a1cff9c768..7d30892da064 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -323,13 +323,24 @@ static inline int __vlan_insert_inner_tag(struct sk_buff *skb,
 	skb_push(skb, VLAN_HLEN);
 
 	/* Move the mac header sans proto to the beginning of the new header. */
-	memmove(skb->data, skb->data + VLAN_HLEN, mac_len - ETH_TLEN);
+	if (likely(mac_len > ETH_TLEN))
+		memmove(skb->data, skb->data + VLAN_HLEN, mac_len - ETH_TLEN);
 	skb->mac_header -= VLAN_HLEN;
 
 	veth = (struct vlan_ethhdr *)(skb->data + mac_len - ETH_HLEN);
 
 	/* first, the ethernet type */
-	veth->h_vlan_proto = vlan_proto;
+	if (likely(mac_len >= ETH_TLEN)) {
+		/* h_vlan_encapsulated_proto should already be populated, and
+		 * skb->data has space for h_vlan_proto
+		 */
+		veth->h_vlan_proto = vlan_proto;
+	} else {
+		/* h_vlan_encapsulated_proto should not be populated, and
+		 * skb->data has no space for h_vlan_proto
+		 */
+		veth->h_vlan_encapsulated_proto = skb->protocol;
+	}
 
 	/* now, the TCI */
 	veth->h_vlan_TCI = htons(vlan_tci);
--
cgit v1.2.3
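The underflow here is a plain unsigned-arithmetic trap: memmove() takes a
size_t length, so mac_len - ETH_TLEN wraps to an enormous value whenever
mac_len < ETH_TLEN (for a tun packet with no ethernet header, mac_len is
0). A small standalone demo follows; ETH_TLEN is redefined locally and the
values are assumed for illustration, not taken from the kernel path.

/* Demo of the wraparound guarded against above: 0u - 2 does not go
 * negative, it wraps to UINT_MAX - 1 (about 4.29e9) before becoming
 * the memmove() length. */
#include <stdio.h>
#include <stddef.h>

#define ETH_TLEN 2	/* bytes of the ethernet type field */

int main(void)
{
	unsigned int mac_len = 0;	/* no header precedes the vlan tag */

	size_t n = mac_len - ETH_TLEN;	/* wraps: prints 4294967294 */
	printf("memmove length would be %zu\n", n);

	/* the fix: only move bytes when there is something to move */
	if (mac_len > ETH_TLEN)
		printf("would memmove %zu bytes\n",
		       (size_t)(mac_len - ETH_TLEN));
	else
		printf("skip memmove\n");
	return 0;
}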
From f97c3dc3c0e8d23a5c4357d182afeef4c67f5c33 Mon Sep 17 00:00:00 2001
From: Tal Gilboa
Date: Thu, 29 Mar 2018 13:53:52 +0300
Subject: net/dim: Fix int overflow

When calculating the difference between samples, the values are
multiplied by 100. Large values may cause an int overflow when
multiplied (usually on the first iteration). Fixed by forcing 100 to be
of type unsigned long.

Fixes: 4c4dbb4a7363 ("net/mlx5e: Move dynamic interrupt coalescing code to include/linux")
Signed-off-by: Tal Gilboa
Reviewed-by: Andy Gospodarek
Signed-off-by: David S. Miller
---
 include/linux/net_dim.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/net_dim.h b/include/linux/net_dim.h
index bebeaad897cc..29ed8fd6379a 100644
--- a/include/linux/net_dim.h
+++ b/include/linux/net_dim.h
@@ -231,7 +231,7 @@ static inline void net_dim_exit_parking(struct net_dim *dim)
 }
 
 #define IS_SIGNIFICANT_DIFF(val, ref) \
-	(((100 * abs((val) - (ref))) / (ref)) > 10) /* more than 10% difference */
+	(((100UL * abs((val) - (ref))) / (ref)) > 10) /* more than 10% difference */
 
 static inline int net_dim_stats_compare(struct net_dim_stats *curr,
 					struct net_dim_stats *prev)
--
cgit v1.2.3
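The overflow is easy to reproduce: with 32-bit int arithmetic,
100 * abs(val - ref) wraps once the difference exceeds INT_MAX / 100
(roughly 21 million), which byte counters reach on the first sample.
Promoting one operand with 100UL makes the expression 64-bit on LP64
targets. A standalone demo of both macro variants follows; the values are
chosen for illustration, and the signed wraparound it shows is what
typical two's-complement targets do even though it is formally undefined
behavior.

/* Demo: the broken macro can call a 4300% change insignificant. */
#include <stdio.h>
#include <stdlib.h>

#define IS_SIGNIFICANT_DIFF_BAD(val, ref) \
	(((100 * abs((val) - (ref))) / (ref)) > 10)

#define IS_SIGNIFICANT_DIFF_FIXED(val, ref) \
	(((100UL * abs((val) - (ref))) / (ref)) > 10)

int main(void)
{
	int ref = 1000000;	/* previous sample, e.g. bytes/sec */
	int val = 44000000;	/* current sample: 44x larger */

	/* 100 * 43000000 = 4.3e9 wraps to 5032704 in a 32-bit int, so
	 * the broken macro sees a 5% change instead of 4300%. */
	printf("bad:   %d\n", IS_SIGNIFICANT_DIFF_BAD(val, ref));	/* 0 */
	printf("fixed: %d\n", IS_SIGNIFICANT_DIFF_FIXED(val, ref));	/* 1 */
	return 0;
}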