From 248e23b50e2da0753f3b5faa068939cbe9f8a75a Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 12 Feb 2017 11:26:33 +0100 Subject: batman-adv: Fix double free during fragment merge error The function batadv_frag_skb_buffer was supposed not to consume the skbuff on errors. This was followed in the helper function batadv_frag_insert_packet when the skb would potentially be inserted in the fragment queue. But it could happen that the next helper function batadv_frag_merge_packets would try to merge the fragments and fail. This results in a kfree_skb of all the enqueued fragments (including the just inserted one). batadv_recv_frag_packet would detect the error in batadv_frag_skb_buffer and try to free the skb again. The behavior of batadv_frag_skb_buffer (and its helper batadv_frag_insert_packet) must therefore be changed to always consume the skbuff to have a common behavior and avoid the double kfree_skb. Fixes: 610bfc6bc99b ("batman-adv: Receive fragmented packets and merge") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/fragmentation.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 0854ebd8613e..31e97e9aee0d 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -239,8 +239,10 @@ err_unlock: spin_unlock_bh(&chain->lock); err: - if (!ret) + if (!ret) { kfree(frag_entry_new); + kfree_skb(skb); + } return ret; } @@ -313,7 +315,7 @@ free: * * There are three possible outcomes: 1) Packet is merged: Return true and * set *skb to merged packet; 2) Packet is buffered: Return true and set *skb - * to NULL; 3) Error: Return false and leave skb as is. + * to NULL; 3) Error: Return false and free skb. * * Return: true when packet is merged or buffered, false when skb is not not * used. @@ -338,9 +340,9 @@ bool batadv_frag_skb_buffer(struct sk_buff **skb, goto out_err; out: - *skb = skb_out; ret = true; out_err: + *skb = skb_out; return ret; } -- cgit v1.2.3 From 51c6b429c0c95e67edd1cb0b548c5cf6a6604763 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Mon, 13 Feb 2017 20:44:31 +0100 Subject: batman-adv: Fix transmission of final, 16th fragment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Trying to split and transmit a unicast packet in 16 parts will fail for the final fragment: After having sent the 15th one with a frag_packet.no index of 14, we will increase the the index to 15 - and return with an error code immediately, even though one more fragment is due for transmission and allowed. Fixing this issue by moving the check before incrementing the index. While at it, adding an unlikely(), because the check is actually more of an assertion. 
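For illustration only (not part of the patch): a minimal, self-contained userspace model of the corrected check placement. The names split(), MAX_FRAGMENTS and frag_size are invented here; they stand in for batadv_frag_send_packet(), BATADV_FRAG_MAX_FRAGMENTS and the per-fragment payload size. Testing the index before consuming another slot lets the 16th and final fragment (index 15) through, while still rejecting anything that would need a 17th.

=====================================
#include <stdio.h>

#define MAX_FRAGMENTS 16

/* Split len bytes into chunks of at most frag_size bytes.
 * Returns the number of fragments used, or -1 if more than
 * MAX_FRAGMENTS would be needed.
 */
static int split(size_t len, size_t frag_size)
{
	unsigned int no = 0;	/* index of the fragment being built */

	/* Peel fragments off while more than one fragment's worth remains. */
	while (len > frag_size) {
		/* Check before advancing: index MAX_FRAGMENTS - 1 is still
		 * valid, it just has to be the final fragment.
		 */
		if (no == MAX_FRAGMENTS - 1)
			return -1;
		len -= frag_size;
		no++;
	}

	/* The remainder becomes the last fragment, with index no. */
	return no + 1;
}

int main(void)
{
	printf("%d\n", split(16 * 1400, 1400));	/* 16 fragments: accepted */
	printf("%d\n", split(17 * 1400, 1400));	/* would need 17: -1 */
	return 0;
}
=====================================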
Fixes: ee75ed88879a ("batman-adv: Fragment and send skbs larger than mtu") Signed-off-by: Linus Lüssing Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/fragmentation.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 31e97e9aee0d..11149e5be4e0 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -501,6 +501,12 @@ int batadv_frag_send_packet(struct sk_buff *skb, /* Eat and send fragments from the tail of skb */ while (skb->len > max_fragment_size) { + /* The initial check in this function should cover this case */ + if (unlikely(frag_header.no == BATADV_FRAG_MAX_FRAGMENTS - 1)) { + ret = -EINVAL; + goto put_primary_if; + } + skb_fragment = batadv_frag_create(skb, &frag_header, mtu); if (!skb_fragment) { ret = -ENOMEM; @@ -517,12 +523,6 @@ int batadv_frag_send_packet(struct sk_buff *skb, } frag_header.no++; - - /* The initial check in this function should cover this case */ - if (frag_header.no == BATADV_FRAG_MAX_FRAGMENTS - 1) { - ret = -EINVAL; - goto put_primary_if; - } } /* Make room for the fragment header. */ -- cgit v1.2.3 From 0328edc77d4f35014b35f32b46be0a7e16aae74f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 20 Feb 2017 08:59:16 +0100 Subject: mac80211: fix packet statistics for fast-RX When adding per-CPU statistics, which added statistics back to mac80211 for the fast-RX path, I evidently forgot to add the "stats->packets++" line. The reason for that is likely that I didn't see it since it's done in defragmentation for the regular RX path. Add the missing line to properly count received packets in the fast-RX case. Fixes: c9c5962b56c1 ("mac80211: enable collecting station statistics per-CPU") Reported-by: Oren Givon Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 50ca3828b124..a8443d8bc233 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3880,6 +3880,7 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, stats->last_rate = sta_stats_encode_rate(status); stats->fragments++; + stats->packets++; if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) { stats->last_signal = status->signal; -- cgit v1.2.3 From a9e9200d8661c1a0be8c39f93deb383dc940de35 Mon Sep 17 00:00:00 2001 From: Matt Chen Date: Sun, 22 Jan 2017 02:16:58 +0800 Subject: mac80211: flush delayed work when entering suspend The issue was found when entering suspend and resume. It triggers a warning in: mac80211/key.c: ieee80211_enable_keys() ... WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt || sdata->crypto_tx_tailroom_pending_dec); ... It points out sdata->crypto_tx_tailroom_pending_dec isn't cleaned up successfully in a delayed_work during suspend. Add a flush_delayed_work to fix it. 
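For illustration only (not part of the patch): a kernel-style sketch of the ordering this enforces, with made-up names (struct foo, tailroom_wk, foo_remove()) standing in for the real sdata fields. Flushing the delayed work before the interface is removed lets any queued tailroom adjustment run to completion, so the pending counters are already consistent when ieee80211_enable_keys() checks them on resume.

=====================================
#include <linux/workqueue.h>

/* Made-up structure standing in for ieee80211_sub_if_data. */
struct foo {
	struct delayed_work tailroom_wk;	/* deferred counter clean-up */
	int tailroom_pending_dec;
};

static void foo_remove(struct foo *f)
{
	/*
	 * Run any queued instance of the delayed work now, so the
	 * deferred decrement has been applied before the object is
	 * torn down (and before later code WARNs on leftover state).
	 */
	flush_delayed_work(&f->tailroom_wk);

	/* ... only now is it safe to remove the interface state ... */
}
=====================================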
Cc: stable@vger.kernel.org Signed-off-by: Matt Chen Signed-off-by: Johannes Berg --- net/mac80211/pm.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c index 28a3a0957c9e..76a8bcd8ef11 100644 --- a/net/mac80211/pm.c +++ b/net/mac80211/pm.c @@ -168,6 +168,7 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) break; } + flush_delayed_work(&sdata->dec_tailroom_needed_wk); drv_remove_interface(local, sdata); } -- cgit v1.2.3 From b7540d8f25c8034de7e4163fc23ac457bf057731 Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Mon, 6 Feb 2017 15:28:42 +0200 Subject: mac80211: don't reorder frames with SN smaller than SSN When RX aggregation starts, transmitter may continue send frames with SN smaller than SSN until the AddBA response is received. However, the reorder buffer is already initialized at this point, which will cause the drop of such frames as duplicates since the head SN of the reorder buffer is set to the SSN, which is bigger. Cc: stable@vger.kernel.org Signed-off-by: Sara Sharon Signed-off-by: Johannes Berg --- net/mac80211/agg-rx.c | 1 + net/mac80211/rx.c | 14 +++++++++++++- net/mac80211/sta_info.h | 6 ++++-- 3 files changed, 18 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 3b5fd4188f2a..58ad23a44109 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -398,6 +398,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, tid_agg_rx->timeout = timeout; tid_agg_rx->stored_mpdu_num = 0; tid_agg_rx->auto_seq = auto_seq; + tid_agg_rx->started = false; tid_agg_rx->reorder_buf_filtered = 0; status = WLAN_STATUS_SUCCESS; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index a8443d8bc233..28cc494a774d 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4,7 +4,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright(c) 2015 - 2016 Intel Deutschland GmbH + * Copyright(c) 2015 - 2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -1034,6 +1034,18 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata buf_size = tid_agg_rx->buf_size; head_seq_num = tid_agg_rx->head_seq_num; + /* + * If the current MPDU's SN is smaller than the SSN, it shouldn't + * be reordered. + */ + if (unlikely(!tid_agg_rx->started)) { + if (ieee80211_sn_less(mpdu_seq_num, head_seq_num)) { + ret = false; + goto out; + } + tid_agg_rx->started = true; + } + /* frame with out of date sequence number */ if (ieee80211_sn_less(mpdu_seq_num, head_seq_num)) { dev_kfree_skb(skb); diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index dd06ef0b8861..15599c70a38f 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -189,6 +189,7 @@ struct tid_ampdu_tx { * @auto_seq: used for offloaded BA sessions to automatically pick head_seq_and * and ssn. * @removed: this session is removed (but might have been found due to RCU) + * @started: this session has started (head ssn or higher was received) * * This structure's lifetime is managed by RCU, assignments to * the array holding it must hold the aggregation mutex. 
@@ -212,8 +213,9 @@ struct tid_ampdu_rx { u16 ssn; u16 buf_size; u16 timeout; - bool auto_seq; - bool removed; + u8 auto_seq:1, + removed:1, + started:1; }; /** -- cgit v1.2.3 From 890030d3c425f49abaa4acf60e20f288b599f980 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 22 Feb 2017 16:16:07 +0100 Subject: mac80211: don't handle filtered frames within a BA session When running a BA session, the driver (or the hardware) already takes care of retransmitting failed frames, since it has to keep the receiver reorder window in sync. Adding another layer of retransmit around that does not improve anything. In fact, it can only lead to some strong reordering with huge latency. Cc: stable@vger.kernel.org Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/mac80211/status.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/status.c b/net/mac80211/status.c index a3af6e1bfd98..05ccd55b5d83 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -51,7 +51,8 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, struct ieee80211_hdr *hdr = (void *)skb->data; int ac; - if (info->flags & IEEE80211_TX_CTL_NO_PS_BUFFER) { + if (info->flags & (IEEE80211_TX_CTL_NO_PS_BUFFER | + IEEE80211_TX_CTL_AMPDU)) { ieee80211_free_txskb(&local->hw, skb); return; } -- cgit v1.2.3 From d98937f4ea713d21e0fcc345919f86c877dd8d6f Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Mon, 20 Feb 2017 14:24:36 +0100 Subject: mac80211: fix power saving clients handling in iwlwifi iwlwifi now supports RSS and can't let mac80211 track the PS state based on the Rx frames since they can come out of order. iwlwifi is now advertising AP_LINK_PS, and uses explicit notifications to teach mac80211 about the PS state of the stations and the PS poll / uAPSD trigger frames coming our way from the peers. Because of that, the TIM stopped being maintained in mac80211. I tried to fix this in commit c68df2e7be0c ("mac80211: allow using AP_LINK_PS with mac80211-generated TIM IE") but that was later reverted by Felix in commit 6c18a6b4e799 ("Revert "mac80211: allow using AP_LINK_PS with mac80211-generated TIM IE") since it broke drivers that do not implement set_tim. Since none of the drivers that set AP_LINK_PS have the set_tim() handler set besides iwlwifi, I can bail out in __sta_info_recalc_tim if AP_LINK_PS AND .set_tim is not implemented. Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/sta_info.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 4774e663a411..8bb99d299cda 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -688,7 +688,7 @@ static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending) } /* No need to do anything if the driver does all */ - if (ieee80211_hw_check(&local->hw, AP_LINK_PS)) + if (ieee80211_hw_check(&local->hw, AP_LINK_PS) && !local->ops->set_tim) return; if (sta->dead) -- cgit v1.2.3 From 2595d259b667114431501bae51b45d6656b987d1 Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Mon, 20 Feb 2017 14:24:39 +0100 Subject: mac80211: shorten debug message Tracing is limited to 100 characters and this message passes the limit when there are a few buffered frames. Shorten it. 
Signed-off-by: Sara Sharon Signed-off-by: Johannes Berg --- net/mac80211/sta_info.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 8bb99d299cda..3323a2fb289b 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1264,7 +1264,7 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) sta_info_recalc_tim(sta); ps_dbg(sdata, - "STA %pM aid %d sending %d filtered/%d PS frames since STA not sleeping anymore\n", + "STA %pM aid %d sending %d filtered/%d PS frames since STA woke up\n", sta->sta.addr, sta->sta.aid, filtered, buffered); ieee80211_check_fast_xmit(sta); -- cgit v1.2.3 From 09e0a2fe102208cbaf39510b8b04dd524d7d2935 Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Mon, 20 Feb 2017 14:24:38 +0100 Subject: mac80211: fix typo in debug print Signed-off-by: Sara Sharon Signed-off-by: Johannes Berg --- net/mac80211/agg-rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 58ad23a44109..4456559cb056 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -85,7 +85,7 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, ht_dbg(sta->sdata, "Rx BA session stop requested for %pM tid %u %s reason: %d\n", sta->sta.addr, tid, - initiator == WLAN_BACK_RECIPIENT ? "recipient" : "inititator", + initiator == WLAN_BACK_RECIPIENT ? "recipient" : "initiator", (int)reason); if (drv_ampdu_action(local, sta->sdata, ¶ms)) -- cgit v1.2.3 From 19d19e960598161be92a7e4828eb7706c6410ce6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 27 Feb 2017 09:38:11 +0100 Subject: mac80211: use driver-indicated transmitter STA only for data frames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When I originally introduced using the driver-indicated station as an optimisation to avoid the hashtable lookup/iteration, of course it wasn't intended to really functionally change anything. I neglected, however, to take into account VLAN interfaces, which have the property that management and data frames are handled differently: data frames go directly to the station and the VLAN while management frames continue to be processed over the underlying/associated AP-type interface. As a consequence, when a driver used this optimisation for management frames and the user enabled VLANs, my change broke things since any management frames, particularly disassoc/deauth, were missed by hostapd. Fix this by restoring the original code path for non-data frames, they aren't critical for performance to begin with. This fixes https://bugzilla.kernel.org/show_bug.cgi?id=194713. Big thanks goes to Jarek who bisected the issue and provided a very detailed bug report, including the crucial information that he was using VLANs in his configuration. 
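For illustration only (not part of the patch): a stripped-down userspace model of the dispatch rule the change restores; the names are invented, and in mac80211 the real distinction is made on ieee80211_is_data(fc) before trusting the driver-provided pubsta.

=====================================
#include <stdbool.h>
#include <stdio.h>

enum frame_type { FRAME_DATA, FRAME_MGMT };

/* Only data frames may take the driver-provided station shortcut;
 * management frames must go through the normal lookup so they are
 * handled on the underlying AP interface, not the AP_VLAN.
 */
static const char *rx_path(enum frame_type type, bool driver_gave_sta)
{
	if (type == FRAME_DATA && driver_gave_sta)
		return "fast path: station / VLAN interface";
	return "slow path: lookup, underlying AP interface";
}

int main(void)
{
	printf("%s\n", rx_path(FRAME_DATA, true));	/* shortcut allowed */
	printf("%s\n", rx_path(FRAME_MGMT, true));	/* deauth reaches hostapd */
	return 0;
}
=====================================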
Cc: stable@vger.kernel.org Fixes: 771e846bea9e ("mac80211: allow passing transmitter station on RX") Reported-and-tested-by: Jarek Kamiński Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 28cc494a774d..e48724a6725e 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4086,15 +4086,17 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, ieee80211_is_beacon(hdr->frame_control))) ieee80211_scan_rx(local, skb); - if (pubsta) { - rx.sta = container_of(pubsta, struct sta_info, sta); - rx.sdata = rx.sta->sdata; - if (ieee80211_prepare_and_rx_handle(&rx, skb, true)) - return; - goto out; - } else if (ieee80211_is_data(fc)) { + if (ieee80211_is_data(fc)) { struct sta_info *sta, *prev_sta; + if (pubsta) { + rx.sta = container_of(pubsta, struct sta_info, sta); + rx.sdata = rx.sta->sdata; + if (ieee80211_prepare_and_rx_handle(&rx, skb, true)) + return; + goto out; + } + prev_sta = NULL; for_each_sta_info(local, hdr->addr2, sta, tmp) { -- cgit v1.2.3 From 29e09229d9f26129a39462fae0ddabc4d9533989 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 17 Feb 2017 08:39:28 +0100 Subject: netfilter: use skb_to_full_sk in ip_route_me_harder inet_sk(skb->sk) is illegal in case skb is attached to request socket. Fixes: ca6fb0651883 ("tcp: attach SYNACK messages to request sockets instead of listener") Reported by: Daniel J Blueman Signed-off-by: Florian Westphal Tested-by: Daniel J Blueman Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index b3cc1335adbc..c0cc6aa8cfaa 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -23,7 +23,8 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t struct rtable *rt; struct flowi4 fl4 = {}; __be32 saddr = iph->saddr; - __u8 flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; + const struct sock *sk = skb_to_full_sk(skb); + __u8 flags = sk ? inet_sk_flowi_flags(sk) : 0; struct net_device *dev = skb_dst(skb)->dev; unsigned int hh_len; @@ -40,7 +41,7 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t fl4.daddr = iph->daddr; fl4.saddr = saddr; fl4.flowi4_tos = RT_TOS(iph->tos); - fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; + fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0; if (!fl4.flowi4_oif) fl4.flowi4_oif = l3mdev_master_ifindex(dev); fl4.flowi4_mark = skb->mark; @@ -61,7 +62,7 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t xfrm_decode_session(skb, flowi4_to_flowi(&fl4), AF_INET) == 0) { struct dst_entry *dst = skb_dst(skb); skb_dst_set(skb, NULL); - dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0); + dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), sk, 0); if (IS_ERR(dst)) return PTR_ERR(dst); skb_dst_set(skb, dst); -- cgit v1.2.3 From 540b1c48c37ac0ad66212004db21e1ff7e2d78be Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 27 Feb 2017 15:43:06 +0000 Subject: rxrpc: Fix deadlock between call creation and sendmsg/recvmsg All the routines by which rxrpc is accessed from the outside are serialised by means of the socket lock (sendmsg, recvmsg, bind, rxrpc_kernel_begin_call(), ...) 
and this presents a problem: (1) If a number of calls on the same socket are in the process of connection to the same peer, a maximum of four concurrent live calls are permitted before further calls need to wait for a slot. (2) If a call is waiting for a slot, it is deep inside sendmsg() or rxrpc_kernel_begin_call() and the entry function is holding the socket lock. (3) sendmsg() and recvmsg() or the in-kernel equivalents are prevented from servicing the other calls as they need to take the socket lock to do so. (4) The socket is stuck until a call is aborted and makes its slot available to the waiter. Fix this by: (1) Provide each call with a mutex ('user_mutex') that arbitrates access by the users of rxrpc separately for each specific call. (2) Make rxrpc_sendmsg() and rxrpc_recvmsg() unlock the socket as soon as they've got a call and taken its mutex. Note that I'm returning EWOULDBLOCK from recvmsg() if MSG_DONTWAIT is set but someone else has the lock. Should I instead only return EWOULDBLOCK if there's nothing currently to be done on a socket, and sleep in this particular instance because there is something to be done, but we appear to be blocked by the interrupt handler doing its ping? (3) Make rxrpc_new_client_call() unlock the socket after allocating a new call, locking its user mutex and adding it to the socket's call tree. The call is returned locked so that sendmsg() can add data to it immediately. From the moment the call is in the socket tree, it is subject to access by sendmsg() and recvmsg() - even if it isn't connected yet. (4) Lock new service calls in the UDP data_ready handler (in rxrpc_new_incoming_call()) because they may already be in the socket's tree and the data_ready handler makes them live immediately if a user ID has already been preassigned. Note that the new call is locked before any notifications are sent that it is live, so doing mutex_trylock() *ought* to always succeed. Userspace is prevented from doing sendmsg() on calls that are in a too-early state in rxrpc_do_sendmsg(). (5) Make rxrpc_new_incoming_call() return the call with the user mutex held so that a ping can be scheduled immediately under it. Note that it might be worth moving the ping call into rxrpc_new_incoming_call() and then we can drop the mutex there. (6) Make rxrpc_accept_call() take the lock on the call it is accepting and release the socket after adding the call to the socket's tree. This is slightly tricky as we've dequeued the call by that point and have to requeue it. Note that requeuing emits a trace event. (7) Make rxrpc_kernel_send_data() and rxrpc_kernel_recv_data() take the new mutex immediately and don't bother with the socket mutex at all. This patch has the nice bonus that calls on the same socket are now to some extent parallelisable. Note that we might want to move rxrpc_service_prealloc() calls out from the socket lock and give it its own lock, so that we don't hang progress in other calls because we're waiting for the allocator. We probably also want to avoid calling rxrpc_notify_socket() from within the socket lock (rxrpc_accept_call()). Signed-off-by: David Howells Tested-by: Marc Dionne Signed-off-by: David S. 
Miller --- include/trace/events/rxrpc.h | 2 ++ net/rxrpc/af_rxrpc.c | 12 +++++++--- net/rxrpc/ar-internal.h | 1 + net/rxrpc/call_accept.c | 48 +++++++++++++++++++++++++++++++++++++ net/rxrpc/call_object.c | 18 ++++++++++++-- net/rxrpc/input.c | 1 + net/rxrpc/recvmsg.c | 39 +++++++++++++++++++++++++----- net/rxrpc/sendmsg.c | 57 +++++++++++++++++++++++++++++++++++--------- 8 files changed, 156 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 593f586545eb..39123c06a566 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -119,6 +119,7 @@ enum rxrpc_recvmsg_trace { rxrpc_recvmsg_full, rxrpc_recvmsg_hole, rxrpc_recvmsg_next, + rxrpc_recvmsg_requeue, rxrpc_recvmsg_return, rxrpc_recvmsg_terminal, rxrpc_recvmsg_to_be_accepted, @@ -277,6 +278,7 @@ enum rxrpc_congest_change { EM(rxrpc_recvmsg_full, "FULL") \ EM(rxrpc_recvmsg_hole, "HOLE") \ EM(rxrpc_recvmsg_next, "NEXT") \ + EM(rxrpc_recvmsg_requeue, "REQU") \ EM(rxrpc_recvmsg_return, "RETN") \ EM(rxrpc_recvmsg_terminal, "TERM") \ EM(rxrpc_recvmsg_to_be_accepted, "TBAC") \ diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 199b46e93e64..7fb59c3f1542 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -290,10 +290,11 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, cp.exclusive = false; cp.service_id = srx->srx_service; call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, gfp); + /* The socket has been unlocked. */ if (!IS_ERR(call)) call->notify_rx = notify_rx; - release_sock(&rx->sk); + mutex_unlock(&call->user_mutex); _leave(" = %p", call); return call; } @@ -310,7 +311,10 @@ EXPORT_SYMBOL(rxrpc_kernel_begin_call); void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call) { _enter("%d{%d}", call->debug_id, atomic_read(&call->usage)); + + mutex_lock(&call->user_mutex); rxrpc_release_call(rxrpc_sk(sock->sk), call); + mutex_unlock(&call->user_mutex); rxrpc_put_call(call, rxrpc_call_put_kernel); } EXPORT_SYMBOL(rxrpc_kernel_end_call); @@ -450,14 +454,16 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len) case RXRPC_SERVER_BOUND: case RXRPC_SERVER_LISTENING: ret = rxrpc_do_sendmsg(rx, m, len); - break; + /* The socket has been unlocked */ + goto out; default: ret = -EINVAL; - break; + goto error_unlock; } error_unlock: release_sock(&rx->sk); +out: _leave(" = %d", ret); return ret; } diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 12be432be9b2..26a7b1db1361 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -467,6 +467,7 @@ struct rxrpc_call { struct rxrpc_connection *conn; /* connection carrying call */ struct rxrpc_peer *peer; /* Peer record for remote address */ struct rxrpc_sock __rcu *socket; /* socket responsible */ + struct mutex user_mutex; /* User access mutex */ ktime_t ack_at; /* When deferred ACK needs to happen */ ktime_t resend_at; /* When next resend needs to happen */ ktime_t ping_at; /* When next to send a ping */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 7c4c64ab8da2..0ed181f53f32 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -323,6 +323,8 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, * * If we want to report an error, we mark the skb with the packet type and * abort code and return NULL. + * + * The call is returned with the user access mutex held. 
*/ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, struct rxrpc_connection *conn, @@ -371,6 +373,18 @@ found_service: trace_rxrpc_receive(call, rxrpc_receive_incoming, sp->hdr.serial, sp->hdr.seq); + /* Lock the call to prevent rxrpc_kernel_send/recv_data() and + * sendmsg()/recvmsg() inconveniently stealing the mutex once the + * notification is generated. + * + * The BUG should never happen because the kernel should be well + * behaved enough not to access the call before the first notification + * event and userspace is prevented from doing so until the state is + * appropriate. + */ + if (!mutex_trylock(&call->user_mutex)) + BUG(); + /* Make the call live. */ rxrpc_incoming_call(rx, call, skb); conn = call->conn; @@ -429,10 +443,12 @@ out: /* * handle acceptance of a call by userspace * - assign the user call ID to the call at the front of the queue + * - called with the socket locked. */ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, unsigned long user_call_ID, rxrpc_notify_rx_t notify_rx) + __releases(&rx->sk.sk_lock.slock) { struct rxrpc_call *call; struct rb_node *parent, **pp; @@ -446,6 +462,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, if (list_empty(&rx->to_be_accepted)) { write_unlock(&rx->call_lock); + release_sock(&rx->sk); kleave(" = -ENODATA [empty]"); return ERR_PTR(-ENODATA); } @@ -470,10 +487,39 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, */ call = list_entry(rx->to_be_accepted.next, struct rxrpc_call, accept_link); + write_unlock(&rx->call_lock); + + /* We need to gain the mutex from the interrupt handler without + * upsetting lockdep, so we have to release it there and take it here. + * We are, however, still holding the socket lock, so other accepts + * must wait for us and no one can add the user ID behind our backs. + */ + if (mutex_lock_interruptible(&call->user_mutex) < 0) { + release_sock(&rx->sk); + kleave(" = -ERESTARTSYS"); + return ERR_PTR(-ERESTARTSYS); + } + + write_lock(&rx->call_lock); list_del_init(&call->accept_link); sk_acceptq_removed(&rx->sk); rxrpc_see_call(call); + /* Find the user ID insertion point. 
*/ + pp = &rx->calls.rb_node; + parent = NULL; + while (*pp) { + parent = *pp; + call = rb_entry(parent, struct rxrpc_call, sock_node); + + if (user_call_ID < call->user_call_ID) + pp = &(*pp)->rb_left; + else if (user_call_ID > call->user_call_ID) + pp = &(*pp)->rb_right; + else + BUG(); + } + write_lock_bh(&call->state_lock); switch (call->state) { case RXRPC_CALL_SERVER_ACCEPTING: @@ -499,6 +545,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, write_unlock(&rx->call_lock); rxrpc_notify_socket(call); rxrpc_service_prealloc(rx, GFP_KERNEL); + release_sock(&rx->sk); _leave(" = %p{%d}", call, call->debug_id); return call; @@ -515,6 +562,7 @@ id_in_use: write_unlock(&rx->call_lock); out: rxrpc_service_prealloc(rx, GFP_KERNEL); + release_sock(&rx->sk); _leave(" = %d", ret); return ERR_PTR(ret); } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 8b94db3c9b2e..d79cd36987a9 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -115,6 +115,7 @@ struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) if (!call->rxtx_annotations) goto nomem_2; + mutex_init(&call->user_mutex); setup_timer(&call->timer, rxrpc_call_timer_expired, (unsigned long)call); INIT_WORK(&call->processor, &rxrpc_process_call); @@ -194,14 +195,16 @@ static void rxrpc_start_call_timer(struct rxrpc_call *call) } /* - * set up a call for the given data - * - called in process context with IRQs enabled + * Set up a call for the given parameters. + * - Called with the socket lock held, which it must release. + * - If it returns a call, the call's lock will need releasing by the caller. */ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, struct rxrpc_conn_parameters *cp, struct sockaddr_rxrpc *srx, unsigned long user_call_ID, gfp_t gfp) + __releases(&rx->sk.sk_lock.slock) { struct rxrpc_call *call, *xcall; struct rb_node *parent, **pp; @@ -212,6 +215,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, call = rxrpc_alloc_client_call(srx, gfp); if (IS_ERR(call)) { + release_sock(&rx->sk); _leave(" = %ld", PTR_ERR(call)); return call; } @@ -219,6 +223,11 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, trace_rxrpc_call(call, rxrpc_call_new_client, atomic_read(&call->usage), here, (const void *)user_call_ID); + /* We need to protect a partially set up call against the user as we + * will be acting outside the socket lock. + */ + mutex_lock(&call->user_mutex); + /* Publish the call, even though it is incompletely set up as yet */ write_lock(&rx->call_lock); @@ -250,6 +259,9 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, list_add_tail(&call->link, &rxrpc_calls); write_unlock(&rxrpc_call_lock); + /* From this point on, the call is protected by its own lock. */ + release_sock(&rx->sk); + /* Set up or get a connection record and set the protocol parameters, * including channel number and call ID. 
*/ @@ -279,6 +291,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, */ error_dup_user_ID: write_unlock(&rx->call_lock); + release_sock(&rx->sk); ret = -EEXIST; error: @@ -287,6 +300,7 @@ error: trace_rxrpc_call(call, rxrpc_call_error, atomic_read(&call->usage), here, ERR_PTR(ret)); rxrpc_release_call(rx, call); + mutex_unlock(&call->user_mutex); rxrpc_put_call(call, rxrpc_call_put); _leave(" = %d", ret); return ERR_PTR(ret); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 78ec33477adf..9f4cfa25af7c 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1194,6 +1194,7 @@ void rxrpc_data_ready(struct sock *udp_sk) goto reject_packet; } rxrpc_send_ping(call, skb, skew); + mutex_unlock(&call->user_mutex); } rxrpc_input_call_packet(call, skb, skew); diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index f3a688e10843..22447dbcc380 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -487,6 +487,20 @@ try_again: trace_rxrpc_recvmsg(call, rxrpc_recvmsg_dequeue, 0, 0, 0, 0); + /* We're going to drop the socket lock, so we need to lock the call + * against interference by sendmsg. + */ + if (!mutex_trylock(&call->user_mutex)) { + ret = -EWOULDBLOCK; + if (flags & MSG_DONTWAIT) + goto error_requeue_call; + ret = -ERESTARTSYS; + if (mutex_lock_interruptible(&call->user_mutex) < 0) + goto error_requeue_call; + } + + release_sock(&rx->sk); + if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) BUG(); @@ -502,7 +516,7 @@ try_again: &call->user_call_ID); } if (ret < 0) - goto error; + goto error_unlock_call; } if (msg->msg_name) { @@ -533,12 +547,12 @@ try_again: } if (ret < 0) - goto error; + goto error_unlock_call; if (call->state == RXRPC_CALL_COMPLETE) { ret = rxrpc_recvmsg_term(call, msg); if (ret < 0) - goto error; + goto error_unlock_call; if (!(flags & MSG_PEEK)) rxrpc_release_call(rx, call); msg->msg_flags |= MSG_EOR; @@ -551,8 +565,21 @@ try_again: msg->msg_flags &= ~MSG_MORE; ret = copied; -error: +error_unlock_call: + mutex_unlock(&call->user_mutex); rxrpc_put_call(call, rxrpc_call_put); + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret); + return ret; + +error_requeue_call: + if (!(flags & MSG_PEEK)) { + write_lock_bh(&rx->recvmsg_lock); + list_add(&call->recvmsg_link, &rx->recvmsg_q); + write_unlock_bh(&rx->recvmsg_lock); + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_requeue, 0, 0, 0, 0); + } else { + rxrpc_put_call(call, rxrpc_call_put); + } error_no_call: release_sock(&rx->sk); trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret); @@ -609,7 +636,7 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, iov.iov_len = size - *_offset; iov_iter_kvec(&iter, ITER_KVEC | READ, &iov, 1, size - *_offset); - lock_sock(sock->sk); + mutex_lock(&call->user_mutex); switch (call->state) { case RXRPC_CALL_CLIENT_RECV_REPLY: @@ -648,7 +675,7 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, read_phase_complete: ret = 1; out: - release_sock(sock->sk); + mutex_unlock(&call->user_mutex); _leave(" = %d [%zu,%d]", ret, *_offset, *_abort); return ret; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 0a6ef217aa8a..31c1538c1a8d 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -59,9 +59,12 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, } trace_rxrpc_transmit(call, rxrpc_transmit_wait); - release_sock(&rx->sk); + mutex_unlock(&call->user_mutex); *timeo = schedule_timeout(*timeo); - lock_sock(&rx->sk); + if (mutex_lock_interruptible(&call->user_mutex) < 0) { + ret = 
sock_intr_errno(*timeo); + break; + } } remove_wait_queue(&call->waitq, &myself); @@ -171,7 +174,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, /* * send data through a socket * - must be called in process context - * - caller holds the socket locked + * - The caller holds the call user access mutex, but not the socket lock. */ static int rxrpc_send_data(struct rxrpc_sock *rx, struct rxrpc_call *call, @@ -437,10 +440,13 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, /* * Create a new client call for sendmsg(). + * - Called with the socket lock held, which it must release. + * - If it returns a call, the call's lock will need releasing by the caller. */ static struct rxrpc_call * rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, unsigned long user_call_ID, bool exclusive) + __releases(&rx->sk.sk_lock.slock) { struct rxrpc_conn_parameters cp; struct rxrpc_call *call; @@ -450,8 +456,10 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, _enter(""); - if (!msg->msg_name) + if (!msg->msg_name) { + release_sock(&rx->sk); return ERR_PTR(-EDESTADDRREQ); + } key = rx->key; if (key && !rx->key->payload.data[0]) @@ -464,6 +472,7 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, cp.exclusive = rx->exclusive | exclusive; cp.service_id = srx->srx_service; call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, GFP_KERNEL); + /* The socket is now unlocked */ _leave(" = %p\n", call); return call; @@ -475,6 +484,7 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, * - the socket may be either a client socket or a server socket */ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) + __releases(&rx->sk.sk_lock.slock) { enum rxrpc_command cmd; struct rxrpc_call *call; @@ -488,12 +498,14 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) ret = rxrpc_sendmsg_cmsg(msg, &user_call_ID, &cmd, &abort_code, &exclusive); if (ret < 0) - return ret; + goto error_release_sock; if (cmd == RXRPC_CMD_ACCEPT) { + ret = -EINVAL; if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) - return -EINVAL; + goto error_release_sock; call = rxrpc_accept_call(rx, user_call_ID, NULL); + /* The socket is now unlocked. */ if (IS_ERR(call)) return PTR_ERR(call); rxrpc_put_call(call, rxrpc_call_put); @@ -502,12 +514,29 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) call = rxrpc_find_call_by_user_ID(rx, user_call_ID); if (!call) { + ret = -EBADSLT; if (cmd != RXRPC_CMD_SEND_DATA) - return -EBADSLT; + goto error_release_sock; + ret = -EBUSY; + if (call->state == RXRPC_CALL_UNINITIALISED || + call->state == RXRPC_CALL_CLIENT_AWAIT_CONN || + call->state == RXRPC_CALL_SERVER_PREALLOC || + call->state == RXRPC_CALL_SERVER_SECURING || + call->state == RXRPC_CALL_SERVER_ACCEPTING) + goto error_release_sock; call = rxrpc_new_client_call_for_sendmsg(rx, msg, user_call_ID, exclusive); + /* The socket is now unlocked... */ if (IS_ERR(call)) return PTR_ERR(call); + /* ... and we have the call lock. 
*/ + } else { + ret = mutex_lock_interruptible(&call->user_mutex); + release_sock(&rx->sk); + if (ret < 0) { + ret = -ERESTARTSYS; + goto error_put; + } } _debug("CALL %d USR %lx ST %d on CONN %p", @@ -535,9 +564,15 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) ret = rxrpc_send_data(rx, call, msg, len); } + mutex_unlock(&call->user_mutex); +error_put: rxrpc_put_call(call, rxrpc_call_put); _leave(" = %d", ret); return ret; + +error_release_sock: + release_sock(&rx->sk); + return ret; } /** @@ -562,7 +597,7 @@ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, ASSERTCMP(msg->msg_name, ==, NULL); ASSERTCMP(msg->msg_control, ==, NULL); - lock_sock(sock->sk); + mutex_lock(&call->user_mutex); _debug("CALL %d USR %lx ST %d on CONN %p", call->debug_id, call->user_call_ID, call->state, call->conn); @@ -577,7 +612,7 @@ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, ret = rxrpc_send_data(rxrpc_sk(sock->sk), call, msg, len); } - release_sock(sock->sk); + mutex_unlock(&call->user_mutex); _leave(" = %d", ret); return ret; } @@ -598,12 +633,12 @@ void rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call, { _enter("{%d},%d,%d,%s", call->debug_id, abort_code, error, why); - lock_sock(sock->sk); + mutex_lock(&call->user_mutex); if (rxrpc_abort_call(why, call, 0, abort_code, error)) rxrpc_send_abort_packet(call); - release_sock(sock->sk); + mutex_unlock(&call->user_mutex); _leave(""); } -- cgit v1.2.3 From 5179b26694c92373275e4933f5d0ff32d585c675 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 28 Feb 2017 12:41:29 +0800 Subject: sctp: call rcu_read_lock before checking for duplicate transport nodes Commit cd2b70875058 ("sctp: check duplicate node before inserting a new transport") called rhltable_lookup() to check for the duplicate transport node in transport rhashtable. But rhltable_lookup() doesn't call rcu_read_lock inside, it could cause a use-after-free issue if it tries to dereference the node that another cpu has freed it. Note that sock lock can not avoid this as it is per sock. This patch is to fix it by calling rcu_read_lock before checking for duplicate transport nodes. Fixes: cd2b70875058 ("sctp: check duplicate node before inserting a new transport") Reported-by: Andrey Konovalov Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/input.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sctp/input.c b/net/sctp/input.c index fc458968fe4b..2a28ab20487f 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -884,14 +884,17 @@ int sctp_hash_transport(struct sctp_transport *t) arg.paddr = &t->ipaddr; arg.lport = htons(t->asoc->base.bind_addr.port); + rcu_read_lock(); list = rhltable_lookup(&sctp_transport_hashtable, &arg, sctp_hash_params); rhl_for_each_entry_rcu(transport, tmp, list, node) if (transport->asoc->ep == t->asoc->ep) { + rcu_read_unlock(); err = -EEXIST; goto out; } + rcu_read_unlock(); err = rhltable_insert_key(&sctp_transport_hashtable, &arg, &t->node, sctp_hash_params); -- cgit v1.2.3 From 4f7bfb3982e02aacc5fb6e1a121e5326c1778ac3 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Tue, 28 Feb 2017 01:45:40 -0500 Subject: rds: ib: add the static type to the variables The variables rds_ib_mr_1m_pool_size and rds_ib_mr_8k_pool_size are used only in the ib.c file. As such, the static type is added to limit them in this file. Cc: Joe Jin Cc: Junxiao Bi Signed-off-by: Zhu Yanjun Acked-by: Santosh Shilimkar Signed-off-by: David S. 
Miller --- net/rds/ib.c | 4 ++-- net/rds/ib_mr.h | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rds/ib.c b/net/rds/ib.c index 91fe46f1e4cc..0f557b243311 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -45,8 +45,8 @@ #include "ib.h" #include "ib_mr.h" -unsigned int rds_ib_mr_1m_pool_size = RDS_MR_1M_POOL_SIZE; -unsigned int rds_ib_mr_8k_pool_size = RDS_MR_8K_POOL_SIZE; +static unsigned int rds_ib_mr_1m_pool_size = RDS_MR_1M_POOL_SIZE; +static unsigned int rds_ib_mr_8k_pool_size = RDS_MR_8K_POOL_SIZE; unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT; module_param(rds_ib_mr_1m_pool_size, int, 0444); diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h index 24c086db4511..5d6e98a79a5e 100644 --- a/net/rds/ib_mr.h +++ b/net/rds/ib_mr.h @@ -107,8 +107,6 @@ struct rds_ib_mr_pool { }; extern struct workqueue_struct *rds_ib_mr_wq; -extern unsigned int rds_ib_mr_1m_pool_size; -extern unsigned int rds_ib_mr_8k_pool_size; extern bool prefer_frmr; struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev, -- cgit v1.2.3 From 39e6c8208d7b6fb9d2047850fb3327db567b564b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Feb 2017 10:34:50 -0800 Subject: net: solve a NAPI race While playing with mlx4 hardware timestamping of RX packets, I found that some packets were received by TCP stack with a ~200 ms delay... Since the timestamp was provided by the NIC, and my probe was added in tcp_v4_rcv() while in BH handler, I was confident it was not a sender issue, or a drop in the network. This would happen with a very low probability, but hurting RPC workloads. A NAPI driver normally arms the IRQ after the napi_complete_done(), after NAPI_STATE_SCHED is cleared, so that the hard irq handler can grab it. Problem is that if another point in the stack grabs NAPI_STATE_SCHED bit while IRQ are not disabled, we might have later an IRQ firing and finding this bit set, right before napi_complete_done() clears it. This can happen with busy polling users, or if gro_flush_timeout is used. But some other uses of napi_schedule() in drivers can cause this as well. thread 1 thread 2 (could be on same cpu, or not) // busy polling or napi_watchdog() napi_schedule(); ... napi->poll() device polling: read 2 packets from ring buffer Additional 3rd packet is available. device hard irq // does nothing because NAPI_STATE_SCHED bit is owned by thread 1 napi_schedule(); napi_complete_done(napi, 2); rearm_irq(); Note that rearm_irq() will not force the device to send an additional IRQ for the packet it already signaled (3rd packet in my example) This patch adds a new NAPI_STATE_MISSED bit, that napi_schedule_prep() can set if it could not grab NAPI_STATE_SCHED Then napi_complete_done() properly reschedules the napi to make sure we do not miss something. Since we manipulate multiple bits at once, use cmpxchg() like in sk_busy_loop() to provide proper transactions. In v2, I changed napi_watchdog() to use a relaxed variant of napi_schedule_prep() : No need to set NAPI_STATE_MISSED from this point. In v3, I added more details in the changelog and clears NAPI_STATE_MISSED in busy_poll_stop() In v4, I added the ideas given by Alexander Duyck in v3 review Signed-off-by: Eric Dumazet Cc: Alexander Duyck Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 29 ++++++------------ net/core/dev.c | 76 ++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 81 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f40f0ab3847a..97456b2539e4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -330,6 +330,7 @@ struct napi_struct { enum { NAPI_STATE_SCHED, /* Poll is scheduled */ + NAPI_STATE_MISSED, /* reschedule a napi */ NAPI_STATE_DISABLE, /* Disable pending */ NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ NAPI_STATE_HASHED, /* In NAPI hash (busy polling possible) */ @@ -338,12 +339,13 @@ enum { }; enum { - NAPIF_STATE_SCHED = (1UL << NAPI_STATE_SCHED), - NAPIF_STATE_DISABLE = (1UL << NAPI_STATE_DISABLE), - NAPIF_STATE_NPSVC = (1UL << NAPI_STATE_NPSVC), - NAPIF_STATE_HASHED = (1UL << NAPI_STATE_HASHED), - NAPIF_STATE_NO_BUSY_POLL = (1UL << NAPI_STATE_NO_BUSY_POLL), - NAPIF_STATE_IN_BUSY_POLL = (1UL << NAPI_STATE_IN_BUSY_POLL), + NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED), + NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED), + NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE), + NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC), + NAPIF_STATE_HASHED = BIT(NAPI_STATE_HASHED), + NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), + NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), }; enum gro_result { @@ -414,20 +416,7 @@ static inline bool napi_disable_pending(struct napi_struct *n) return test_bit(NAPI_STATE_DISABLE, &n->state); } -/** - * napi_schedule_prep - check if NAPI can be scheduled - * @n: NAPI context - * - * Test if NAPI routine is already running, and if not mark - * it as running. This is used as a condition variable to - * insure only one NAPI poll instance runs. We also make - * sure there is no pending NAPI disable. - */ -static inline bool napi_schedule_prep(struct napi_struct *n) -{ - return !napi_disable_pending(n) && - !test_and_set_bit(NAPI_STATE_SCHED, &n->state); -} +bool napi_schedule_prep(struct napi_struct *n); /** * napi_schedule - schedule NAPI poll diff --git a/net/core/dev.c b/net/core/dev.c index 304f2deae5f9..e63bf61b19be 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4883,6 +4883,39 @@ void __napi_schedule(struct napi_struct *n) } EXPORT_SYMBOL(__napi_schedule); +/** + * napi_schedule_prep - check if napi can be scheduled + * @n: napi context + * + * Test if NAPI routine is already running, and if not mark + * it as running. This is used as a condition variable + * insure only one NAPI poll instance runs. We also make + * sure there is no pending NAPI disable. 
+ */ +bool napi_schedule_prep(struct napi_struct *n) +{ + unsigned long val, new; + + do { + val = READ_ONCE(n->state); + if (unlikely(val & NAPIF_STATE_DISABLE)) + return false; + new = val | NAPIF_STATE_SCHED; + + /* Sets STATE_MISSED bit if STATE_SCHED was already set + * This was suggested by Alexander Duyck, as compiler + * emits better code than : + * if (val & NAPIF_STATE_SCHED) + * new |= NAPIF_STATE_MISSED; + */ + new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * + NAPIF_STATE_MISSED; + } while (cmpxchg(&n->state, val, new) != val); + + return !(val & NAPIF_STATE_SCHED); +} +EXPORT_SYMBOL(napi_schedule_prep); + /** * __napi_schedule_irqoff - schedule for receive * @n: entry to schedule @@ -4897,7 +4930,7 @@ EXPORT_SYMBOL(__napi_schedule_irqoff); bool napi_complete_done(struct napi_struct *n, int work_done) { - unsigned long flags; + unsigned long flags, val, new; /* * 1) Don't let napi dequeue from the cpu poll list @@ -4927,7 +4960,27 @@ bool napi_complete_done(struct napi_struct *n, int work_done) list_del_init(&n->poll_list); local_irq_restore(flags); } - WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state)); + + do { + val = READ_ONCE(n->state); + + WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); + + new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED); + + /* If STATE_MISSED was set, leave STATE_SCHED set, + * because we will call napi->poll() one more time. + * This C code was suggested by Alexander Duyck to help gcc. + */ + new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED * + NAPIF_STATE_SCHED; + } while (cmpxchg(&n->state, val, new) != val); + + if (unlikely(val & NAPIF_STATE_MISSED)) { + __napi_schedule(n); + return false; + } + return true; } EXPORT_SYMBOL(napi_complete_done); @@ -4953,6 +5006,16 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) { int rc; + /* Busy polling means there is a high chance device driver hard irq + * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was + * set in napi_schedule_prep(). + * Since we are about to call napi->poll() once more, we can safely + * clear NAPI_STATE_MISSED. + * + * Note: x86 could use a single "lock and ..." instruction + * to perform these two clear_bit() + */ + clear_bit(NAPI_STATE_MISSED, &napi->state); clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); local_bh_disable(); @@ -5088,8 +5151,13 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) struct napi_struct *napi; napi = container_of(timer, struct napi_struct, timer); - if (napi->gro_list) - napi_schedule_irqoff(napi); + + /* Note : we use a relaxed variant of napi_schedule_prep() not setting + * NAPI_STATE_MISSED, since we do not react to a device IRQ. + */ + if (napi->gro_list && !napi_disable_pending(napi) && + !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) + __napi_schedule_irqoff(napi); return HRTIMER_NORESTART; } -- cgit v1.2.3 From 8c171d6ca56c6891372a97af26b58b2cfad7fd9a Mon Sep 17 00:00:00 2001 From: Felix Jia Date: Mon, 27 Feb 2017 12:41:23 +1300 Subject: net/ipv6: avoid possible dead locking on addr_gen_mode sysctl The addr_gen_mode variable can be accessed by both sysctl and netlink. Repleacd rtnl_lock() with rtnl_trylock() protect the sysctl operation to avoid the possbile dead lock.` Signed-off-by: Felix Jia Signed-off-by: David S. 
Miller --- net/ipv6/addrconf.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 3a2025f5bf2c..cfc485a8e1c0 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5692,13 +5692,18 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, struct inet6_dev *idev = (struct inet6_dev *)ctl->extra1; struct net *net = (struct net *)ctl->extra2; + if (!rtnl_trylock()) + return restart_syscall(); + ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (write) { new_val = *((int *)ctl->data); - if (check_addr_gen_mode(new_val) < 0) - return -EINVAL; + if (check_addr_gen_mode(new_val) < 0) { + ret = -EINVAL; + goto out; + } /* request for default */ if (&net->ipv6.devconf_dflt->addr_gen_mode == ctl->data) { @@ -5707,20 +5712,23 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, /* request for individual net device */ } else { if (!idev) - return ret; + goto out; - if (check_stable_privacy(idev, net, new_val) < 0) - return -EINVAL; + if (check_stable_privacy(idev, net, new_val) < 0) { + ret = -EINVAL; + goto out; + } if (idev->cnf.addr_gen_mode != new_val) { idev->cnf.addr_gen_mode = new_val; - rtnl_lock(); addrconf_dev_config(idev->dev); - rtnl_unlock(); } } } +out: + rtnl_unlock(); + return ret; } -- cgit v1.2.3 From 3b45a4106f146c336cbcaccb9d8d0fa0e5c3dc1d Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 27 Feb 2017 20:59:39 +0800 Subject: net: route: add missing nla_policy entry for RTA_MARK attribute This will add stricter validating for RTA_MARK attribute. Signed-off-by: Liping Zhang Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 1 + net/ipv6/route.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index b39a791f6756..42bfd08109dd 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -622,6 +622,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, [RTA_ENCAP] = { .type = NLA_NESTED }, [RTA_UID] = { .type = NLA_U32 }, + [RTA_MARK] = { .type = NLA_U32 }, }; static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f54f4265b37f..d94f1dfa54c8 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2891,6 +2891,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_ENCAP] = { .type = NLA_NESTED }, [RTA_EXPIRES] = { .type = NLA_U32 }, [RTA_UID] = { .type = NLA_U32 }, + [RTA_MARK] = { .type = NLA_U32 }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, -- cgit v1.2.3 From df2c43343b47a7138431f431118eb5819e205365 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 1 Mar 2017 16:50:45 +0200 Subject: bridge: Fix error path in nbp_vlan_init Fix error path order in nbp_vlan_init, so if switchdev_port_attr_set call failes, the vlan_hash wouldn't be destroyed before inited. Fixes: efa5356b0d97 ("bridge: per vlan dst_metadata netlink support") CC: Roopa Prabhu Signed-off-by: Yotam Gigi Acked-by: Roopa Prabhu Reviewed-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/bridge/br_vlan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 62e68c0dc687..b838213c408e 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -997,10 +997,10 @@ err_vlan_add: RCU_INIT_POINTER(p->vlgrp, NULL); synchronize_rcu(); vlan_tunnel_deinit(vg); -err_vlan_enabled: err_tunnel_init: rhashtable_destroy(&vg->vlan_hash); err_rhtbl: +err_vlan_enabled: kfree(vg); goto out; -- cgit v1.2.3 From 449809a66c1d0b1563dee84493e14bf3104d2d7e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 1 Mar 2017 08:39:49 -0800 Subject: tcp/dccp: block BH for SYN processing SYN processing really was meant to be handled from BH. When I got rid of BH blocking while processing socket backlog in commit 5413d1babe8f ("net: do not block BH while processing socket backlog"), I forgot that a malicious user could transition to TCP_LISTEN from a state that allowed (SYN) packets to be parked in the socket backlog while socket is owned by the thread doing the listen() call. Sure enough syzkaller found this and reported the bug ;) ================================= [ INFO: inconsistent lock state ] 4.10.0+ #60 Not tainted --------------------------------- inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage. syz-executor0/5090 [HC0[0]:SC0[0]:HE1:SE1] takes: (&(&hashinfo->ehash_locks[i])->rlock){+.?...}, at: [] spin_lock include/linux/spinlock.h:299 [inline] (&(&hashinfo->ehash_locks[i])->rlock){+.?...}, at: [] inet_ehash_insert+0x240/0xad0 net/ipv4/inet_hashtables.c:407 {IN-SOFTIRQ-W} state was registered at: mark_irqflags kernel/locking/lockdep.c:2923 [inline] __lock_acquire+0xbcf/0x3270 kernel/locking/lockdep.c:3295 lock_acquire+0x241/0x580 kernel/locking/lockdep.c:3753 __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] _raw_spin_lock+0x33/0x50 kernel/locking/spinlock.c:151 spin_lock include/linux/spinlock.h:299 [inline] inet_ehash_insert+0x240/0xad0 net/ipv4/inet_hashtables.c:407 reqsk_queue_hash_req net/ipv4/inet_connection_sock.c:753 [inline] inet_csk_reqsk_queue_hash_add+0x1b7/0x2a0 net/ipv4/inet_connection_sock.c:764 tcp_conn_request+0x25cc/0x3310 net/ipv4/tcp_input.c:6399 tcp_v4_conn_request+0x157/0x220 net/ipv4/tcp_ipv4.c:1262 tcp_rcv_state_process+0x802/0x4130 net/ipv4/tcp_input.c:5889 tcp_v4_do_rcv+0x56b/0x940 net/ipv4/tcp_ipv4.c:1433 tcp_v4_rcv+0x2e12/0x3210 net/ipv4/tcp_ipv4.c:1711 ip_local_deliver_finish+0x4ce/0xc40 net/ipv4/ip_input.c:216 NF_HOOK include/linux/netfilter.h:257 [inline] ip_local_deliver+0x1ce/0x710 net/ipv4/ip_input.c:257 dst_input include/net/dst.h:492 [inline] ip_rcv_finish+0xb1d/0x2110 net/ipv4/ip_input.c:396 NF_HOOK include/linux/netfilter.h:257 [inline] ip_rcv+0xd90/0x19c0 net/ipv4/ip_input.c:487 __netif_receive_skb_core+0x1ad1/0x3400 net/core/dev.c:4179 __netif_receive_skb+0x2a/0x170 net/core/dev.c:4217 netif_receive_skb_internal+0x1d6/0x430 net/core/dev.c:4245 napi_skb_finish net/core/dev.c:4602 [inline] napi_gro_receive+0x4e6/0x680 net/core/dev.c:4636 e1000_receive_skb drivers/net/ethernet/intel/e1000/e1000_main.c:4033 [inline] e1000_clean_rx_irq+0x5e0/0x1490 drivers/net/ethernet/intel/e1000/e1000_main.c:4489 e1000_clean+0xb9a/0x2910 drivers/net/ethernet/intel/e1000/e1000_main.c:3834 napi_poll net/core/dev.c:5171 [inline] net_rx_action+0xe70/0x1900 net/core/dev.c:5236 __do_softirq+0x2fb/0xb7d kernel/softirq.c:284 invoke_softirq kernel/softirq.c:364 [inline] irq_exit+0x19e/0x1d0 kernel/softirq.c:405 exiting_irq arch/x86/include/asm/apic.h:658 [inline] 
do_IRQ+0x81/0x1a0 arch/x86/kernel/irq.c:250 ret_from_intr+0x0/0x20 native_safe_halt+0x6/0x10 arch/x86/include/asm/irqflags.h:53 arch_safe_halt arch/x86/include/asm/paravirt.h:98 [inline] default_idle+0x8f/0x410 arch/x86/kernel/process.c:271 arch_cpu_idle+0xa/0x10 arch/x86/kernel/process.c:262 default_idle_call+0x36/0x60 kernel/sched/idle.c:96 cpuidle_idle_call kernel/sched/idle.c:154 [inline] do_idle+0x348/0x440 kernel/sched/idle.c:243 cpu_startup_entry+0x18/0x20 kernel/sched/idle.c:345 start_secondary+0x344/0x440 arch/x86/kernel/smpboot.c:272 verify_cpu+0x0/0xfc irq event stamp: 1741 hardirqs last enabled at (1741): [] __raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:160 [inline] hardirqs last enabled at (1741): [] _raw_spin_unlock_irqrestore+0xf7/0x1a0 kernel/locking/spinlock.c:191 hardirqs last disabled at (1740): [] __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:108 [inline] hardirqs last disabled at (1740): [] _raw_spin_lock_irqsave+0xa2/0x110 kernel/locking/spinlock.c:159 softirqs last enabled at (1738): [] __do_softirq+0x7cf/0xb7d kernel/softirq.c:310 softirqs last disabled at (1571): [] do_softirq_own_stack+0x1c/0x30 arch/x86/entry/entry_64.S:902 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&(&hashinfo->ehash_locks[i])->rlock); lock(&(&hashinfo->ehash_locks[i])->rlock); *** DEADLOCK *** 1 lock held by syz-executor0/5090: #0: (sk_lock-AF_INET6){+.+.+.}, at: [] lock_sock include/net/sock.h:1460 [inline] #0: (sk_lock-AF_INET6){+.+.+.}, at: [] sock_setsockopt+0x233/0x1e40 net/core/sock.c:683 stack backtrace: CPU: 1 PID: 5090 Comm: syz-executor0 Not tainted 4.10.0+ #60 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:15 [inline] dump_stack+0x292/0x398 lib/dump_stack.c:51 print_usage_bug+0x3ef/0x450 kernel/locking/lockdep.c:2387 valid_state kernel/locking/lockdep.c:2400 [inline] mark_lock_irq kernel/locking/lockdep.c:2602 [inline] mark_lock+0xf30/0x1410 kernel/locking/lockdep.c:3065 mark_irqflags kernel/locking/lockdep.c:2941 [inline] __lock_acquire+0x6dc/0x3270 kernel/locking/lockdep.c:3295 lock_acquire+0x241/0x580 kernel/locking/lockdep.c:3753 __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] _raw_spin_lock+0x33/0x50 kernel/locking/spinlock.c:151 spin_lock include/linux/spinlock.h:299 [inline] inet_ehash_insert+0x240/0xad0 net/ipv4/inet_hashtables.c:407 reqsk_queue_hash_req net/ipv4/inet_connection_sock.c:753 [inline] inet_csk_reqsk_queue_hash_add+0x1b7/0x2a0 net/ipv4/inet_connection_sock.c:764 dccp_v6_conn_request+0xada/0x11b0 net/dccp/ipv6.c:380 dccp_rcv_state_process+0x51e/0x1660 net/dccp/input.c:606 dccp_v6_do_rcv+0x213/0x350 net/dccp/ipv6.c:632 sk_backlog_rcv include/net/sock.h:896 [inline] __release_sock+0x127/0x3a0 net/core/sock.c:2052 release_sock+0xa5/0x2b0 net/core/sock.c:2539 sock_setsockopt+0x60f/0x1e40 net/core/sock.c:1016 SYSC_setsockopt net/socket.c:1782 [inline] SyS_setsockopt+0x2fb/0x3a0 net/socket.c:1765 entry_SYSCALL_64_fastpath+0x1f/0xc2 RIP: 0033:0x4458b9 RSP: 002b:00007fe8b26c2b58 EFLAGS: 00000292 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 0000000000000006 RCX: 00000000004458b9 RDX: 000000000000001a RSI: 0000000000000001 RDI: 0000000000000006 RBP: 00000000006e2110 R08: 0000000000000010 R09: 0000000000000000 R10: 00000000208c3000 R11: 0000000000000292 R12: 0000000000708000 R13: 0000000020000000 R14: 0000000000001000 R15: 0000000000000000 Fixes: 5413d1babe8f ("net: do not block BH while processing socket 
backlog") Signed-off-by: Eric Dumazet Reported-by: Andrey Konovalov Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/dccp/input.c | 10 ++++++++-- net/ipv4/tcp_input.c | 10 ++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/dccp/input.c b/net/dccp/input.c index 8fedc2d49770..4a05d7876850 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -577,6 +577,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct dccp_sock *dp = dccp_sk(sk); struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); const int old_state = sk->sk_state; + bool acceptable; int queued = 0; /* @@ -603,8 +604,13 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ if (sk->sk_state == DCCP_LISTEN) { if (dh->dccph_type == DCCP_PKT_REQUEST) { - if (inet_csk(sk)->icsk_af_ops->conn_request(sk, - skb) < 0) + /* It is possible that we process SYN packets from backlog, + * so we need to make sure to disable BH right there. + */ + local_bh_disable(); + acceptable = inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) >= 0; + local_bh_enable(); + if (!acceptable) return 1; consume_skb(skb); return 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2c0ff327b6df..39c393cc0fd3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5886,9 +5886,15 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (th->syn) { if (th->fin) goto discard; - if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) - return 1; + /* It is possible that we process SYN packets from backlog, + * so we need to make sure to disable BH right there. + */ + local_bh_disable(); + acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0; + local_bh_enable(); + if (!acceptable) + return 1; consume_skb(skb); return 0; } -- cgit v1.2.3 From 8953de2f02ad7b15e4964c82f9afd60f128e4e98 Mon Sep 17 00:00:00 2001 From: Mike Manning Date: Wed, 1 Mar 2017 09:55:28 +0000 Subject: net: bridge: allow IPv6 when multicast flood is disabled Even with multicast flooding turned off, IPv6 ND should still work so that IPv6 connectivity is provided. Allow this by continuing to flood multicast traffic originated by us. Fixes: b6cb5ac8331b ("net: bridge: add per-port multicast flood flag") Cc: Nikolay Aleksandrov Signed-off-by: Mike Manning Acked-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/bridge/br_forward.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 6bfac29318f2..902af6ba481c 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -186,8 +186,9 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb, /* Do not flood unicast traffic to ports that turn it off */ if (pkt_type == BR_PKT_UNICAST && !(p->flags & BR_FLOOD)) continue; + /* Do not flood if mc off, except for traffic we originate */ if (pkt_type == BR_PKT_MULTICAST && - !(p->flags & BR_MCAST_FLOOD)) + !(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev) continue; /* Do not flood to ports that enable proxy ARP */ -- cgit v1.2.3 From 540e2894f7905538740aaf122bd8e0548e1c34a4 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Wed, 1 Mar 2017 12:57:20 +0100 Subject: net: don't call strlen() on the user buffer in packet_bind_spkt() KMSAN (KernelMemorySanitizer, a new error detection tool) reports use of uninitialized memory in packet_bind_spkt(): Acked-by: Eric Dumazet ================================================================== BUG: KMSAN: use of unitialized memory CPU: 0 PID: 1074 Comm: packet Not tainted 4.8.0-rc6+ #1891 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 0000000000000000 ffff88006b6dfc08 ffffffff82559ae8 ffff88006b6dfb48 ffffffff818a7c91 ffffffff85b9c870 0000000000000092 ffffffff85b9c550 0000000000000000 0000000000000092 00000000ec400911 0000000000000002 Call Trace: [< inline >] __dump_stack lib/dump_stack.c:15 [] dump_stack+0x238/0x290 lib/dump_stack.c:51 [] kmsan_report+0x276/0x2e0 mm/kmsan/kmsan.c:1003 [] __msan_warning+0x5b/0xb0 mm/kmsan/kmsan_instr.c:424 [< inline >] strlen lib/string.c:484 [] strlcpy+0x9d/0x200 lib/string.c:144 [] packet_bind_spkt+0x144/0x230 net/packet/af_packet.c:3132 [] SYSC_bind+0x40d/0x5f0 net/socket.c:1370 [] SyS_bind+0x82/0xa0 net/socket.c:1356 [] entry_SYSCALL_64_fastpath+0x13/0x8f arch/x86/entry/entry_64.o:? chained origin: 00000000eba00911 [] save_stack_trace+0x27/0x50 arch/x86/kernel/stacktrace.c:67 [< inline >] kmsan_save_stack_with_flags mm/kmsan/kmsan.c:322 [< inline >] kmsan_save_stack mm/kmsan/kmsan.c:334 [] kmsan_internal_chain_origin+0x118/0x1e0 mm/kmsan/kmsan.c:527 [] __msan_set_alloca_origin4+0xc3/0x130 mm/kmsan/kmsan_instr.c:380 [] SYSC_bind+0x129/0x5f0 net/socket.c:1356 [] SyS_bind+0x82/0xa0 net/socket.c:1356 [] entry_SYSCALL_64_fastpath+0x13/0x8f arch/x86/entry/entry_64.o:? origin description: ----address@SYSC_bind (origin=00000000eb400911) ================================================================== (the line numbers are relative to 4.8-rc6, but the bug persists upstream) , when I run the following program as root: ===================================== #include #include #include #include int main() { struct sockaddr addr; memset(&addr, 0xff, sizeof(addr)); addr.sa_family = AF_PACKET; int fd = socket(PF_PACKET, SOCK_PACKET, htons(ETH_P_ALL)); bind(fd, &addr, sizeof(addr)); return 0; } ===================================== This happens because addr.sa_data copied from the userspace is not zero-terminated, and copying it with strlcpy() in packet_bind_spkt() results in calling strlen() on the kernel copy of that non-terminated buffer. Signed-off-by: Alexander Potapenko Signed-off-by: David S. 
Miller --- net/packet/af_packet.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 2bd0d1949312..a0dbe7ca8f72 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3103,7 +3103,7 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; - char name[15]; + char name[sizeof(uaddr->sa_data) + 1]; /* * Check legality @@ -3111,7 +3111,11 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, if (addr_len != sizeof(struct sockaddr)) return -EINVAL; - strlcpy(name, uaddr->sa_data, sizeof(name)); + /* uaddr->sa_data comes from the userspace, it's not guaranteed to be + * zero-terminated. + */ + memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data)); + name[sizeof(uaddr->sa_data)] = 0; return packet_do_bind(sk, name, 0, pkt_sk(sk)->num); } -- cgit v1.2.3 From 13baa00ad01bb3a9f893e3a08cbc2d072fc0c15d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 1 Mar 2017 14:28:39 -0800 Subject: net: net_enable_timestamp() can be called from irq contexts It is now very clear that silly TCP listeners might play with enabling/disabling timestamping while new children are added to their accept queue. Meaning net_enable_timestamp() can be called from BH context while current state of the static key is not enabled. Lets play safe and allow all contexts. The work queue is scheduled only under the problematic cases, which are the static key enable/disable transition, to not slow down critical paths. This extends and improves what we did in commit 5fa8bbda38c6 ("net: use a work queue to defer net_disable_timestamp() work") Fixes: b90e5794c5bd ("net: dont call jump_label_dec from irq context") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Signed-off-by: David S. 
Miller --- net/core/dev.c | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index e63bf61b19be..8637b2b71f3d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1698,27 +1698,54 @@ EXPORT_SYMBOL_GPL(net_dec_egress_queue); static struct static_key netstamp_needed __read_mostly; #ifdef HAVE_JUMP_LABEL static atomic_t netstamp_needed_deferred; +static atomic_t netstamp_wanted; static void netstamp_clear(struct work_struct *work) { int deferred = atomic_xchg(&netstamp_needed_deferred, 0); + int wanted; - while (deferred--) - static_key_slow_dec(&netstamp_needed); + wanted = atomic_add_return(deferred, &netstamp_wanted); + if (wanted > 0) + static_key_enable(&netstamp_needed); + else + static_key_disable(&netstamp_needed); } static DECLARE_WORK(netstamp_work, netstamp_clear); #endif void net_enable_timestamp(void) { +#ifdef HAVE_JUMP_LABEL + int wanted; + + while (1) { + wanted = atomic_read(&netstamp_wanted); + if (wanted <= 0) + break; + if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted) + return; + } + atomic_inc(&netstamp_needed_deferred); + schedule_work(&netstamp_work); +#else static_key_slow_inc(&netstamp_needed); +#endif } EXPORT_SYMBOL(net_enable_timestamp); void net_disable_timestamp(void) { #ifdef HAVE_JUMP_LABEL - /* net_disable_timestamp() can be called from non process context */ - atomic_inc(&netstamp_needed_deferred); + int wanted; + + while (1) { + wanted = atomic_read(&netstamp_wanted); + if (wanted <= 1) + break; + if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted) + return; + } + atomic_dec(&netstamp_needed_deferred); schedule_work(&netstamp_work); #else static_key_slow_dec(&netstamp_needed); -- cgit v1.2.3 From 48cac18ecf1de82f76259a54402c3adb7839ad01 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 1 Mar 2017 14:45:06 -0800 Subject: ipv6: orphan skbs in reassembly unit Andrey reported a use-after-free in IPv6 stack. Issue here is that we free the socket while it still has skb in TX path and in some queues. It happens here because IPv6 reassembly unit messes skb->truesize, breaking skb_set_owner_w() badly. 
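To make the truesize accounting problem concrete, here is a tiny user-space model (purely illustrative; the names sock_model, skb_model, charge and uncharge are invented and this is not the kernel code). skb_set_owner_w() charges skb->truesize to the owning socket and installs sock_wfree() as the destructor; sock_wfree() later subtracts the skb's *current* truesize. If the reassembly unit changes truesize in between, charges and uncharges no longer match, the socket's write-memory reference can drop to zero too early, and the socket is freed while other skbs still point at it:

#include <stdio.h>

struct sock_model { int wmem_alloc; };                  /* stands in for sk->sk_wmem_alloc */
struct skb_model  { struct sock_model *sk; int truesize; };

static void charge(struct skb_model *skb, struct sock_model *sk)
{
	skb->sk = sk;
	sk->wmem_alloc += skb->truesize;        /* what skb_set_owner_w() accounts */
}

static void uncharge(struct skb_model *skb)
{
	skb->sk->wmem_alloc -= skb->truesize;   /* what sock_wfree() accounts: the *current* truesize */
}

int main(void)
{
	struct sock_model sk = { .wmem_alloc = 0 };
	struct skb_model a = { .truesize = 100 };
	struct skb_model b = { .truesize = 100 };

	charge(&a, &sk);
	charge(&b, &sk);
	a.truesize = 300;       /* reassembly grows truesize after the charge */
	uncharge(&a);           /* subtracts 300 although only 100 was charged */
	uncharge(&b);
	printf("wmem_alloc = %d, expected 0\n", sk.wmem_alloc);   /* prints -200 */
	return 0;
}

Orphaning the skb before it is parked in the reassembly queue runs the destructor while charge and truesize still match and clears the socket ownership, so later truesize changes can no longer corrupt the socket's accounting.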
We fixed a similar issue for IPV4 in commit 8282f27449bf ("inet: frag: Always orphan skbs inside ip_defrag()") Acked-by: Joe Stringer ================================================================== BUG: KASAN: use-after-free in sock_wfree+0x118/0x120 Read of size 8 at addr ffff880062da0060 by task a.out/4140 page:ffffea00018b6800 count:1 mapcount:0 mapping: (null) index:0x0 compound_mapcount: 0 flags: 0x100000000008100(slab|head) raw: 0100000000008100 0000000000000000 0000000000000000 0000000180130013 raw: dead000000000100 dead000000000200 ffff88006741f140 0000000000000000 page dumped because: kasan: bad access detected CPU: 0 PID: 4140 Comm: a.out Not tainted 4.10.0-rc3+ #59 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:15 dump_stack+0x292/0x398 lib/dump_stack.c:51 describe_address mm/kasan/report.c:262 kasan_report_error+0x121/0x560 mm/kasan/report.c:370 kasan_report mm/kasan/report.c:392 __asan_report_load8_noabort+0x3e/0x40 mm/kasan/report.c:413 sock_flag ./arch/x86/include/asm/bitops.h:324 sock_wfree+0x118/0x120 net/core/sock.c:1631 skb_release_head_state+0xfc/0x250 net/core/skbuff.c:655 skb_release_all+0x15/0x60 net/core/skbuff.c:668 __kfree_skb+0x15/0x20 net/core/skbuff.c:684 kfree_skb+0x16e/0x4e0 net/core/skbuff.c:705 inet_frag_destroy+0x121/0x290 net/ipv4/inet_fragment.c:304 inet_frag_put ./include/net/inet_frag.h:133 nf_ct_frag6_gather+0x1125/0x38b0 net/ipv6/netfilter/nf_conntrack_reasm.c:617 ipv6_defrag+0x21b/0x350 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c:68 nf_hook_entry_hookfn ./include/linux/netfilter.h:102 nf_hook_slow+0xc3/0x290 net/netfilter/core.c:310 nf_hook ./include/linux/netfilter.h:212 __ip6_local_out+0x52c/0xaf0 net/ipv6/output_core.c:160 ip6_local_out+0x2d/0x170 net/ipv6/output_core.c:170 ip6_send_skb+0xa1/0x340 net/ipv6/ip6_output.c:1722 ip6_push_pending_frames+0xb3/0xe0 net/ipv6/ip6_output.c:1742 rawv6_push_pending_frames net/ipv6/raw.c:613 rawv6_sendmsg+0x2cff/0x4130 net/ipv6/raw.c:927 inet_sendmsg+0x164/0x5b0 net/ipv4/af_inet.c:744 sock_sendmsg_nosec net/socket.c:635 sock_sendmsg+0xca/0x110 net/socket.c:645 sock_write_iter+0x326/0x620 net/socket.c:848 new_sync_write fs/read_write.c:499 __vfs_write+0x483/0x760 fs/read_write.c:512 vfs_write+0x187/0x530 fs/read_write.c:560 SYSC_write fs/read_write.c:607 SyS_write+0xfb/0x230 fs/read_write.c:599 entry_SYSCALL_64_fastpath+0x1f/0xc2 arch/x86/entry/entry_64.S:203 RIP: 0033:0x7ff26e6f5b79 RSP: 002b:00007ff268e0ed98 EFLAGS: 00000206 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 00007ff268e0f9c0 RCX: 00007ff26e6f5b79 RDX: 0000000000000010 RSI: 0000000020f50fe1 RDI: 0000000000000003 RBP: 00007ff26ebc1220 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000206 R12: 0000000000000000 R13: 00007ff268e0f9c0 R14: 00007ff26efec040 R15: 0000000000000003 The buggy address belongs to the object at ffff880062da0000 which belongs to the cache RAWv6 of size 1504 The buggy address ffff880062da0060 is located 96 bytes inside of 1504-byte region [ffff880062da0000, ffff880062da05e0) Freed by task 4113: save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:57 save_stack+0x43/0xd0 mm/kasan/kasan.c:502 set_track mm/kasan/kasan.c:514 kasan_slab_free+0x73/0xc0 mm/kasan/kasan.c:578 slab_free_hook mm/slub.c:1352 slab_free_freelist_hook mm/slub.c:1374 slab_free mm/slub.c:2951 kmem_cache_free+0xb2/0x2c0 mm/slub.c:2973 sk_prot_free net/core/sock.c:1377 __sk_destruct+0x49c/0x6e0 net/core/sock.c:1452 sk_destruct+0x47/0x80 
net/core/sock.c:1460 __sk_free+0x57/0x230 net/core/sock.c:1468 sk_free+0x23/0x30 net/core/sock.c:1479 sock_put ./include/net/sock.h:1638 sk_common_release+0x31e/0x4e0 net/core/sock.c:2782 rawv6_close+0x54/0x80 net/ipv6/raw.c:1214 inet_release+0xed/0x1c0 net/ipv4/af_inet.c:425 inet6_release+0x50/0x70 net/ipv6/af_inet6.c:431 sock_release+0x8d/0x1e0 net/socket.c:599 sock_close+0x16/0x20 net/socket.c:1063 __fput+0x332/0x7f0 fs/file_table.c:208 ____fput+0x15/0x20 fs/file_table.c:244 task_work_run+0x19b/0x270 kernel/task_work.c:116 exit_task_work ./include/linux/task_work.h:21 do_exit+0x186b/0x2800 kernel/exit.c:839 do_group_exit+0x149/0x420 kernel/exit.c:943 SYSC_exit_group kernel/exit.c:954 SyS_exit_group+0x1d/0x20 kernel/exit.c:952 entry_SYSCALL_64_fastpath+0x1f/0xc2 arch/x86/entry/entry_64.S:203 Allocated by task 4115: save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:57 save_stack+0x43/0xd0 mm/kasan/kasan.c:502 set_track mm/kasan/kasan.c:514 kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:605 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:544 slab_post_alloc_hook mm/slab.h:432 slab_alloc_node mm/slub.c:2708 slab_alloc mm/slub.c:2716 kmem_cache_alloc+0x1af/0x250 mm/slub.c:2721 sk_prot_alloc+0x65/0x2a0 net/core/sock.c:1334 sk_alloc+0x105/0x1010 net/core/sock.c:1396 inet6_create+0x44d/0x1150 net/ipv6/af_inet6.c:183 __sock_create+0x4f6/0x880 net/socket.c:1199 sock_create net/socket.c:1239 SYSC_socket net/socket.c:1269 SyS_socket+0xf9/0x230 net/socket.c:1249 entry_SYSCALL_64_fastpath+0x1f/0xc2 arch/x86/entry/entry_64.S:203 Memory state around the buggy address: ffff880062d9ff00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff880062d9ff80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff880062da0000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff880062da0080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff880062da0100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== Reported-by: Andrey Konovalov Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/netfilter/nf_conntrack_reasm.c | 1 + net/openvswitch/conntrack.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 9948b5ce52da..986d4ca38832 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -589,6 +589,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) hdr = ipv6_hdr(skb); fhdr = (struct frag_hdr *)skb_transport_header(skb); + skb_orphan(skb); fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr, skb->dev ? 
skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); if (fq == NULL) { diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 85cd59526670..e0a87776a010 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -485,7 +485,6 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key, } else if (key->eth.type == htons(ETH_P_IPV6)) { enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone; - skb_orphan(skb); memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); err = nf_ct_frag6_gather(net, skb, user); if (err) { -- cgit v1.2.3 From eb1e011a14748a1d9df9a7d7df9a5711721a1bdb Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 15 Feb 2017 09:49:26 +0100 Subject: average: change to declare precision, not factor Declaring the factor is counter-intuitive, and people are prone to using small(-ish) values even when that makes no sense. Change the DECLARE_EWMA() macro to take the fractional precision, in bits, rather than a factor, and update all users. While at it, add some more documentation. Acked-by: David S. Miller Signed-off-by: Johannes Berg --- drivers/net/virtio_net.c | 2 +- drivers/net/wireless/ath/ath5k/ath5k.h | 2 +- drivers/net/wireless/ralink/rt2x00/rt2x00.h | 2 +- include/linux/average.h | 61 +++++++++++++++++++---------- net/batman-adv/types.h | 2 +- net/mac80211/ieee80211_i.h | 2 +- net/mac80211/sta_info.h | 2 +- 7 files changed, 47 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index bf95016f442a..e9d7e2b70085 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -51,7 +51,7 @@ module_param(gso, bool, 0444); * at once, the weight is chosen so that the EWMA will be insensitive to short- * term, transient changes in packet size. */ -DECLARE_EWMA(pkt_len, 1, 64) +DECLARE_EWMA(pkt_len, 0, 64) /* With mergeable buffers we align buffer address and use the low bits to * encode its true size. Buffer size is up to 1 page so we need to align to diff --git a/drivers/net/wireless/ath/ath5k/ath5k.h b/drivers/net/wireless/ath/ath5k/ath5k.h index 67fedb61fcc0..979800c6f57f 100644 --- a/drivers/net/wireless/ath/ath5k/ath5k.h +++ b/drivers/net/wireless/ath/ath5k/ath5k.h @@ -1252,7 +1252,7 @@ struct ath5k_statistics { #define ATH5K_TXQ_LEN_MAX (ATH_TXBUF / 4) /* bufs per queue */ #define ATH5K_TXQ_LEN_LOW (ATH5K_TXQ_LEN_MAX / 2) /* low mark */ -DECLARE_EWMA(beacon_rssi, 1024, 8) +DECLARE_EWMA(beacon_rssi, 10, 8) /* Driver state associated with an instance of a device */ struct ath5k_hw { diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00.h b/drivers/net/wireless/ralink/rt2x00/rt2x00.h index 26869b3bef45..340787894c69 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2x00.h +++ b/drivers/net/wireless/ralink/rt2x00/rt2x00.h @@ -257,7 +257,7 @@ struct link_qual { int tx_failed; }; -DECLARE_EWMA(rssi, 1024, 8) +DECLARE_EWMA(rssi, 10, 8) /* * Antenna settings about the currently active link. diff --git a/include/linux/average.h b/include/linux/average.h index d04aa58280de..7ddaf340d2ac 100644 --- a/include/linux/average.h +++ b/include/linux/average.h @@ -1,45 +1,66 @@ #ifndef _LINUX_AVERAGE_H #define _LINUX_AVERAGE_H -/* Exponentially weighted moving average (EWMA) */ +/* + * Exponentially weighted moving average (EWMA) + * + * This implements a fixed-precision EWMA algorithm, with both the + * precision and fall-off coefficient determined at compile-time + * and built into the generated helper funtions. 
+ * + * The first argument to the macro is the name that will be used + * for the struct and helper functions. + * + * The second argument, the precision, expresses how many bits are + * used for the fractional part of the fixed-precision values. + * + * The third argument, the weight reciprocal, determines how the + * new values will be weighed vs. the old state, new values will + * get weight 1/weight_rcp and old values 1-1/weight_rcp. Note + * that this parameter must be a power of two for efficiency. + */ -#define DECLARE_EWMA(name, _factor, _weight) \ +#define DECLARE_EWMA(name, _precision, _weight_rcp) \ struct ewma_##name { \ unsigned long internal; \ }; \ static inline void ewma_##name##_init(struct ewma_##name *e) \ { \ - BUILD_BUG_ON(!__builtin_constant_p(_factor)); \ - BUILD_BUG_ON(!__builtin_constant_p(_weight)); \ - BUILD_BUG_ON_NOT_POWER_OF_2(_factor); \ - BUILD_BUG_ON_NOT_POWER_OF_2(_weight); \ + BUILD_BUG_ON(!__builtin_constant_p(_precision)); \ + BUILD_BUG_ON(!__builtin_constant_p(_weight_rcp)); \ + /* \ + * Even if you want to feed it just 0/1 you should have \ + * some bits for the non-fractional part... \ + */ \ + BUILD_BUG_ON((_precision) > 30); \ + BUILD_BUG_ON_NOT_POWER_OF_2(_weight_rcp); \ e->internal = 0; \ } \ static inline unsigned long \ ewma_##name##_read(struct ewma_##name *e) \ { \ - BUILD_BUG_ON(!__builtin_constant_p(_factor)); \ - BUILD_BUG_ON(!__builtin_constant_p(_weight)); \ - BUILD_BUG_ON_NOT_POWER_OF_2(_factor); \ - BUILD_BUG_ON_NOT_POWER_OF_2(_weight); \ - return e->internal >> ilog2(_factor); \ + BUILD_BUG_ON(!__builtin_constant_p(_precision)); \ + BUILD_BUG_ON(!__builtin_constant_p(_weight_rcp)); \ + BUILD_BUG_ON((_precision) > 30); \ + BUILD_BUG_ON_NOT_POWER_OF_2(_weight_rcp); \ + return e->internal >> (_precision); \ } \ static inline void ewma_##name##_add(struct ewma_##name *e, \ unsigned long val) \ { \ unsigned long internal = ACCESS_ONCE(e->internal); \ - unsigned long weight = ilog2(_weight); \ - unsigned long factor = ilog2(_factor); \ + unsigned long weight_rcp = ilog2(_weight_rcp); \ + unsigned long precision = _precision; \ \ - BUILD_BUG_ON(!__builtin_constant_p(_factor)); \ - BUILD_BUG_ON(!__builtin_constant_p(_weight)); \ - BUILD_BUG_ON_NOT_POWER_OF_2(_factor); \ - BUILD_BUG_ON_NOT_POWER_OF_2(_weight); \ + BUILD_BUG_ON(!__builtin_constant_p(_precision)); \ + BUILD_BUG_ON(!__builtin_constant_p(_weight_rcp)); \ + BUILD_BUG_ON((_precision) > 30); \ + BUILD_BUG_ON_NOT_POWER_OF_2(_weight_rcp); \ \ ACCESS_ONCE(e->internal) = internal ? \ - (((internal << weight) - internal) + \ - (val << factor)) >> weight : \ - (val << factor); \ + (((internal << weight_rcp) - internal) + \ + (val << precision)) >> weight_rcp : \ + (val << precision); \ } #endif /* _LINUX_AVERAGE_H */ diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 8f64a5c01345..66b25e410a41 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -402,7 +402,7 @@ struct batadv_gw_node { struct rcu_head rcu; }; -DECLARE_EWMA(throughput, 1024, 8) +DECLARE_EWMA(throughput, 10, 8) /** * struct batadv_hardif_neigh_node_bat_v - B.A.T.M.A.N. 
V private neighbor diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 159a1a733725..0e718437d080 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -428,7 +428,7 @@ struct ieee80211_sta_tx_tspec { bool downgraded; }; -DECLARE_EWMA(beacon_signal, 16, 4) +DECLARE_EWMA(beacon_signal, 4, 4) struct ieee80211_if_managed { struct timer_list timer; diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 15599c70a38f..e65cda34d2bc 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -372,7 +372,7 @@ struct mesh_sta { unsigned int fail_avg; }; -DECLARE_EWMA(signal, 1024, 8) +DECLARE_EWMA(signal, 10, 8) struct ieee80211_sta_rx_stats { unsigned long packets; -- cgit v1.2.3 From e3330039ea28dc199e3b2da993895ff742a91adf Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 27 Feb 2017 16:07:43 -0800 Subject: ipv6: check for ip6_null_entry in __ip6_del_rt_siblings() Andrey reported a NULL pointer deref bug in ipv6_route_ioctl() -> ip6_route_del() -> __ip6_del_rt_siblings() code path. This is because ip6_null_entry is returned in this path since ip6_null_entry is kinda default for a ipv6 route table root node. Quote from David Ahern: ip6_null_entry is the root of all ipv6 fib tables making it integrated into the table ... We should ignore any attempt of trying to delete it, like we do in __ip6_del_rt() path and several others. Reported-by: Andrey Konovalov Fixes: 0ae8133586ad ("net: ipv6: Allow shorthand delete of all nexthops in multipath route") Cc: David Ahern Cc: Eric Dumazet Signed-off-by: Cong Wang Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d94f1dfa54c8..43ca90d50ae9 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2169,10 +2169,13 @@ int ip6_del_rt(struct rt6_info *rt) static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) { struct nl_info *info = &cfg->fc_nlinfo; + struct net *net = info->nl_net; struct sk_buff *skb = NULL; struct fib6_table *table; - int err; + int err = -ENOENT; + if (rt == net->ipv6.ip6_null_entry) + goto out_put; table = rt->rt6i_table; write_lock_bh(&table->tb6_lock); @@ -2184,7 +2187,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) if (skb) { u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; - if (rt6_fill_node(info->nl_net, skb, rt, + if (rt6_fill_node(net, skb, rt, NULL, NULL, 0, RTM_DELROUTE, info->portid, seq, 0) < 0) { kfree_skb(skb); @@ -2198,17 +2201,18 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) rt6i_siblings) { err = fib6_del(sibling, info); if (err) - goto out; + goto out_unlock; } } err = fib6_del(rt, info); -out: +out_unlock: write_unlock_bh(&table->tb6_lock); +out_put: ip6_rt_put(rt); if (skb) { - rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV6_ROUTE, + rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, info->nlh, gfp_any()); } return err; -- cgit v1.2.3 From f1304f7ba3981e71dcf2ac7db92949eeab49b1bf Mon Sep 17 00:00:00 2001 From: Peter Downs Date: Wed, 1 Mar 2017 01:01:17 -0800 Subject: openvswitch: actions: fixed a brace coding style warning Fixed a brace coding style warning reported by checkpatch.pl Signed-off-by: Peter Downs Signed-off-by: David S. 
Miller --- net/openvswitch/actions.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index b1beb2b94ec7..c82301ce3fff 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -796,9 +796,8 @@ static void ovs_fragment(struct net *net, struct vport *vport, unsigned long orig_dst; struct rt6_info ovs_rt; - if (!v6ops) { + if (!v6ops) goto err; - } prepare_frag(vport, skb, orig_network_offset, ovs_key_mac_proto(key)); -- cgit v1.2.3 From d5afb6f9b6bb2c57bd0c05e76e12489dc0d037d9 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 1 Mar 2017 16:35:07 -0300 Subject: dccp: Unlock sock before calling sk_free() The code where sk_clone() came from created a new socket and locked it, but then, on the error path didn't unlock it. This problem stayed there for a long while, till b0691c8ee7c2 ("net: Unlock sock before calling sk_free()") fixed it, but unfortunately the callers of sk_clone() (now sk_clone_locked()) were not audited and the one in dccp_create_openreq_child() remained. Now in the age of the syskaller fuzzer, this was finally uncovered, as reported by Dmitry: ---- 8< ---- I've got the following report while running syzkaller fuzzer on 86292b33d4b7 ("Merge branch 'akpm' (patches from Andrew)") [ BUG: held lock freed! ] 4.10.0+ #234 Not tainted ------------------------- syz-executor6/6898 is freeing memory ffff88006286cac0-ffff88006286d3b7, with a lock still held there! (slock-AF_INET6){+.-...}, at: [] spin_lock include/linux/spinlock.h:299 [inline] (slock-AF_INET6){+.-...}, at: [] sk_clone_lock+0x3d9/0x12c0 net/core/sock.c:1504 5 locks held by syz-executor6/6898: #0: (sk_lock-AF_INET6){+.+.+.}, at: [] lock_sock include/net/sock.h:1460 [inline] #0: (sk_lock-AF_INET6){+.+.+.}, at: [] inet_stream_connect+0x44/0xa0 net/ipv4/af_inet.c:681 #1: (rcu_read_lock){......}, at: [] inet6_csk_xmit+0x12a/0x5d0 net/ipv6/inet6_connection_sock.c:126 #2: (rcu_read_lock){......}, at: [] __skb_unlink include/linux/skbuff.h:1767 [inline] #2: (rcu_read_lock){......}, at: [] __skb_dequeue include/linux/skbuff.h:1783 [inline] #2: (rcu_read_lock){......}, at: [] process_backlog+0x264/0x730 net/core/dev.c:4835 #3: (rcu_read_lock){......}, at: [] ip6_input_finish+0x0/0x1700 net/ipv6/ip6_input.c:59 #4: (slock-AF_INET6){+.-...}, at: [] spin_lock include/linux/spinlock.h:299 [inline] #4: (slock-AF_INET6){+.-...}, at: [] sk_clone_lock+0x3d9/0x12c0 net/core/sock.c:1504 Fix it just like was done by b0691c8ee7c2 ("net: Unlock sock before calling sk_free()"). Reported-by: Dmitry Vyukov Cc: Cong Wang Cc: Eric Dumazet Cc: Gerrit Renker Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170301153510.GE15145@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. 
Miller --- net/dccp/minisocks.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 53eddf99e4f6..d20d948a98ed 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -122,6 +122,7 @@ struct sock *dccp_create_openreq_child(const struct sock *sk, /* It is still raw copy of parent, so invalidate * destructor and make plain sk_free() */ newsk->sk_destruct = NULL; + bh_unlock_sock(newsk); sk_free(newsk); return NULL; } -- cgit v1.2.3 From 94352d45092c23874532221b4d1e4721df9d63df Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 1 Mar 2017 16:35:08 -0300 Subject: net: Introduce sk_clone_lock() error path routine When handling problems in cloning a socket with the sk_clone_locked() function we need to perform several steps that were open coded in it and its callers, so introduce a routine to avoid this duplication: sk_free_unlock_clone(). Cc: Cong Wang Cc: Dmitry Vyukov Cc: Eric Dumazet Cc: Gerrit Renker Cc: Thomas Gleixner Link: http://lkml.kernel.org/n/net-ui6laqkotycunhtmqryl9bfx@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/net/sock.h | 1 + net/core/sock.c | 16 +++++++++++----- net/dccp/minisocks.c | 6 +----- 3 files changed, 13 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 9ccefa5c5487..5e5997654db6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1526,6 +1526,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, void sk_free(struct sock *sk); void sk_destruct(struct sock *sk); struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority); +void sk_free_unlock_clone(struct sock *sk); struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority); diff --git a/net/core/sock.c b/net/core/sock.c index e7d74940e863..f6fd79f33097 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1539,11 +1539,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) is_charged = sk_filter_charge(newsk, filter); if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { - /* It is still raw copy of parent, so invalidate - * destructor and make plain sk_free() */ - newsk->sk_destruct = NULL; - bh_unlock_sock(newsk); - sk_free(newsk); + sk_free_unlock_clone(newsk); newsk = NULL; goto out; } @@ -1592,6 +1588,16 @@ out: } EXPORT_SYMBOL_GPL(sk_clone_lock); +void sk_free_unlock_clone(struct sock *sk) +{ + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() */ + sk->sk_destruct = NULL; + bh_unlock_sock(sk); + sk_free(sk); +} +EXPORT_SYMBOL_GPL(sk_free_unlock_clone); + void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { u32 max_segs = 1; diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index d20d948a98ed..e267e6f4c9a5 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -119,11 +119,7 @@ struct sock *dccp_create_openreq_child(const struct sock *sk, * Activate features: initialise CCIDs, sequence windows etc. 
*/ if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) { - /* It is still raw copy of parent, so invalidate - * destructor and make plain sk_free() */ - newsk->sk_destruct = NULL; - bh_unlock_sock(newsk); - sk_free(newsk); + sk_free_unlock_clone(newsk); return NULL; } dccp_init_xmit_timers(newsk); -- cgit v1.2.3 From 7db92362d2fee5887f6b0c41653b8c9f8f5d6020 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 1 Mar 2017 13:29:48 -0800 Subject: tcp: fix potential double free issue for fastopen_req tp->fastopen_req could potentially be double freed if a malicious user does the following: 1. Enable TCP_FASTOPEN_CONNECT sockopt and do a connect() on the socket. 2. Call connect() with AF_UNSPEC to disconnect the socket. 3. Make this socket a listening socket by calling listen(). 4. Accept incoming connections and generate child sockets. All child sockets will get a copy of the pointer of fastopen_req. 5. Call close() on all sockets. fastopen_req will get freed multiple times. Fixes: 19f6d3f3c842 ("net/tcp-fastopen: Add new API support") Reported-by: Andrey Konovalov Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index da385ae997a3..cf4555581282 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1110,9 +1110,14 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0; err = __inet_stream_connect(sk->sk_socket, msg->msg_name, msg->msg_namelen, flags, 1); - inet->defer_connect = 0; - *copied = tp->fastopen_req->copied; - tcp_free_fastopen_req(tp); + /* fastopen_req could already be freed in __inet_stream_connect + * if the connection times out or gets rst + */ + if (tp->fastopen_req) { + *copied = tp->fastopen_req->copied; + tcp_free_fastopen_req(tp); + inet->defer_connect = 0; + } return err; } @@ -2318,6 +2323,10 @@ int tcp_disconnect(struct sock *sk, int flags) memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); + /* Clean up fastopen related fields */ + tcp_free_fastopen_req(tp); + inet->defer_connect = 0; + WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); sk->sk_error_report(sk); -- cgit v1.2.3 From 9d6acb3bc9058d1fd7a5297d71f14213679bb4bd Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Wed, 1 Mar 2017 20:48:39 -0800 Subject: ipv6: ignore null_entry in inet6_rtm_getroute() too Like commit 1f17e2f2c8a8 ("net: ipv6: ignore null_entry on route dumps"), we need to ignore null entry in inet6_rtm_getroute() too. Return -ENETUNREACH here to sync with IPv4 behavior, as suggested by David. Fixes: a1a22c1206 ("net: ipv6: Keep nexthop of multipath route on admin down") Reported-by: Dmitry Vyukov Cc: David Ahern Signed-off-by: Cong Wang Acked-by: David Ahern Signed-off-by: David S. 
Miller --- net/ipv6/route.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 43ca90d50ae9..229bfcc451ef 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3632,6 +3632,12 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6); } + if (rt == net->ipv6.ip6_null_entry) { + err = rt->dst.error; + ip6_rt_put(rt); + goto errout; + } + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) { ip6_rt_put(rt); -- cgit v1.2.3 From da2f27e9e615d1c799c9582b15262458da61fddc Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 1 Mar 2017 15:33:26 +0100 Subject: netfilter: nf_conntrack_sip: fix wrong memory initialisation In commit 82de0be6862cd ("netfilter: Add helper array register/unregister functions"), struct nf_conntrack_helper sip[MAX_PORTS][4] was changed to sip[MAX_PORTS * 4], so the memory init should have been changed to memset(&sip[4 * i], 0, 4 * sizeof(sip[i])); But as the sip[] table is allocated in the BSS, it is already set to 0 Fixes: 82de0be6862cd ("netfilter: Add helper array register/unregister functions") Signed-off-by: Christophe Leroy Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_sip.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 24174c520239..0d17894798b5 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1628,8 +1628,6 @@ static int __init nf_conntrack_sip_init(void) ports[ports_c++] = SIP_PORT; for (i = 0; i < ports_c; i++) { - memset(&sip[i], 0, sizeof(sip[i])); - nf_ct_helper_init(&sip[4 * i], AF_INET, IPPROTO_UDP, "sip", SIP_PORT, ports[i], i, sip_exp_policy, SIP_EXPECT_MAX, -- cgit v1.2.3 From f9121355eb6f9babadb97bf5b34ab0cce7764406 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 1 Mar 2017 18:15:11 +0100 Subject: netfilter: nft_set_rbtree: incorrect assumption on lower interval lookups In case of adjacent ranges, we may indeed see either the high part of the range in first place or the low part of it. Remove this incorrect assumption, let's make sure we annotate the low part of the interval in case of we have adjacent interva intervals so we hit a matching in lookups. Reported-by: Simon Hanisch Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_rbtree.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 71e8fb886a73..78dfbf9588b3 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -60,11 +60,10 @@ static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, d = memcmp(this, key, set->klen); if (d < 0) { parent = parent->rb_left; - /* In case of adjacent ranges, we always see the high - * part of the range in first place, before the low one. - * So don't update interval if the keys are equal. 
- */ - if (interval && nft_rbtree_equal(set, this, interval)) + if (interval && + nft_rbtree_equal(set, this, interval) && + nft_rbtree_interval_end(this) && + !nft_rbtree_interval_end(interval)) continue; interval = rbe; } else if (d > 0) -- cgit v1.2.3 From 25e94a997b324b5f167f56d56d7106d38b78c9de Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 1 Mar 2017 12:52:31 +0100 Subject: netfilter: nf_tables: don't call nfnetlink_set_err() if nfnetlink_send() fails The underlying nlmsg_multicast() already sets sk->sk_err for us to notify socket overruns, so we should not do anything with this return value. So we just call nfnetlink_set_err() if: 1) We fail to allocate the netlink message. or 2) We don't have enough space in the netlink message to place attributes, which means that we likely need to allocate a larger message. Before this patch, the internal ESRCH netlink error code was propagated to userspace, which is quite misleading. Netlink semantics mandate that listeners just hit ENOBUFS if the socket buffer overruns. Reported-by: Alexander Alemayhu Tested-by: Alexander Alemayhu Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 6 +- net/netfilter/nf_tables_api.c | 133 ++++++++++++++++---------------------- 2 files changed, 58 insertions(+), 81 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index ac84686aaafb..2aa8a9d80fbe 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -988,9 +988,9 @@ struct nft_object *nf_tables_obj_lookup(const struct nft_table *table, const struct nlattr *nla, u32 objtype, u8 genmask); -int nft_obj_notify(struct net *net, struct nft_table *table, - struct nft_object *obj, u32 portid, u32 seq, - int event, int family, int report, gfp_t gfp); +void nft_obj_notify(struct net *net, struct nft_table *table, + struct nft_object *obj, u32 portid, u32 seq, + int event, int family, int report, gfp_t gfp); /** * struct nft_object_type - stateful object type diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index ff7304ae58ac..5e0ccfd5bb37 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -461,16 +461,15 @@ nla_put_failure: return -1; } -static int nf_tables_table_notify(const struct nft_ctx *ctx, int event) +static void nf_tables_table_notify(const struct nft_ctx *ctx, int event) { struct sk_buff *skb; int err; if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) - return 0; + return; - err = -ENOBUFS; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (skb == NULL) goto err; @@ -482,14 +481,11 @@ static int nf_tables_table_notify(const struct nft_ctx *ctx, int event) goto err; } - err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, - ctx->report, GFP_KERNEL); + nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, + ctx->report, GFP_KERNEL); + return; err: - if (err < 0) { - nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, - err); - } - return err; + nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); } static int nf_tables_dump_tables(struct sk_buff *skb, @@ -1050,16 +1046,15 @@ nla_put_failure: return -1; } -static int nf_tables_chain_notify(const struct nft_ctx *ctx, int event) +static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event) { struct sk_buff *skb; int err; if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) - return 0; + return; - err = -ENOBUFS; skb = 
nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (skb == NULL) goto err; @@ -1072,14 +1067,11 @@ static int nf_tables_chain_notify(const struct nft_ctx *ctx, int event) goto err; } - err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, - ctx->report, GFP_KERNEL); + nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, + ctx->report, GFP_KERNEL); + return; err: - if (err < 0) { - nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, - err); - } - return err; + nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); } static int nf_tables_dump_chains(struct sk_buff *skb, @@ -1934,18 +1926,16 @@ nla_put_failure: return -1; } -static int nf_tables_rule_notify(const struct nft_ctx *ctx, - const struct nft_rule *rule, - int event) +static void nf_tables_rule_notify(const struct nft_ctx *ctx, + const struct nft_rule *rule, int event) { struct sk_buff *skb; int err; if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) - return 0; + return; - err = -ENOBUFS; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (skb == NULL) goto err; @@ -1958,14 +1948,11 @@ static int nf_tables_rule_notify(const struct nft_ctx *ctx, goto err; } - err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, - ctx->report, GFP_KERNEL); + nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, + ctx->report, GFP_KERNEL); + return; err: - if (err < 0) { - nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, - err); - } - return err; + nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); } struct nft_rule_dump_ctx { @@ -2696,9 +2683,9 @@ nla_put_failure: return -1; } -static int nf_tables_set_notify(const struct nft_ctx *ctx, - const struct nft_set *set, - int event, gfp_t gfp_flags) +static void nf_tables_set_notify(const struct nft_ctx *ctx, + const struct nft_set *set, int event, + gfp_t gfp_flags) { struct sk_buff *skb; u32 portid = ctx->portid; @@ -2706,9 +2693,8 @@ static int nf_tables_set_notify(const struct nft_ctx *ctx, if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) - return 0; + return; - err = -ENOBUFS; skb = nlmsg_new(NLMSG_GOODSIZE, gfp_flags); if (skb == NULL) goto err; @@ -2719,12 +2705,11 @@ static int nf_tables_set_notify(const struct nft_ctx *ctx, goto err; } - err = nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, - ctx->report, gfp_flags); + nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, ctx->report, + gfp_flags); + return; err: - if (err < 0) - nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, err); - return err; + nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, -ENOBUFS); } static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) @@ -3504,10 +3489,10 @@ nla_put_failure: return -1; } -static int nf_tables_setelem_notify(const struct nft_ctx *ctx, - const struct nft_set *set, - const struct nft_set_elem *elem, - int event, u16 flags) +static void nf_tables_setelem_notify(const struct nft_ctx *ctx, + const struct nft_set *set, + const struct nft_set_elem *elem, + int event, u16 flags) { struct net *net = ctx->net; u32 portid = ctx->portid; @@ -3515,9 +3500,8 @@ static int nf_tables_setelem_notify(const struct nft_ctx *ctx, int err; if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) - return 0; + return; - err = -ENOBUFS; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (skb == NULL) goto err; @@ -3529,12 +3513,11 @@ static int nf_tables_setelem_notify(const struct nft_ctx *ctx, goto err; } - err = nfnetlink_send(skb, net, 
portid, NFNLGRP_NFTABLES, ctx->report, - GFP_KERNEL); + nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, ctx->report, + GFP_KERNEL); + return; err: - if (err < 0) - nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err); - return err; + nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS); } static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, @@ -4476,18 +4459,17 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk, return nft_delobj(&ctx, obj); } -int nft_obj_notify(struct net *net, struct nft_table *table, - struct nft_object *obj, u32 portid, u32 seq, int event, - int family, int report, gfp_t gfp) +void nft_obj_notify(struct net *net, struct nft_table *table, + struct nft_object *obj, u32 portid, u32 seq, int event, + int family, int report, gfp_t gfp) { struct sk_buff *skb; int err; if (!report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) - return 0; + return; - err = -ENOBUFS; skb = nlmsg_new(NLMSG_GOODSIZE, gfp); if (skb == NULL) goto err; @@ -4499,21 +4481,18 @@ int nft_obj_notify(struct net *net, struct nft_table *table, goto err; } - err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, gfp); + nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, gfp); + return; err: - if (err < 0) { - nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err); - } - return err; + nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS); } EXPORT_SYMBOL_GPL(nft_obj_notify); -static int nf_tables_obj_notify(const struct nft_ctx *ctx, - struct nft_object *obj, int event) +static void nf_tables_obj_notify(const struct nft_ctx *ctx, + struct nft_object *obj, int event) { - return nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, - ctx->seq, event, ctx->afi->family, ctx->report, - GFP_KERNEL); + nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, ctx->seq, event, + ctx->afi->family, ctx->report, GFP_KERNEL); } static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, @@ -4543,7 +4522,8 @@ nla_put_failure: return -EMSGSIZE; } -static int nf_tables_gen_notify(struct net *net, struct sk_buff *skb, int event) +static void nf_tables_gen_notify(struct net *net, struct sk_buff *skb, + int event) { struct nlmsghdr *nlh = nlmsg_hdr(skb); struct sk_buff *skb2; @@ -4551,9 +4531,8 @@ static int nf_tables_gen_notify(struct net *net, struct sk_buff *skb, int event) if (nlmsg_report(nlh) && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) - return 0; + return; - err = -ENOBUFS; skb2 = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (skb2 == NULL) goto err; @@ -4565,14 +4544,12 @@ static int nf_tables_gen_notify(struct net *net, struct sk_buff *skb, int event) goto err; } - err = nfnetlink_send(skb2, net, NETLINK_CB(skb).portid, - NFNLGRP_NFTABLES, nlmsg_report(nlh), GFP_KERNEL); + nfnetlink_send(skb2, net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES, + nlmsg_report(nlh), GFP_KERNEL); + return; err: - if (err < 0) { - nfnetlink_set_err(net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES, - err); - } - return err; + nfnetlink_set_err(net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES, + -ENOBUFS); } static int nf_tables_getgen(struct net *net, struct sock *nlsk, -- cgit v1.2.3 From 37411cad633f5e41f8a13007654909d21b19363a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 2 Mar 2017 23:48:52 +0000 Subject: rxrpc: Fix potential NULL-pointer exception Fix a potential NULL-pointer exception in rxrpc_do_sendmsg(). The call state check that I added should have gone into the else-body of the if-statement where we actually have a call to check. 
Found by CoverityScan CID#1414316 ("Dereference after null check"). Fixes: 540b1c48c37a ("rxrpc: Fix deadlock between call creation and sendmsg/recvmsg") Reported-by: Colin Ian King Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/sendmsg.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 31c1538c1a8d..27685d8cba1a 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -517,13 +517,6 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) ret = -EBADSLT; if (cmd != RXRPC_CMD_SEND_DATA) goto error_release_sock; - ret = -EBUSY; - if (call->state == RXRPC_CALL_UNINITIALISED || - call->state == RXRPC_CALL_CLIENT_AWAIT_CONN || - call->state == RXRPC_CALL_SERVER_PREALLOC || - call->state == RXRPC_CALL_SERVER_SECURING || - call->state == RXRPC_CALL_SERVER_ACCEPTING) - goto error_release_sock; call = rxrpc_new_client_call_for_sendmsg(rx, msg, user_call_ID, exclusive); /* The socket is now unlocked... */ @@ -531,6 +524,14 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) return PTR_ERR(call); /* ... and we have the call lock. */ } else { + ret = -EBUSY; + if (call->state == RXRPC_CALL_UNINITIALISED || + call->state == RXRPC_CALL_CLIENT_AWAIT_CONN || + call->state == RXRPC_CALL_SERVER_PREALLOC || + call->state == RXRPC_CALL_SERVER_SECURING || + call->state == RXRPC_CALL_SERVER_ACCEPTING) + goto error_release_sock; + ret = mutex_lock_interruptible(&call->user_mutex); release_sock(&rx->sk); if (ret < 0) { -- cgit v1.2.3 From a8d63a53b3eac8626f62336dcc327c18f1bbca78 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Fri, 3 Mar 2017 00:44:26 -0500 Subject: rds: remove unnecessary returned value check The function rds_trans_register always returns 0. As such, it is not necessary to check the returned value. Cc: Joe Jin Cc: Junxiao Bi Signed-off-by: Zhu Yanjun Reviewed-by: Yuval Shaia Acked-by: Santosh Shilimkar Signed-off-by: David S. 
Miller --- net/rds/ib.c | 6 +----- net/rds/rds.h | 2 +- net/rds/tcp.c | 6 +----- net/rds/transport.c | 4 +--- 4 files changed, 4 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/rds/ib.c b/net/rds/ib.c index 0f557b243311..7a64c8db81ab 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -438,16 +438,12 @@ int rds_ib_init(void) if (ret) goto out_sysctl; - ret = rds_trans_register(&rds_ib_transport); - if (ret) - goto out_recv; + rds_trans_register(&rds_ib_transport); rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); goto out; -out_recv: - rds_ib_recv_exit(); out_sysctl: rds_ib_sysctl_exit(); out_ibreg: diff --git a/net/rds/rds.h b/net/rds/rds.h index 07fff73dd4f3..6f523ddfe1fb 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -910,7 +910,7 @@ void rds_connect_path_complete(struct rds_conn_path *conn, int curr); void rds_connect_complete(struct rds_connection *conn); /* transport.c */ -int rds_trans_register(struct rds_transport *trans); +void rds_trans_register(struct rds_transport *trans); void rds_trans_unregister(struct rds_transport *trans); struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr); void rds_trans_put(struct rds_transport *trans); diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 5438f6725092..a973d3b4dff0 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -652,16 +652,12 @@ static int rds_tcp_init(void) if (ret) goto out_pernet; - ret = rds_trans_register(&rds_tcp_transport); - if (ret) - goto out_recv; + rds_trans_register(&rds_tcp_transport); rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); goto out; -out_recv: - rds_tcp_recv_exit(); out_pernet: unregister_pernet_subsys(&rds_tcp_net_ops); out_notifier: diff --git a/net/rds/transport.c b/net/rds/transport.c index 2ffd3e30c643..0b188dd0a344 100644 --- a/net/rds/transport.c +++ b/net/rds/transport.c @@ -40,7 +40,7 @@ static struct rds_transport *transports[RDS_TRANS_COUNT]; static DECLARE_RWSEM(rds_trans_sem); -int rds_trans_register(struct rds_transport *trans) +void rds_trans_register(struct rds_transport *trans) { BUG_ON(strlen(trans->t_name) + 1 > TRANSNAMSIZ); @@ -55,8 +55,6 @@ int rds_trans_register(struct rds_transport *trans) } up_write(&rds_trans_sem); - - return 0; } EXPORT_SYMBOL_GPL(rds_trans_register); -- cgit v1.2.3 From f78ef7cd9a0686b979679d0de061c6dbfd8d649e Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Fri, 3 Mar 2017 12:21:14 -0800 Subject: strparser: destroy workqueue on module exit Fixes: 43a0c6751a32 ("strparser: Stream parser for messages") Cc: Tom Herbert Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/strparser/strparser.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 41adf362936d..b5c279b22680 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -504,6 +504,7 @@ static int __init strp_mod_init(void) static void __exit strp_mod_exit(void) { + destroy_workqueue(strp_wq); } module_init(strp_mod_init); module_exit(strp_mod_exit); -- cgit v1.2.3
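The strparser fix above restores the usual lifetime rule for module-owned workqueues: a workqueue created in module init must be destroyed in module exit, otherwise it leaks every time the module is unloaded. A minimal sketch of that pattern, with made-up names (example_wq, example_init/example_exit) rather than the strparser code itself:

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;     /* hypothetical module-owned workqueue */

static int __init example_init(void)
{
	example_wq = create_singlethread_workqueue("example_wq");
	if (!example_wq)
		return -ENOMEM;
	return 0;
}

static void __exit example_exit(void)
{
	/* Completes any queued work, then frees the workqueue. */
	destroy_workqueue(example_wq);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");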