From 50ac6607845755e594c8a39b9c6a00d1c9b48ea4 Mon Sep 17 00:00:00 2001
From: Amitkumar Karwar <akarwar@marvell.com>
Date: Tue, 25 Jun 2013 19:03:56 -0700
Subject: cfg80211/nl80211: rename packet pattern related structures and enums

Currently packet patterns and it's enum/structures are used only
for WoWLAN feature. As we intend to reuse them for new feature
packet coalesce, they are renamed in this patch.

Older names are kept for backward compatibility purpose.

Signed-off-by: Amitkumar Karwar <akarwar@marvell.com>
Signed-off-by: Bing Zhao <bzhao@marvell.com>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
---
 net/wireless/nl80211.c | 34 ++++++++++++++++------------------
 1 file changed, 16 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 1cc47aca7f05..a044762f5ea3 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -974,7 +974,7 @@ static int nl80211_send_wowlan(struct sk_buff *msg,
 		return -ENOBUFS;
 
 	if (dev->wiphy.wowlan->n_patterns) {
-		struct nl80211_wowlan_pattern_support pat = {
+		struct nl80211_pattern_support pat = {
 			.max_patterns = dev->wiphy.wowlan->n_patterns,
 			.min_pattern_len = dev->wiphy.wowlan->pattern_min_len,
 			.max_pattern_len = dev->wiphy.wowlan->pattern_max_len,
@@ -7591,12 +7591,11 @@ static int nl80211_send_wowlan_patterns(struct sk_buff *msg,
 		if (!nl_pat)
 			return -ENOBUFS;
 		pat_len = wowlan->patterns[i].pattern_len;
-		if (nla_put(msg, NL80211_WOWLAN_PKTPAT_MASK,
-			    DIV_ROUND_UP(pat_len, 8),
+		if (nla_put(msg, NL80211_PKTPAT_MASK, DIV_ROUND_UP(pat_len, 8),
 			    wowlan->patterns[i].mask) ||
-		    nla_put(msg, NL80211_WOWLAN_PKTPAT_PATTERN,
-			    pat_len, wowlan->patterns[i].pattern) ||
-		    nla_put_u32(msg, NL80211_WOWLAN_PKTPAT_OFFSET,
+		    nla_put(msg, NL80211_PKTPAT_PATTERN, pat_len,
+			    wowlan->patterns[i].pattern) ||
+		    nla_put_u32(msg, NL80211_PKTPAT_OFFSET,
 				wowlan->patterns[i].pkt_offset))
 			return -ENOBUFS;
 		nla_nest_end(msg, nl_pat);
@@ -7937,7 +7936,7 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info)
 		struct nlattr *pat;
 		int n_patterns = 0;
 		int rem, pat_len, mask_len, pkt_offset;
-		struct nlattr *pat_tb[NUM_NL80211_WOWLAN_PKTPAT];
+		struct nlattr *pat_tb[NUM_NL80211_PKTPAT];
 
 		nla_for_each_nested(pat, tb[NL80211_WOWLAN_TRIG_PKT_PATTERN],
 				    rem)
@@ -7956,26 +7955,25 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info)
 
 		nla_for_each_nested(pat, tb[NL80211_WOWLAN_TRIG_PKT_PATTERN],
 				    rem) {
-			nla_parse(pat_tb, MAX_NL80211_WOWLAN_PKTPAT,
-				  nla_data(pat), nla_len(pat), NULL);
+			nla_parse(pat_tb, MAX_NL80211_PKTPAT, nla_data(pat),
+				  nla_len(pat), NULL);
 			err = -EINVAL;
-			if (!pat_tb[NL80211_WOWLAN_PKTPAT_MASK] ||
-			    !pat_tb[NL80211_WOWLAN_PKTPAT_PATTERN])
+			if (!pat_tb[NL80211_PKTPAT_MASK] ||
+			    !pat_tb[NL80211_PKTPAT_PATTERN])
 				goto error;
-			pat_len = nla_len(pat_tb[NL80211_WOWLAN_PKTPAT_PATTERN]);
+			pat_len = nla_len(pat_tb[NL80211_PKTPAT_PATTERN]);
 			mask_len = DIV_ROUND_UP(pat_len, 8);
-			if (nla_len(pat_tb[NL80211_WOWLAN_PKTPAT_MASK]) !=
-			    mask_len)
+			if (nla_len(pat_tb[NL80211_PKTPAT_MASK]) != mask_len)
 				goto error;
 			if (pat_len > wowlan->pattern_max_len ||
 			    pat_len < wowlan->pattern_min_len)
 				goto error;
 
-			if (!pat_tb[NL80211_WOWLAN_PKTPAT_OFFSET])
+			if (!pat_tb[NL80211_PKTPAT_OFFSET])
 				pkt_offset = 0;
 			else
 				pkt_offset = nla_get_u32(
-					pat_tb[NL80211_WOWLAN_PKTPAT_OFFSET]);
+					pat_tb[NL80211_PKTPAT_OFFSET]);
 			if (pkt_offset > wowlan->max_pkt_offset)
 				goto error;
 			new_triggers.patterns[i].pkt_offset = pkt_offset;
@@ -7989,11 +7987,11 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info)
 			new_triggers.patterns[i].pattern =
 				new_triggers.patterns[i].mask + mask_len;
 			memcpy(new_triggers.patterns[i].mask,
-			       nla_data(pat_tb[NL80211_WOWLAN_PKTPAT_MASK]),
+			       nla_data(pat_tb[NL80211_PKTPAT_MASK]),
 			       mask_len);
 			new_triggers.patterns[i].pattern_len = pat_len;
 			memcpy(new_triggers.patterns[i].pattern,
-			       nla_data(pat_tb[NL80211_WOWLAN_PKTPAT_PATTERN]),
+			       nla_data(pat_tb[NL80211_PKTPAT_PATTERN]),
 			       pat_len);
 			i++;
 		}
-- 
cgit v1.2.3


From 803768f54ef84cb4aaac0b51274b11b31885588c Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <simon.wunderlich@s2003.tu-chemnitz.de>
Date: Fri, 28 Jun 2013 10:39:58 +0200
Subject: nl80211: enable HT overrides for ibss

Signed-off-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: Mathias Kretschmer <mathias.kretschmer@fokus.fraunhofer.de>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
---
 include/net/cfg80211.h |  5 +++++
 net/wireless/nl80211.c | 13 +++++++++++++
 2 files changed, 18 insertions(+)

(limited to 'net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 207b079a8fa5..79c2277e7c66 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1592,6 +1592,9 @@ struct cfg80211_disassoc_request {
  *	user space. Otherwise, port is marked authorized by default.
  * @basic_rates: bitmap of basic rates to use when creating the IBSS
  * @mcast_rate: per-band multicast rate index + 1 (0: disabled)
+ * @ht_capa:  HT Capabilities over-rides.  Values set in ht_capa_mask
+ *   will be used in ht_capa.  Un-supported values will be ignored.
+ * @ht_capa_mask:  The bits of ht_capa which are to be used.
  */
 struct cfg80211_ibss_params {
 	u8 *ssid;
@@ -1605,6 +1608,8 @@ struct cfg80211_ibss_params {
 	bool privacy;
 	bool control_port;
 	int mcast_rate[IEEE80211_NUM_BANDS];
+	struct ieee80211_ht_cap ht_capa;
+	struct ieee80211_ht_cap ht_capa_mask;
 };
 
 /**
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index a044762f5ea3..0492478ab74e 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -6346,6 +6346,19 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
 			return err;
 	}
 
+	if (info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK])
+		memcpy(&ibss.ht_capa_mask,
+		       nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK]),
+		       sizeof(ibss.ht_capa_mask));
+
+	if (info->attrs[NL80211_ATTR_HT_CAPABILITY]) {
+		if (!info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK])
+			return -EINVAL;
+		memcpy(&ibss.ht_capa,
+		       nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]),
+		       sizeof(ibss.ht_capa));
+	}
+
 	if (info->attrs[NL80211_ATTR_MCAST_RATE] &&
 	    !nl80211_parse_mcast_rate(rdev, ibss.mcast_rate,
 			nla_get_u32(info->attrs[NL80211_ATTR_MCAST_RATE])))
-- 
cgit v1.2.3


From 822854b0b1d8f893279ca717f3d084aa5fcd9ff5 Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <simon.wunderlich@s2003.tu-chemnitz.de>
Date: Fri, 28 Jun 2013 10:39:59 +0200
Subject: mac80211: enable HT overrides for ibss

Signed-off-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: Mathias Kretschmer <mathias.kretschmer@fokus.fraunhofer.de>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
---
 net/mac80211/ht.c          | 53 +++++++++++++++++++++++++++++++++-------------
 net/mac80211/ibss.c        | 18 ++++++++++++++--
 net/mac80211/ieee80211_i.h |  3 +++
 3 files changed, 57 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index f83534f6a2ee..529bf58bc145 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -19,13 +19,14 @@
 #include "ieee80211_i.h"
 #include "rate.h"
 
-static void __check_htcap_disable(struct ieee80211_sub_if_data *sdata,
+static void __check_htcap_disable(struct ieee80211_ht_cap *ht_capa,
+				  struct ieee80211_ht_cap *ht_capa_mask,
 				  struct ieee80211_sta_ht_cap *ht_cap,
 				  u16 flag)
 {
 	__le16 le_flag = cpu_to_le16(flag);
-	if (sdata->u.mgd.ht_capa_mask.cap_info & le_flag) {
-		if (!(sdata->u.mgd.ht_capa.cap_info & le_flag))
+	if (ht_capa_mask->cap_info & le_flag) {
+		if (!(ht_capa->cap_info & le_flag))
 			ht_cap->cap &= ~flag;
 	}
 }
@@ -33,13 +34,30 @@ static void __check_htcap_disable(struct ieee80211_sub_if_data *sdata,
 void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata,
 				     struct ieee80211_sta_ht_cap *ht_cap)
 {
-	u8 *scaps = (u8 *)(&sdata->u.mgd.ht_capa.mcs.rx_mask);
-	u8 *smask = (u8 *)(&sdata->u.mgd.ht_capa_mask.mcs.rx_mask);
+	struct ieee80211_ht_cap *ht_capa, *ht_capa_mask;
+	u8 *scaps, *smask;
 	int i;
 
 	if (!ht_cap->ht_supported)
 		return;
 
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_STATION:
+		ht_capa = &sdata->u.mgd.ht_capa;
+		ht_capa_mask = &sdata->u.mgd.ht_capa_mask;
+		break;
+	case NL80211_IFTYPE_ADHOC:
+		ht_capa = &sdata->u.ibss.ht_capa;
+		ht_capa_mask = &sdata->u.ibss.ht_capa_mask;
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	scaps = (u8 *)(&ht_capa->mcs.rx_mask);
+	smask = (u8 *)(&ht_capa_mask->mcs.rx_mask);
+
 	/* NOTE:  If you add more over-rides here, update register_hw
 	 * ht_capa_mod_msk logic in main.c as well.
 	 * And, if this method can ever change ht_cap.ht_supported, fix
@@ -55,28 +73,32 @@ void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata,
 	}
 
 	/* Force removal of HT-40 capabilities? */
-	__check_htcap_disable(sdata, ht_cap, IEEE80211_HT_CAP_SUP_WIDTH_20_40);
-	__check_htcap_disable(sdata, ht_cap, IEEE80211_HT_CAP_SGI_40);
+	__check_htcap_disable(ht_capa, ht_capa_mask, ht_cap,
+			      IEEE80211_HT_CAP_SUP_WIDTH_20_40);
+	__check_htcap_disable(ht_capa, ht_capa_mask, ht_cap,
+			      IEEE80211_HT_CAP_SGI_40);
 
 	/* Allow user to disable SGI-20 (SGI-40 is handled above) */
-	__check_htcap_disable(sdata, ht_cap, IEEE80211_HT_CAP_SGI_20);
+	__check_htcap_disable(ht_capa, ht_capa_mask, ht_cap,
+			      IEEE80211_HT_CAP_SGI_20);
 
 	/* Allow user to disable the max-AMSDU bit. */
-	__check_htcap_disable(sdata, ht_cap, IEEE80211_HT_CAP_MAX_AMSDU);
+	__check_htcap_disable(ht_capa, ht_capa_mask, ht_cap,
+			      IEEE80211_HT_CAP_MAX_AMSDU);
 
 	/* Allow user to decrease AMPDU factor */
-	if (sdata->u.mgd.ht_capa_mask.ampdu_params_info &
+	if (ht_capa_mask->ampdu_params_info &
 	    IEEE80211_HT_AMPDU_PARM_FACTOR) {
-		u8 n = sdata->u.mgd.ht_capa.ampdu_params_info
-			& IEEE80211_HT_AMPDU_PARM_FACTOR;
+		u8 n = ht_capa->ampdu_params_info &
+		       IEEE80211_HT_AMPDU_PARM_FACTOR;
 		if (n < ht_cap->ampdu_factor)
 			ht_cap->ampdu_factor = n;
 	}
 
 	/* Allow the user to increase AMPDU density. */
-	if (sdata->u.mgd.ht_capa_mask.ampdu_params_info &
+	if (ht_capa_mask->ampdu_params_info &
 	    IEEE80211_HT_AMPDU_PARM_DENSITY) {
-		u8 n = (sdata->u.mgd.ht_capa.ampdu_params_info &
+		u8 n = (ht_capa->ampdu_params_info &
 			IEEE80211_HT_AMPDU_PARM_DENSITY)
 			>> IEEE80211_HT_AMPDU_PARM_DENSITY_SHIFT;
 		if (n > ht_cap->ampdu_density)
@@ -112,7 +134,8 @@ bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata,
 	 * we advertised a restricted capability set to. Override
 	 * our own capabilities and then use those below.
 	 */
-	if (sdata->vif.type == NL80211_IFTYPE_STATION &&
+	if ((sdata->vif.type == NL80211_IFTYPE_STATION ||
+	     sdata->vif.type == NL80211_IFTYPE_ADHOC) &&
 	    !test_sta_flag(sta, WLAN_STA_TDLS_PEER))
 		ieee80211_apply_htcap_overrides(sdata, &own_cap);
 
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index ea7b9c2c7e66..7f290a8562ef 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -179,8 +179,12 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	    chandef.width != NL80211_CHAN_WIDTH_5 &&
 	    chandef.width != NL80211_CHAN_WIDTH_10 &&
 	    sband->ht_cap.ht_supported) {
-		pos = ieee80211_ie_build_ht_cap(pos, &sband->ht_cap,
-						sband->ht_cap.cap);
+		struct ieee80211_sta_ht_cap ht_cap;
+
+		memcpy(&ht_cap, &sband->ht_cap, sizeof(ht_cap));
+		ieee80211_apply_htcap_overrides(sdata, &ht_cap);
+
+		pos = ieee80211_ie_build_ht_cap(pos, &ht_cap, ht_cap.cap);
 		/*
 		 * Note: According to 802.11n-2009 9.13.3.1, HT Protection
 		 * field and RIFS Mode are reserved in IBSS mode, therefore
@@ -1051,6 +1055,11 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata,
 	memcpy(sdata->u.ibss.ssid, params->ssid, params->ssid_len);
 	sdata->u.ibss.ssid_len = params->ssid_len;
 
+	memcpy(&sdata->u.ibss.ht_capa, &params->ht_capa,
+	       sizeof(sdata->u.ibss.ht_capa));
+	memcpy(&sdata->u.ibss.ht_capa_mask, &params->ht_capa_mask,
+	       sizeof(sdata->u.ibss.ht_capa_mask));
+
 	/*
 	 * 802.11n-2009 9.13.3.1: In an IBSS, the HT Protection field is
 	 * reserved, but an HT STA shall protect HT transmissions as though
@@ -1131,6 +1140,11 @@ int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata)
 	presp = rcu_dereference_protected(ifibss->presp,
 					  lockdep_is_held(&sdata->wdev.mtx));
 	RCU_INIT_POINTER(sdata->u.ibss.presp, NULL);
+
+	/* on the next join, re-program HT parameters */
+	memset(&ifibss->ht_capa, 0, sizeof(ifibss->ht_capa));
+	memset(&ifibss->ht_capa_mask, 0, sizeof(ifibss->ht_capa_mask));
+
 	sdata->vif.bss_conf.ibss_joined = false;
 	sdata->vif.bss_conf.ibss_creator = false;
 	sdata->vif.bss_conf.enable_beacon = false;
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 8412a303993a..683751a1eceb 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -509,6 +509,9 @@ struct ieee80211_if_ibss {
 	/* probe response/beacon for IBSS */
 	struct beacon_data __rcu *presp;
 
+	struct ieee80211_ht_cap ht_capa; /* configured ht-cap over-rides */
+	struct ieee80211_ht_cap ht_capa_mask; /* Valid parts of ht_capa */
+
 	spinlock_t incomplete_lock;
 	struct list_head incomplete_stations;
 
-- 
cgit v1.2.3


From a144f378a489b5900c028425cb0d0a7a3fc8c1c1 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 3 Jul 2013 13:34:02 +0200
Subject: mac80211: add per-chain signal information to radiotap

When per-chain signal information is available, don't add
the antenna field once but instead add a radiotap namespace
for each chain containing the chain/antenna number and the
signal strength on that chain.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
---
 net/mac80211/rx.c | 69 +++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 52 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 23dbcfc69b3b..c9f2ca43856a 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -87,11 +87,13 @@ ieee80211_rx_radiotap_space(struct ieee80211_local *local,
 	int len;
 
 	/* always present fields */
-	len = sizeof(struct ieee80211_radiotap_header) + 9;
+	len = sizeof(struct ieee80211_radiotap_header) + 8;
 
-	/* allocate extra bitmap */
+	/* allocate extra bitmaps */
 	if (status->vendor_radiotap_len)
 		len += 4;
+	if (status->chains)
+		len += 4 * hweight8(status->chains);
 
 	if (ieee80211_have_rx_timestamp(status)) {
 		len = ALIGN(len, 8);
@@ -100,6 +102,10 @@ ieee80211_rx_radiotap_space(struct ieee80211_local *local,
 	if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
 		len += 1;
 
+	/* antenna field, if we don't have per-chain info */
+	if (!status->chains)
+		len += 1;
+
 	/* padding for RX_FLAGS if necessary */
 	len = ALIGN(len, 2);
 
@@ -116,6 +122,11 @@ ieee80211_rx_radiotap_space(struct ieee80211_local *local,
 		len += 12;
 	}
 
+	if (status->chains) {
+		/* antenna and antenna signal fields */
+		len += 2 * hweight8(status->chains);
+	}
+
 	if (status->vendor_radiotap_len) {
 		if (WARN_ON_ONCE(status->vendor_radiotap_align == 0))
 			status->vendor_radiotap_align = 1;
@@ -145,8 +156,11 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
 	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
 	struct ieee80211_radiotap_header *rthdr;
 	unsigned char *pos;
+	__le32 *it_present;
+	u32 it_present_val;
 	u16 rx_flags = 0;
-	int mpdulen;
+	int mpdulen, chain;
+	unsigned long chains = status->chains;
 
 	mpdulen = skb->len;
 	if (!(has_fcs && (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS)))
@@ -154,25 +168,39 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
 
 	rthdr = (struct ieee80211_radiotap_header *)skb_push(skb, rtap_len);
 	memset(rthdr, 0, rtap_len);
+	it_present = &rthdr->it_present;
 
 	/* radiotap header, set always present flags */
-	rthdr->it_present =
-		cpu_to_le32((1 << IEEE80211_RADIOTAP_FLAGS) |
-			    (1 << IEEE80211_RADIOTAP_CHANNEL) |
-			    (1 << IEEE80211_RADIOTAP_ANTENNA) |
-			    (1 << IEEE80211_RADIOTAP_RX_FLAGS));
 	rthdr->it_len = cpu_to_le16(rtap_len + status->vendor_radiotap_len);
+	it_present_val = BIT(IEEE80211_RADIOTAP_FLAGS) |
+			 BIT(IEEE80211_RADIOTAP_CHANNEL) |
+			 BIT(IEEE80211_RADIOTAP_RX_FLAGS);
+
+	if (!status->chains)
+		it_present_val |= BIT(IEEE80211_RADIOTAP_ANTENNA);
 
-	pos = (unsigned char *)(rthdr + 1);
+	for_each_set_bit(chain, &chains, IEEE80211_MAX_CHAINS) {
+		it_present_val |=
+			BIT(IEEE80211_RADIOTAP_EXT) |
+			BIT(IEEE80211_RADIOTAP_RADIOTAP_NAMESPACE);
+		put_unaligned_le32(it_present_val, it_present);
+		it_present++;
+		it_present_val = BIT(IEEE80211_RADIOTAP_ANTENNA) |
+				 BIT(IEEE80211_RADIOTAP_DBM_ANTSIGNAL);
+	}
 
 	if (status->vendor_radiotap_len) {
-		rthdr->it_present |=
-			cpu_to_le32(BIT(IEEE80211_RADIOTAP_VENDOR_NAMESPACE)) |
-			cpu_to_le32(BIT(IEEE80211_RADIOTAP_EXT));
-		put_unaligned_le32(status->vendor_radiotap_bitmap, pos);
-		pos += 4;
+		it_present_val |= BIT(IEEE80211_RADIOTAP_VENDOR_NAMESPACE) |
+				  BIT(IEEE80211_RADIOTAP_EXT);
+		put_unaligned_le32(it_present_val, it_present);
+		it_present++;
+		it_present_val = status->vendor_radiotap_bitmap;
 	}
 
+	put_unaligned_le32(it_present_val, it_present);
+
+	pos = (void *)(it_present + 1);
+
 	/* the order of the following fields is important */
 
 	/* IEEE80211_RADIOTAP_TSFT */
@@ -242,9 +270,11 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
 
 	/* IEEE80211_RADIOTAP_LOCK_QUALITY is missing */
 
-	/* IEEE80211_RADIOTAP_ANTENNA */
-	*pos = status->antenna;
-	pos++;
+	if (!status->chains) {
+		/* IEEE80211_RADIOTAP_ANTENNA */
+		*pos = status->antenna;
+		pos++;
+	}
 
 	/* IEEE80211_RADIOTAP_DB_ANTNOISE is not used */
 
@@ -341,6 +371,11 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
 		pos += 2;
 	}
 
+	for_each_set_bit(chain, &chains, IEEE80211_MAX_CHAINS) {
+		*pos++ = status->chain_signal[chain];
+		*pos++ = chain;
+	}
+
 	if (status->vendor_radiotap_len) {
 		/* ensure 2 byte alignment for the vendor field as required */
 		if ((pos - (u8 *)rthdr) & 1)
-- 
cgit v1.2.3


From be29b99a9b51b0338eea3c66a58de53bbd01de24 Mon Sep 17 00:00:00 2001
From: Amitkumar Karwar <akarwar@marvell.com>
Date: Fri, 28 Jun 2013 11:51:26 -0700
Subject: cfg80211/nl80211: Add packet coalesce support

In most cases, host that receives IPv4 and IPv6 multicast/broadcast
packets does not do anything with these packets. Therefore the
reception of these unwanted packets causes unnecessary processing
and power consumption.

Packet coalesce feature helps to reduce number of received
interrupts to host by buffering these packets in firmware/hardware
for some predefined time. Received interrupt will be generated when
one of the following events occur.
a) Expiration of hardware timer whose expiration time is set to
maximum coalescing delay of matching coalesce rule.
b) Coalescing buffer in hardware reaches it's limit.
c) Packet doesn't match any of the configured coalesce rules.

This patch adds set/get configuration support for packet coalesce.
User needs to configure following parameters for creating a coalesce
rule.
a) Maximum coalescing delay
b) List of packet patterns which needs to be matched
c) Condition for coalescence. pattern 'match' or 'no match'
Multiple such rules can be created.

This feature needs to be advertised during driver initialization.
Drivers are supposed to do required firmware/hardware settings based
on user configuration.

Signed-off-by: Amitkumar Karwar <akarwar@marvell.com>
Signed-off-by: Bing Zhao <bzhao@marvell.com>
[fix kernel-doc, change free function, fix copy/paste error]
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
---
 include/net/cfg80211.h       |  54 ++++++++
 include/uapi/linux/nl80211.h |  90 ++++++++++++-
 net/wireless/core.c          |   9 ++
 net/wireless/core.h          |   2 +
 net/wireless/nl80211.c       | 308 +++++++++++++++++++++++++++++++++++++++++++
 net/wireless/nl80211.h       |   2 +
 6 files changed, 463 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 49409602fe3d..071ed2395c9a 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1780,6 +1780,35 @@ struct cfg80211_wowlan {
 	int n_patterns;
 };
 
+/**
+ * struct cfg80211_coalesce_rules - Coalesce rule parameters
+ *
+ * This structure defines coalesce rule for the device.
+ * @delay: maximum coalescing delay in msecs.
+ * @condition: condition for packet coalescence.
+ *	see &enum nl80211_coalesce_condition.
+ * @patterns: array of packet patterns
+ * @n_patterns: number of patterns
+ */
+struct cfg80211_coalesce_rules {
+	int delay;
+	enum nl80211_coalesce_condition condition;
+	struct cfg80211_pkt_pattern *patterns;
+	int n_patterns;
+};
+
+/**
+ * struct cfg80211_coalesce - Packet coalescing settings
+ *
+ * This structure defines coalescing settings.
+ * @rules: array of coalesce rules
+ * @n_rules: number of rules
+ */
+struct cfg80211_coalesce {
+	struct cfg80211_coalesce_rules *rules;
+	int n_rules;
+};
+
 /**
  * struct cfg80211_wowlan_wakeup - wakeup report
  * @disconnect: woke up by getting disconnected
@@ -2076,6 +2105,7 @@ struct cfg80211_update_ft_ies_params {
  *	driver can take the most appropriate actions.
  * @crit_proto_stop: Indicates critical protocol no longer needs increased link
  *	reliability. This operation can not fail.
+ * @set_coalesce: Set coalesce parameters.
  */
 struct cfg80211_ops {
 	int	(*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow);
@@ -2311,6 +2341,8 @@ struct cfg80211_ops {
 				    u16 duration);
 	void	(*crit_proto_stop)(struct wiphy *wiphy,
 				   struct wireless_dev *wdev);
+	int	(*set_coalesce)(struct wiphy *wiphy,
+				struct cfg80211_coalesce *coalesce);
 };
 
 /*
@@ -2536,6 +2568,25 @@ struct wiphy_wowlan_support {
 	const struct wiphy_wowlan_tcp_support *tcp;
 };
 
+/**
+ * struct wiphy_coalesce_support - coalesce support data
+ * @n_rules: maximum number of coalesce rules
+ * @max_delay: maximum supported coalescing delay in msecs
+ * @n_patterns: number of supported patterns in a rule
+ *	(see nl80211.h for the pattern definition)
+ * @pattern_max_len: maximum length of each pattern
+ * @pattern_min_len: minimum length of each pattern
+ * @max_pkt_offset: maximum Rx packet offset
+ */
+struct wiphy_coalesce_support {
+	int n_rules;
+	int max_delay;
+	int n_patterns;
+	int pattern_max_len;
+	int pattern_min_len;
+	int max_pkt_offset;
+};
+
 /**
  * struct wiphy - wireless hardware description
  * @reg_notifier: the driver's regulatory notification callback,
@@ -2646,6 +2697,7 @@ struct wiphy_wowlan_support {
  *	802.11-2012 8.4.2.29 for the defined fields.
  * @extended_capabilities_mask: mask of the valid values
  * @extended_capabilities_len: length of the extended capabilities
+ * @coalesce: packet coalescing support information
  */
 struct wiphy {
 	/* assign these fields before you register the wiphy */
@@ -2755,6 +2807,8 @@ struct wiphy {
 	const struct iw_handler_def *wext;
 #endif
 
+	const struct wiphy_coalesce_support *coalesce;
+
 	char priv[0] __aligned(NETDEV_ALIGN);
 };
 
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index de0ce809068a..5abc54d14d4d 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -125,6 +125,31 @@
  * interfaces that a given device supports.
  */
 
+/**
+ * DOC: packet coalesce support
+ *
+ * In most cases, host that receives IPv4 and IPv6 multicast/broadcast
+ * packets does not do anything with these packets. Therefore the
+ * reception of these unwanted packets causes unnecessary processing
+ * and power consumption.
+ *
+ * Packet coalesce feature helps to reduce number of received interrupts
+ * to host by buffering these packets in firmware/hardware for some
+ * predefined time. Received interrupt will be generated when one of the
+ * following events occur.
+ * a) Expiration of hardware timer whose expiration time is set to maximum
+ * coalescing delay of matching coalesce rule.
+ * b) Coalescing buffer in hardware reaches it's limit.
+ * c) Packet doesn't match any of the configured coalesce rules.
+ *
+ * User needs to configure following parameters for creating a coalesce
+ * rule.
+ * a) Maximum coalescing delay
+ * b) List of packet patterns which needs to be matched
+ * c) Condition for coalescence. pattern 'match' or 'no match'
+ * Multiple such rules can be created.
+ */
+
 /**
  * enum nl80211_commands - supported nl80211 commands
  *
@@ -648,6 +673,9 @@
  * @NL80211_CMD_CRIT_PROTOCOL_STOP: Indicates the connection reliability can
  *	return back to normal.
  *
+ * @NL80211_CMD_GET_COALESCE: Get currently supported coalesce rules.
+ * @NL80211_CMD_SET_COALESCE: Configure coalesce rules or clear existing rules.
+ *
  * @NL80211_CMD_MAX: highest used command number
  * @__NL80211_CMD_AFTER_LAST: internal use
  */
@@ -810,6 +838,9 @@ enum nl80211_commands {
 	NL80211_CMD_CRIT_PROTOCOL_START,
 	NL80211_CMD_CRIT_PROTOCOL_STOP,
 
+	NL80211_CMD_GET_COALESCE,
+	NL80211_CMD_SET_COALESCE,
+
 	/* add new commands above here */
 
 	/* used to define NL80211_CMD_MAX below */
@@ -1436,6 +1467,8 @@ enum nl80211_commands {
  *	allowed to be used with the first @NL80211_CMD_SET_STATION command to
  *	update a TDLS peer STA entry.
  *
+ * @NL80211_ATTR_COALESCE_RULE: Coalesce rule information.
+ *
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
  */
@@ -1736,6 +1769,8 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_PEER_AID,
 
+	NL80211_ATTR_COALESCE_RULE,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -3098,8 +3133,10 @@ enum nl80211_packet_pattern_attr {
  * @max_pkt_offset: maximum Rx packet offset
  *
  * This struct is carried in %NL80211_WOWLAN_TRIG_PKT_PATTERN when
- * that is part of %NL80211_ATTR_WOWLAN_TRIGGERS_SUPPORTED in the
- * capability information given by the kernel to userspace.
+ * that is part of %NL80211_ATTR_WOWLAN_TRIGGERS_SUPPORTED or in
+ * %NL80211_ATTR_COALESCE_RULE_PKT_PATTERN when that is part of
+ * %NL80211_ATTR_COALESCE_RULE in the capability information given
+ * by the kernel to userspace.
  */
 struct nl80211_pattern_support {
 	__u32 max_patterns;
@@ -3317,6 +3354,55 @@ enum nl80211_wowlan_tcp_attrs {
 	MAX_NL80211_WOWLAN_TCP = NUM_NL80211_WOWLAN_TCP - 1
 };
 
+/**
+ * struct nl80211_coalesce_rule_support - coalesce rule support information
+ * @max_rules: maximum number of rules supported
+ * @pat: packet pattern support information
+ * @max_delay: maximum supported coalescing delay in msecs
+ *
+ * This struct is carried in %NL80211_ATTR_COALESCE_RULE in the
+ * capability information given by the kernel to userspace.
+ */
+struct nl80211_coalesce_rule_support {
+	__u32 max_rules;
+	struct nl80211_pattern_support pat;
+	__u32 max_delay;
+} __attribute__((packed));
+
+/**
+ * enum nl80211_attr_coalesce_rule - coalesce rule attribute
+ * @__NL80211_COALESCE_RULE_INVALID: invalid number for nested attribute
+ * @NL80211_ATTR_COALESCE_RULE_DELAY: delay in msecs used for packet coalescing
+ * @NL80211_ATTR_COALESCE_RULE_CONDITION: condition for packet coalescence,
+ *	see &enum nl80211_coalesce_condition.
+ * @NL80211_ATTR_COALESCE_RULE_PKT_PATTERN: packet offset, pattern is matched
+ *	after these fixed number of bytes of received packet
+ * @NUM_NL80211_ATTR_COALESCE_RULE: number of attributes
+ * @NL80211_ATTR_COALESCE_RULE_MAX: max attribute number
+ */
+enum nl80211_attr_coalesce_rule {
+	__NL80211_COALESCE_RULE_INVALID,
+	NL80211_ATTR_COALESCE_RULE_DELAY,
+	NL80211_ATTR_COALESCE_RULE_CONDITION,
+	NL80211_ATTR_COALESCE_RULE_PKT_PATTERN,
+
+	/* keep last */
+	NUM_NL80211_ATTR_COALESCE_RULE,
+	NL80211_ATTR_COALESCE_RULE_MAX = NUM_NL80211_ATTR_COALESCE_RULE - 1
+};
+
+/**
+ * enum nl80211_coalesce_condition - coalesce rule conditions
+ * @NL80211_COALESCE_CONDITION_MATCH: coalaesce Rx packets when patterns
+ *	in a rule are matched.
+ * @NL80211_COALESCE_CONDITION_NO_MATCH: coalesce Rx packets when patterns
+ *	in a rule are not matched.
+ */
+enum nl80211_coalesce_condition {
+	NL80211_COALESCE_CONDITION_MATCH,
+	NL80211_COALESCE_CONDITION_NO_MATCH
+};
+
 /**
  * enum nl80211_iface_limit_attrs - limit attributes
  * @NL80211_IFACE_LIMIT_UNSPEC: (reserved)
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 4f9f216665e9..389a3f2ee464 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -462,6 +462,14 @@ int wiphy_register(struct wiphy *wiphy)
 		return -EINVAL;
 #endif
 
+	if (WARN_ON(wiphy->coalesce &&
+		    (!wiphy->coalesce->n_rules ||
+		     !wiphy->coalesce->n_patterns) &&
+		    (!wiphy->coalesce->pattern_min_len ||
+		     wiphy->coalesce->pattern_min_len >
+			wiphy->coalesce->pattern_max_len)))
+		return -EINVAL;
+
 	if (WARN_ON(wiphy->ap_sme_capa &&
 		    !(wiphy->flags & WIPHY_FLAG_HAVE_AP_SME)))
 		return -EINVAL;
@@ -668,6 +676,7 @@ void wiphy_unregister(struct wiphy *wiphy)
 		rdev_set_wakeup(rdev, false);
 #endif
 	cfg80211_rdev_free_wowlan(rdev);
+	cfg80211_rdev_free_coalesce(rdev);
 }
 EXPORT_SYMBOL(wiphy_unregister);
 
diff --git a/net/wireless/core.h b/net/wireless/core.h
index a6b45bf00f33..9ad43c619c54 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -79,6 +79,8 @@ struct cfg80211_registered_device {
 	/* netlink port which started critical protocol (0 means not started) */
 	u32 crit_proto_nlportid;
 
+	struct cfg80211_coalesce *coalesce;
+
 	/* must be last because of the way we do wiphy_priv(),
 	 * and it should at least be aligned to NETDEV_ALIGN */
 	struct wiphy wiphy __aligned(NETDEV_ALIGN);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 0492478ab74e..6dca5a700174 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -403,6 +403,14 @@ nl80211_wowlan_tcp_policy[NUM_NL80211_WOWLAN_TCP] = {
 	[NL80211_WOWLAN_TCP_WAKE_MASK] = { .len = 1 },
 };
 
+/* policy for coalesce rule attributes */
+static const struct nla_policy
+nl80211_coalesce_policy[NUM_NL80211_ATTR_COALESCE_RULE] = {
+	[NL80211_ATTR_COALESCE_RULE_DELAY] = { .type = NLA_U32 },
+	[NL80211_ATTR_COALESCE_RULE_CONDITION] = { .type = NLA_U32 },
+	[NL80211_ATTR_COALESCE_RULE_PKT_PATTERN] = { .type = NLA_NESTED },
+};
+
 /* policy for GTK rekey offload attributes */
 static const struct nla_policy
 nl80211_rekey_policy[NUM_NL80211_REKEY_DATA] = {
@@ -995,6 +1003,27 @@ static int nl80211_send_wowlan(struct sk_buff *msg,
 }
 #endif
 
+static int nl80211_send_coalesce(struct sk_buff *msg,
+				 struct cfg80211_registered_device *dev)
+{
+	struct nl80211_coalesce_rule_support rule;
+
+	if (!dev->wiphy.coalesce)
+		return 0;
+
+	rule.max_rules = dev->wiphy.coalesce->n_rules;
+	rule.max_delay = dev->wiphy.coalesce->max_delay;
+	rule.pat.max_patterns = dev->wiphy.coalesce->n_patterns;
+	rule.pat.min_pattern_len = dev->wiphy.coalesce->pattern_min_len;
+	rule.pat.max_pattern_len = dev->wiphy.coalesce->pattern_max_len;
+	rule.pat.max_pkt_offset = dev->wiphy.coalesce->max_pkt_offset;
+
+	if (nla_put(msg, NL80211_ATTR_COALESCE_RULE, sizeof(rule), &rule))
+		return -ENOBUFS;
+
+	return 0;
+}
+
 static int nl80211_send_band_rateinfo(struct sk_buff *msg,
 				      struct ieee80211_supported_band *sband)
 {
@@ -1513,6 +1542,12 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *dev,
 			    dev->wiphy.vht_capa_mod_mask))
 			goto nla_put_failure;
 
+		state->split_start++;
+		break;
+	case 10:
+		if (nl80211_send_coalesce(msg, dev))
+			goto nla_put_failure;
+
 		/* done */
 		state->split_start = 0;
 		break;
@@ -8043,6 +8078,264 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info)
 }
 #endif
 
+static int nl80211_send_coalesce_rules(struct sk_buff *msg,
+				       struct cfg80211_registered_device *rdev)
+{
+	struct nlattr *nl_pats, *nl_pat, *nl_rule, *nl_rules;
+	int i, j, pat_len;
+	struct cfg80211_coalesce_rules *rule;
+
+	if (!rdev->coalesce->n_rules)
+		return 0;
+
+	nl_rules = nla_nest_start(msg, NL80211_ATTR_COALESCE_RULE);
+	if (!nl_rules)
+		return -ENOBUFS;
+
+	for (i = 0; i < rdev->coalesce->n_rules; i++) {
+		nl_rule = nla_nest_start(msg, i + 1);
+		if (!nl_rule)
+			return -ENOBUFS;
+
+		rule = &rdev->coalesce->rules[i];
+		if (nla_put_u32(msg, NL80211_ATTR_COALESCE_RULE_DELAY,
+				rule->delay))
+			return -ENOBUFS;
+
+		if (nla_put_u32(msg, NL80211_ATTR_COALESCE_RULE_CONDITION,
+				rule->condition))
+			return -ENOBUFS;
+
+		nl_pats = nla_nest_start(msg,
+				NL80211_ATTR_COALESCE_RULE_PKT_PATTERN);
+		if (!nl_pats)
+			return -ENOBUFS;
+
+		for (j = 0; j < rule->n_patterns; j++) {
+			nl_pat = nla_nest_start(msg, j + 1);
+			if (!nl_pat)
+				return -ENOBUFS;
+			pat_len = rule->patterns[j].pattern_len;
+			if (nla_put(msg, NL80211_PKTPAT_MASK,
+				    DIV_ROUND_UP(pat_len, 8),
+				    rule->patterns[j].mask) ||
+			    nla_put(msg, NL80211_PKTPAT_PATTERN, pat_len,
+				    rule->patterns[j].pattern) ||
+			    nla_put_u32(msg, NL80211_PKTPAT_OFFSET,
+					rule->patterns[j].pkt_offset))
+				return -ENOBUFS;
+			nla_nest_end(msg, nl_pat);
+		}
+		nla_nest_end(msg, nl_pats);
+		nla_nest_end(msg, nl_rule);
+	}
+	nla_nest_end(msg, nl_rules);
+
+	return 0;
+}
+
+static int nl80211_get_coalesce(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct sk_buff *msg;
+	void *hdr;
+
+	if (!rdev->wiphy.coalesce)
+		return -EOPNOTSUPP;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
+			     NL80211_CMD_GET_COALESCE);
+	if (!hdr)
+		goto nla_put_failure;
+
+	if (rdev->coalesce && nl80211_send_coalesce_rules(msg, rdev))
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+	return genlmsg_reply(msg, info);
+
+nla_put_failure:
+	nlmsg_free(msg);
+	return -ENOBUFS;
+}
+
+void cfg80211_rdev_free_coalesce(struct cfg80211_registered_device *rdev)
+{
+	struct cfg80211_coalesce *coalesce = rdev->coalesce;
+	int i, j;
+	struct cfg80211_coalesce_rules *rule;
+
+	if (!coalesce)
+		return;
+
+	for (i = 0; i < coalesce->n_rules; i++) {
+		rule = &coalesce->rules[i];
+		for (j = 0; j < rule->n_patterns; j++)
+			kfree(rule->patterns[j].mask);
+		kfree(rule->patterns);
+	}
+	kfree(coalesce->rules);
+	kfree(coalesce);
+	rdev->coalesce = NULL;
+}
+
+static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev,
+				       struct nlattr *rule,
+				       struct cfg80211_coalesce_rules *new_rule)
+{
+	int err, i;
+	const struct wiphy_coalesce_support *coalesce = rdev->wiphy.coalesce;
+	struct nlattr *tb[NUM_NL80211_ATTR_COALESCE_RULE], *pat;
+	int rem, pat_len, mask_len, pkt_offset, n_patterns = 0;
+	struct nlattr *pat_tb[NUM_NL80211_PKTPAT];
+
+	err = nla_parse(tb, NL80211_ATTR_COALESCE_RULE_MAX, nla_data(rule),
+			nla_len(rule), nl80211_coalesce_policy);
+	if (err)
+		return err;
+
+	if (tb[NL80211_ATTR_COALESCE_RULE_DELAY])
+		new_rule->delay =
+			nla_get_u32(tb[NL80211_ATTR_COALESCE_RULE_DELAY]);
+	if (new_rule->delay > coalesce->max_delay)
+		return -EINVAL;
+
+	if (tb[NL80211_ATTR_COALESCE_RULE_CONDITION])
+		new_rule->condition =
+			nla_get_u32(tb[NL80211_ATTR_COALESCE_RULE_CONDITION]);
+	if (new_rule->condition != NL80211_COALESCE_CONDITION_MATCH &&
+	    new_rule->condition != NL80211_COALESCE_CONDITION_NO_MATCH)
+		return -EINVAL;
+
+	if (!tb[NL80211_ATTR_COALESCE_RULE_PKT_PATTERN])
+		return -EINVAL;
+
+	nla_for_each_nested(pat, tb[NL80211_ATTR_COALESCE_RULE_PKT_PATTERN],
+			    rem)
+		n_patterns++;
+	if (n_patterns > coalesce->n_patterns)
+		return -EINVAL;
+
+	new_rule->patterns = kcalloc(n_patterns, sizeof(new_rule->patterns[0]),
+				     GFP_KERNEL);
+	if (!new_rule->patterns)
+		return -ENOMEM;
+
+	new_rule->n_patterns = n_patterns;
+	i = 0;
+
+	nla_for_each_nested(pat, tb[NL80211_ATTR_COALESCE_RULE_PKT_PATTERN],
+			    rem) {
+		nla_parse(pat_tb, MAX_NL80211_PKTPAT, nla_data(pat),
+			  nla_len(pat), NULL);
+		if (!pat_tb[NL80211_PKTPAT_MASK] ||
+		    !pat_tb[NL80211_PKTPAT_PATTERN])
+			return -EINVAL;
+		pat_len = nla_len(pat_tb[NL80211_PKTPAT_PATTERN]);
+		mask_len = DIV_ROUND_UP(pat_len, 8);
+		if (nla_len(pat_tb[NL80211_PKTPAT_MASK]) != mask_len)
+			return -EINVAL;
+		if (pat_len > coalesce->pattern_max_len ||
+		    pat_len < coalesce->pattern_min_len)
+			return -EINVAL;
+
+		if (!pat_tb[NL80211_PKTPAT_OFFSET])
+			pkt_offset = 0;
+		else
+			pkt_offset = nla_get_u32(pat_tb[NL80211_PKTPAT_OFFSET]);
+		if (pkt_offset > coalesce->max_pkt_offset)
+			return -EINVAL;
+		new_rule->patterns[i].pkt_offset = pkt_offset;
+
+		new_rule->patterns[i].mask =
+			kmalloc(mask_len + pat_len, GFP_KERNEL);
+		if (!new_rule->patterns[i].mask)
+			return -ENOMEM;
+		new_rule->patterns[i].pattern =
+			new_rule->patterns[i].mask + mask_len;
+		memcpy(new_rule->patterns[i].mask,
+		       nla_data(pat_tb[NL80211_PKTPAT_MASK]), mask_len);
+		new_rule->patterns[i].pattern_len = pat_len;
+		memcpy(new_rule->patterns[i].pattern,
+		       nla_data(pat_tb[NL80211_PKTPAT_PATTERN]), pat_len);
+		i++;
+	}
+
+	return 0;
+}
+
+static int nl80211_set_coalesce(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	const struct wiphy_coalesce_support *coalesce = rdev->wiphy.coalesce;
+	struct cfg80211_coalesce new_coalesce = {};
+	struct cfg80211_coalesce *n_coalesce;
+	int err, rem_rule, n_rules = 0, i, j;
+	struct nlattr *rule;
+	struct cfg80211_coalesce_rules *tmp_rule;
+
+	if (!rdev->wiphy.coalesce || !rdev->ops->set_coalesce)
+		return -EOPNOTSUPP;
+
+	if (!info->attrs[NL80211_ATTR_COALESCE_RULE]) {
+		cfg80211_rdev_free_coalesce(rdev);
+		rdev->ops->set_coalesce(&rdev->wiphy, NULL);
+		return 0;
+	}
+
+	nla_for_each_nested(rule, info->attrs[NL80211_ATTR_COALESCE_RULE],
+			    rem_rule)
+		n_rules++;
+	if (n_rules > coalesce->n_rules)
+		return -EINVAL;
+
+	new_coalesce.rules = kcalloc(n_rules, sizeof(new_coalesce.rules[0]),
+				     GFP_KERNEL);
+	if (!new_coalesce.rules)
+		return -ENOMEM;
+
+	new_coalesce.n_rules = n_rules;
+	i = 0;
+
+	nla_for_each_nested(rule, info->attrs[NL80211_ATTR_COALESCE_RULE],
+			    rem_rule) {
+		err = nl80211_parse_coalesce_rule(rdev, rule,
+						  &new_coalesce.rules[i]);
+		if (err)
+			goto error;
+
+		i++;
+	}
+
+	err = rdev->ops->set_coalesce(&rdev->wiphy, &new_coalesce);
+	if (err)
+		goto error;
+
+	n_coalesce = kmemdup(&new_coalesce, sizeof(new_coalesce), GFP_KERNEL);
+	if (!n_coalesce) {
+		err = -ENOMEM;
+		goto error;
+	}
+	cfg80211_rdev_free_coalesce(rdev);
+	rdev->coalesce = n_coalesce;
+
+	return 0;
+error:
+	for (i = 0; i < new_coalesce.n_rules; i++) {
+		tmp_rule = &new_coalesce.rules[i];
+		for (j = 0; j < tmp_rule->n_patterns; j++)
+			kfree(tmp_rule->patterns[j].mask);
+		kfree(tmp_rule->patterns);
+	}
+	kfree(new_coalesce.rules);
+
+	return err;
+}
+
 static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info)
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
@@ -9050,6 +9343,21 @@ static struct genl_ops nl80211_ops[] = {
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = NL80211_FLAG_NEED_WDEV_UP |
 				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_GET_COALESCE,
+		.doit = nl80211_get_coalesce,
+		.policy = nl80211_policy,
+		.internal_flags = NL80211_FLAG_NEED_WIPHY |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_SET_COALESCE,
+		.doit = nl80211_set_coalesce,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_WIPHY |
+				  NL80211_FLAG_NEED_RTNL,
 	}
 };
 
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index a4073e808c13..44341bf53cfc 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -74,4 +74,6 @@ nl80211_radar_notify(struct cfg80211_registered_device *rdev,
 		     enum nl80211_radar_event event,
 		     struct net_device *netdev, gfp_t gfp);
 
+void cfg80211_rdev_free_coalesce(struct cfg80211_registered_device *rdev);
+
 #endif /* __NET_WIRELESS_NL80211_H */
-- 
cgit v1.2.3


From dcd6eac1f3b5fa1df11dfa99da0cf75b76cfef97 Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <simon.wunderlich@s2003.tu-chemnitz.de>
Date: Mon, 8 Jul 2013 16:55:49 +0200
Subject: nl80211: add scan width to bss and scan request structs

To allow scanning and working with 5 MHz and 10 MHz BSS, extend the
inform bss commands and add wrappers to take 5 and 10 MHz bss into
account.

Signed-off-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: Mathias Kretschmer <mathias.kretschmer@fokus.fraunhofer.de>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
---
 include/net/cfg80211.h       | 54 +++++++++++++++++++++++++++++++++++++++++---
 include/uapi/linux/nl80211.h | 18 +++++++++++++++
 net/wireless/nl80211.c       |  1 +
 net/wireless/scan.c          | 31 +++++++++++++++----------
 net/wireless/trace.h         | 12 ++++++----
 5 files changed, 97 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 071ed2395c9a..bae8614a450f 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1285,6 +1285,7 @@ struct cfg80211_ssid {
  * @n_ssids: number of SSIDs
  * @channels: channels to scan on.
  * @n_channels: total number of channels to scan
+ * @scan_width: channel width for scanning
  * @ie: optional information element(s) to add into Probe Request or %NULL
  * @ie_len: length of ie in octets
  * @flags: bit field of flags controlling operation
@@ -1300,6 +1301,7 @@ struct cfg80211_scan_request {
 	struct cfg80211_ssid *ssids;
 	int n_ssids;
 	u32 n_channels;
+	enum nl80211_bss_scan_width scan_width;
 	const u8 *ie;
 	size_t ie_len;
 	u32 flags;
@@ -1333,6 +1335,7 @@ struct cfg80211_match_set {
  * @ssids: SSIDs to scan for (passed in the probe_reqs in active scans)
  * @n_ssids: number of SSIDs
  * @n_channels: total number of channels to scan
+ * @scan_width: channel width for scanning
  * @interval: interval between each scheduled scan cycle
  * @ie: optional information element(s) to add into Probe Request or %NULL
  * @ie_len: length of ie in octets
@@ -1352,6 +1355,7 @@ struct cfg80211_sched_scan_request {
 	struct cfg80211_ssid *ssids;
 	int n_ssids;
 	u32 n_channels;
+	enum nl80211_bss_scan_width scan_width;
 	u32 interval;
 	const u8 *ie;
 	size_t ie_len;
@@ -1403,6 +1407,7 @@ struct cfg80211_bss_ies {
  * for use in scan results and similar.
  *
  * @channel: channel this BSS is on
+ * @scan_width: width of the control channel
  * @bssid: BSSID of the BSS
  * @beacon_interval: the beacon interval as from the frame
  * @capability: the capability field in host byte order
@@ -1424,6 +1429,7 @@ struct cfg80211_bss_ies {
  */
 struct cfg80211_bss {
 	struct ieee80211_channel *channel;
+	enum nl80211_bss_scan_width scan_width;
 
 	const struct cfg80211_bss_ies __rcu *ies;
 	const struct cfg80211_bss_ies __rcu *beacon_ies;
@@ -3438,10 +3444,11 @@ void cfg80211_sched_scan_results(struct wiphy *wiphy);
 void cfg80211_sched_scan_stopped(struct wiphy *wiphy);
 
 /**
- * cfg80211_inform_bss_frame - inform cfg80211 of a received BSS frame
+ * cfg80211_inform_bss_width_frame - inform cfg80211 of a received BSS frame
  *
  * @wiphy: the wiphy reporting the BSS
  * @channel: The channel the frame was received on
+ * @scan_width: width of the control channel
  * @mgmt: the management frame (probe response or beacon)
  * @len: length of the management frame
  * @signal: the signal strength, type depends on the wiphy's signal_type
@@ -3454,16 +3461,29 @@ void cfg80211_sched_scan_stopped(struct wiphy *wiphy);
  * Or %NULL on error.
  */
 struct cfg80211_bss * __must_check
+cfg80211_inform_bss_width_frame(struct wiphy *wiphy,
+				struct ieee80211_channel *channel,
+				enum nl80211_bss_scan_width scan_width,
+				struct ieee80211_mgmt *mgmt, size_t len,
+				s32 signal, gfp_t gfp);
+
+static inline struct cfg80211_bss * __must_check
 cfg80211_inform_bss_frame(struct wiphy *wiphy,
 			  struct ieee80211_channel *channel,
 			  struct ieee80211_mgmt *mgmt, size_t len,
-			  s32 signal, gfp_t gfp);
+			  s32 signal, gfp_t gfp)
+{
+	return cfg80211_inform_bss_width_frame(wiphy, channel,
+					       NL80211_BSS_CHAN_WIDTH_20,
+					       mgmt, len, signal, gfp);
+}
 
 /**
  * cfg80211_inform_bss - inform cfg80211 of a new BSS
  *
  * @wiphy: the wiphy reporting the BSS
  * @channel: The channel the frame was received on
+ * @scan_width: width of the control channel
  * @bssid: the BSSID of the BSS
  * @tsf: the TSF sent by the peer in the beacon/probe response (or 0)
  * @capability: the capability field sent by the peer
@@ -3480,11 +3500,26 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy,
  * Or %NULL on error.
  */
 struct cfg80211_bss * __must_check
+cfg80211_inform_bss_width(struct wiphy *wiphy,
+			  struct ieee80211_channel *channel,
+			  enum nl80211_bss_scan_width scan_width,
+			  const u8 *bssid, u64 tsf, u16 capability,
+			  u16 beacon_interval, const u8 *ie, size_t ielen,
+			  s32 signal, gfp_t gfp);
+
+static inline struct cfg80211_bss * __must_check
 cfg80211_inform_bss(struct wiphy *wiphy,
 		    struct ieee80211_channel *channel,
 		    const u8 *bssid, u64 tsf, u16 capability,
 		    u16 beacon_interval, const u8 *ie, size_t ielen,
-		    s32 signal, gfp_t gfp);
+		    s32 signal, gfp_t gfp)
+{
+	return cfg80211_inform_bss_width(wiphy, channel,
+					 NL80211_BSS_CHAN_WIDTH_20,
+					 bssid, tsf, capability,
+					 beacon_interval, ie, ielen, signal,
+					 gfp);
+}
 
 struct cfg80211_bss *cfg80211_get_bss(struct wiphy *wiphy,
 				      struct ieee80211_channel *channel,
@@ -3530,6 +3565,19 @@ void cfg80211_put_bss(struct wiphy *wiphy, struct cfg80211_bss *bss);
  */
 void cfg80211_unlink_bss(struct wiphy *wiphy, struct cfg80211_bss *bss);
 
+static inline enum nl80211_bss_scan_width
+cfg80211_chandef_to_scan_width(const struct cfg80211_chan_def *chandef)
+{
+	switch (chandef->width) {
+	case NL80211_CHAN_WIDTH_5:
+		return NL80211_BSS_CHAN_WIDTH_5;
+	case NL80211_CHAN_WIDTH_10:
+		return NL80211_BSS_CHAN_WIDTH_10;
+	default:
+		return NL80211_BSS_CHAN_WIDTH_20;
+	}
+}
+
 /**
  * cfg80211_rx_mlme_mgmt - notification of processed MLME management frame
  * @dev: network device
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 5abc54d14d4d..eb68735b3318 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2807,6 +2807,21 @@ enum nl80211_chan_width {
 	NL80211_CHAN_WIDTH_10,
 };
 
+/**
+ * enum nl80211_bss_scan_width - control channel width for a BSS
+ *
+ * These values are used with the %NL80211_BSS_CHAN_WIDTH attribute.
+ *
+ * @NL80211_BSS_CHAN_WIDTH_20: control channel is 20 MHz wide or compatible
+ * @NL80211_BSS_CHAN_WIDTH_10: control channel is 10 MHz wide
+ * @NL80211_BSS_CHAN_WIDTH_5: control channel is 5 MHz wide
+ */
+enum nl80211_bss_scan_width {
+	NL80211_BSS_CHAN_WIDTH_20,
+	NL80211_BSS_CHAN_WIDTH_10,
+	NL80211_BSS_CHAN_WIDTH_5,
+};
+
 /**
  * enum nl80211_bss - netlink attributes for a BSS
  *
@@ -2831,6 +2846,8 @@ enum nl80211_chan_width {
  * @NL80211_BSS_BEACON_IES: binary attribute containing the raw information
  *	elements from a Beacon frame (bin); not present if no Beacon frame has
  *	yet been received
+ * @NL80211_BSS_CHAN_WIDTH: channel width of the control channel
+ *	(u32, enum nl80211_bss_scan_width)
  * @__NL80211_BSS_AFTER_LAST: internal
  * @NL80211_BSS_MAX: highest BSS attribute
  */
@@ -2847,6 +2864,7 @@ enum nl80211_bss {
 	NL80211_BSS_STATUS,
 	NL80211_BSS_SEEN_MS_AGO,
 	NL80211_BSS_BEACON_IES,
+	NL80211_BSS_CHAN_WIDTH,
 
 	/* keep last */
 	__NL80211_BSS_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 6dca5a700174..ef4c312cc92c 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -5674,6 +5674,7 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb,
 		goto nla_put_failure;
 	if (nla_put_u16(msg, NL80211_BSS_CAPABILITY, res->capability) ||
 	    nla_put_u32(msg, NL80211_BSS_FREQUENCY, res->channel->center_freq) ||
+	    nla_put_u32(msg, NL80211_BSS_CHAN_WIDTH, res->scan_width) ||
 	    nla_put_u32(msg, NL80211_BSS_SEEN_MS_AGO,
 			jiffies_to_msecs(jiffies - intbss->ts)))
 		goto nla_put_failure;
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index ae8c186b50d6..ad1e4068ce06 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -651,6 +651,8 @@ static bool cfg80211_combine_bsses(struct cfg80211_registered_device *dev,
 			continue;
 		if (bss->pub.channel != new->pub.channel)
 			continue;
+		if (bss->pub.scan_width != new->pub.scan_width)
+			continue;
 		if (rcu_access_pointer(bss->pub.beacon_ies))
 			continue;
 		ies = rcu_access_pointer(bss->pub.ies);
@@ -870,11 +872,12 @@ cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen,
 
 /* Returned bss is reference counted and must be cleaned up appropriately. */
 struct cfg80211_bss*
-cfg80211_inform_bss(struct wiphy *wiphy,
-		    struct ieee80211_channel *channel,
-		    const u8 *bssid, u64 tsf, u16 capability,
-		    u16 beacon_interval, const u8 *ie, size_t ielen,
-		    s32 signal, gfp_t gfp)
+cfg80211_inform_bss_width(struct wiphy *wiphy,
+			  struct ieee80211_channel *channel,
+			  enum nl80211_bss_scan_width scan_width,
+			  const u8 *bssid, u64 tsf, u16 capability,
+			  u16 beacon_interval, const u8 *ie, size_t ielen,
+			  s32 signal, gfp_t gfp)
 {
 	struct cfg80211_bss_ies *ies;
 	struct cfg80211_internal_bss tmp = {}, *res;
@@ -892,6 +895,7 @@ cfg80211_inform_bss(struct wiphy *wiphy,
 
 	memcpy(tmp.pub.bssid, bssid, ETH_ALEN);
 	tmp.pub.channel = channel;
+	tmp.pub.scan_width = scan_width;
 	tmp.pub.signal = signal;
 	tmp.pub.beacon_interval = beacon_interval;
 	tmp.pub.capability = capability;
@@ -924,14 +928,15 @@ cfg80211_inform_bss(struct wiphy *wiphy,
 	/* cfg80211_bss_update gives us a referenced result */
 	return &res->pub;
 }
-EXPORT_SYMBOL(cfg80211_inform_bss);
+EXPORT_SYMBOL(cfg80211_inform_bss_width);
 
 /* Returned bss is reference counted and must be cleaned up appropriately. */
 struct cfg80211_bss *
-cfg80211_inform_bss_frame(struct wiphy *wiphy,
-			  struct ieee80211_channel *channel,
-			  struct ieee80211_mgmt *mgmt, size_t len,
-			  s32 signal, gfp_t gfp)
+cfg80211_inform_bss_width_frame(struct wiphy *wiphy,
+				struct ieee80211_channel *channel,
+				enum nl80211_bss_scan_width scan_width,
+				struct ieee80211_mgmt *mgmt, size_t len,
+				s32 signal, gfp_t gfp)
 {
 	struct cfg80211_internal_bss tmp = {}, *res;
 	struct cfg80211_bss_ies *ies;
@@ -941,7 +946,8 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy,
 	BUILD_BUG_ON(offsetof(struct ieee80211_mgmt, u.probe_resp.variable) !=
 			offsetof(struct ieee80211_mgmt, u.beacon.variable));
 
-	trace_cfg80211_inform_bss_frame(wiphy, channel, mgmt, len, signal);
+	trace_cfg80211_inform_bss_width_frame(wiphy, channel, scan_width, mgmt,
+					      len, signal);
 
 	if (WARN_ON(!mgmt))
 		return NULL;
@@ -976,6 +982,7 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy,
 	
 	memcpy(tmp.pub.bssid, mgmt->bssid, ETH_ALEN);
 	tmp.pub.channel = channel;
+	tmp.pub.scan_width = scan_width;
 	tmp.pub.signal = signal;
 	tmp.pub.beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int);
 	tmp.pub.capability = le16_to_cpu(mgmt->u.probe_resp.capab_info);
@@ -991,7 +998,7 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy,
 	/* cfg80211_bss_update gives us a referenced result */
 	return &res->pub;
 }
-EXPORT_SYMBOL(cfg80211_inform_bss_frame);
+EXPORT_SYMBOL(cfg80211_inform_bss_width_frame);
 
 void cfg80211_ref_bss(struct wiphy *wiphy, struct cfg80211_bss *pub)
 {
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index e1534baf2ebb..09af6eb426a8 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -2391,26 +2391,30 @@ TRACE_EVENT(cfg80211_get_bss,
 		  __entry->capa_mask, __entry->capa_val)
 );
 
-TRACE_EVENT(cfg80211_inform_bss_frame,
+TRACE_EVENT(cfg80211_inform_bss_width_frame,
 	TP_PROTO(struct wiphy *wiphy, struct ieee80211_channel *channel,
+		 enum nl80211_bss_scan_width scan_width,
 		 struct ieee80211_mgmt *mgmt, size_t len,
 		 s32 signal),
-	TP_ARGS(wiphy, channel, mgmt, len, signal),
+	TP_ARGS(wiphy, channel, scan_width, mgmt, len, signal),
 	TP_STRUCT__entry(
 		WIPHY_ENTRY
 		CHAN_ENTRY
+		__field(enum nl80211_bss_scan_width, scan_width)
 		__dynamic_array(u8, mgmt, len)
 		__field(s32, signal)
 	),
 	TP_fast_assign(
 		WIPHY_ASSIGN;
 		CHAN_ASSIGN(channel);
+		__entry->scan_width = scan_width;
 		if (mgmt)
 			memcpy(__get_dynamic_array(mgmt), mgmt, len);
 		__entry->signal = signal;
 	),
-	TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT "signal: %d",
-		  WIPHY_PR_ARG, CHAN_PR_ARG, __entry->signal)
+	TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT "(scan_width: %d) signal: %d",
+		  WIPHY_PR_ARG, CHAN_PR_ARG, __entry->scan_width,
+		  __entry->signal)
 );
 
 DECLARE_EVENT_CLASS(cfg80211_bss_evt,
-- 
cgit v1.2.3


From 3de805cf965d69c8d3d7d69368d5fd2c925a2d5a Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <simon.wunderlich@s2003.tu-chemnitz.de>
Date: Mon, 8 Jul 2013 16:55:50 +0200
Subject: mac80211/rc80211: add chandef to rate initialization

5 and 10 MHz support needs to know the current operating channel width,
add the chandef to the rate control API.

Signed-off-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: Mathias Kretschmer <mathias.kretschmer@fokus.fraunhofer.de>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
---
 drivers/net/wireless/ath/ath9k/rc.c     |  2 ++
 drivers/net/wireless/iwlegacy/3945-rs.c |  1 +
 drivers/net/wireless/iwlegacy/4965-rs.c |  1 +
 drivers/net/wireless/iwlwifi/dvm/rs.c   |  3 ++-
 drivers/net/wireless/iwlwifi/mvm/rs.c   |  5 +++--
 drivers/net/wireless/rtlwifi/rc.c       |  1 +
 include/net/mac80211.h                  |  2 ++
 net/mac80211/rate.h                     | 22 +++++++++++++++++-----
 net/mac80211/rc80211_minstrel.c         |  3 ++-
 net/mac80211/rc80211_minstrel_ht.c      | 10 +++++++---
 net/mac80211/rc80211_pid_algo.c         |  1 +
 11 files changed, 39 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/drivers/net/wireless/ath/ath9k/rc.c b/drivers/net/wireless/ath/ath9k/rc.c
index 7eb1f4b458e4..a3c4ca0c94bf 100644
--- a/drivers/net/wireless/ath/ath9k/rc.c
+++ b/drivers/net/wireless/ath/ath9k/rc.c
@@ -1275,6 +1275,7 @@ static void ath_tx_status(void *priv, struct ieee80211_supported_band *sband,
 }
 
 static void ath_rate_init(void *priv, struct ieee80211_supported_band *sband,
+			  struct cfg80211_chan_def *chandef,
                           struct ieee80211_sta *sta, void *priv_sta)
 {
 	struct ath_softc *sc = priv;
@@ -1313,6 +1314,7 @@ static void ath_rate_init(void *priv, struct ieee80211_supported_band *sband,
 }
 
 static void ath_rate_update(void *priv, struct ieee80211_supported_band *sband,
+			    struct cfg80211_chan_def *chandef,
 			    struct ieee80211_sta *sta, void *priv_sta,
 			    u32 changed)
 {
diff --git a/drivers/net/wireless/iwlegacy/3945-rs.c b/drivers/net/wireless/iwlegacy/3945-rs.c
index fe31590a51b2..aea667b430c3 100644
--- a/drivers/net/wireless/iwlegacy/3945-rs.c
+++ b/drivers/net/wireless/iwlegacy/3945-rs.c
@@ -887,6 +887,7 @@ il3945_remove_debugfs(void *il, void *il_sta)
  */
 static void
 il3945_rs_rate_init_stub(void *il_r, struct ieee80211_supported_band *sband,
+			 struct cfg80211_chan_def *chandef,
 			 struct ieee80211_sta *sta, void *il_sta)
 {
 }
diff --git a/drivers/net/wireless/iwlegacy/4965-rs.c b/drivers/net/wireless/iwlegacy/4965-rs.c
index ed3c42a63a43..3ccbaf791b48 100644
--- a/drivers/net/wireless/iwlegacy/4965-rs.c
+++ b/drivers/net/wireless/iwlegacy/4965-rs.c
@@ -2803,6 +2803,7 @@ il4965_rs_remove_debugfs(void *il, void *il_sta)
  */
 static void
 il4965_rs_rate_init_stub(void *il_r, struct ieee80211_supported_band *sband,
+			 struct cfg80211_chan_def *chandef,
 			 struct ieee80211_sta *sta, void *il_sta)
 {
 }
diff --git a/drivers/net/wireless/iwlwifi/dvm/rs.c b/drivers/net/wireless/iwlwifi/dvm/rs.c
index 1b693944123b..91eb09b9b56f 100644
--- a/drivers/net/wireless/iwlwifi/dvm/rs.c
+++ b/drivers/net/wireless/iwlwifi/dvm/rs.c
@@ -3319,7 +3319,8 @@ static void rs_remove_debugfs(void *priv, void *priv_sta)
  * station is added we ignore it.
  */
 static void rs_rate_init_stub(void *priv_r, struct ieee80211_supported_band *sband,
-			 struct ieee80211_sta *sta, void *priv_sta)
+			      struct cfg80211_chan_def *chandef,
+			      struct ieee80211_sta *sta, void *priv_sta)
 {
 }
 static struct rate_control_ops rs_ops = {
diff --git a/drivers/net/wireless/iwlwifi/mvm/rs.c b/drivers/net/wireless/iwlwifi/mvm/rs.c
index b328a988c130..376ea2112de2 100644
--- a/drivers/net/wireless/iwlwifi/mvm/rs.c
+++ b/drivers/net/wireless/iwlwifi/mvm/rs.c
@@ -3159,8 +3159,9 @@ static void rs_remove_debugfs(void *mvm, void *mvm_sta)
  * station is added we ignore it.
  */
 static void rs_rate_init_stub(void *mvm_r,
-				 struct ieee80211_supported_band *sband,
-				 struct ieee80211_sta *sta, void *mvm_sta)
+			      struct ieee80211_supported_band *sband,
+			      struct cfg80211_chan_def *chandef,
+			      struct ieee80211_sta *sta, void *mvm_sta)
 {
 }
 static struct rate_control_ops rs_mvm_ops = {
diff --git a/drivers/net/wireless/rtlwifi/rc.c b/drivers/net/wireless/rtlwifi/rc.c
index f9f059dadb73..a98acefb8c06 100644
--- a/drivers/net/wireless/rtlwifi/rc.c
+++ b/drivers/net/wireless/rtlwifi/rc.c
@@ -218,6 +218,7 @@ static void rtl_tx_status(void *ppriv,
 
 static void rtl_rate_init(void *ppriv,
 			  struct ieee80211_supported_band *sband,
+			  struct cfg80211_chan_def *chandef,
 			  struct ieee80211_sta *sta, void *priv_sta)
 {
 }
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 7c5dc787a8e8..fe5fee830aa8 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4204,8 +4204,10 @@ struct rate_control_ops {
 
 	void *(*alloc_sta)(void *priv, struct ieee80211_sta *sta, gfp_t gfp);
 	void (*rate_init)(void *priv, struct ieee80211_supported_band *sband,
+			  struct cfg80211_chan_def *chandef,
 			  struct ieee80211_sta *sta, void *priv_sta);
 	void (*rate_update)(void *priv, struct ieee80211_supported_band *sband,
+			    struct cfg80211_chan_def *chandef,
 			    struct ieee80211_sta *sta, void *priv_sta,
 			    u32 changed);
 	void (*free_sta)(void *priv, struct ieee80211_sta *sta,
diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h
index d35a5dd3fb13..5dedc56c94db 100644
--- a/net/mac80211/rate.h
+++ b/net/mac80211/rate.h
@@ -66,11 +66,12 @@ static inline void rate_control_rate_init(struct sta_info *sta)
 	}
 
 	sband = local->hw.wiphy->bands[chanctx_conf->def.chan->band];
-	rcu_read_unlock();
 
 	ieee80211_sta_set_rx_nss(sta);
 
-	ref->ops->rate_init(ref->priv, sband, ista, priv_sta);
+	ref->ops->rate_init(ref->priv, sband, &chanctx_conf->def, ista,
+			    priv_sta);
+	rcu_read_unlock();
 	set_sta_flag(sta, WLAN_STA_RATE_CONTROL);
 }
 
@@ -81,10 +82,21 @@ static inline void rate_control_rate_update(struct ieee80211_local *local,
 	struct rate_control_ref *ref = local->rate_ctrl;
 	struct ieee80211_sta *ista = &sta->sta;
 	void *priv_sta = sta->rate_ctrl_priv;
+	struct ieee80211_chanctx_conf *chanctx_conf;
+
+	if (ref && ref->ops->rate_update) {
+		rcu_read_lock();
 
-	if (ref && ref->ops->rate_update)
-		ref->ops->rate_update(ref->priv, sband, ista,
-				      priv_sta, changed);
+		chanctx_conf = rcu_dereference(sta->sdata->vif.chanctx_conf);
+		if (WARN_ON(!chanctx_conf)) {
+			rcu_read_unlock();
+			return;
+		}
+
+		ref->ops->rate_update(ref->priv, sband, &chanctx_conf->def,
+				      ista, priv_sta, changed);
+		rcu_read_unlock();
+	}
 	drv_sta_rc_update(local, sta->sdata, &sta->sta, changed);
 }
 
diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c
index ac7ef5414bde..5b25966add10 100644
--- a/net/mac80211/rc80211_minstrel.c
+++ b/net/mac80211/rc80211_minstrel.c
@@ -417,7 +417,8 @@ init_sample_table(struct minstrel_sta_info *mi)
 
 static void
 minstrel_rate_init(void *priv, struct ieee80211_supported_band *sband,
-               struct ieee80211_sta *sta, void *priv_sta)
+		   struct cfg80211_chan_def *chandef,
+		   struct ieee80211_sta *sta, void *priv_sta)
 {
 	struct minstrel_sta_info *mi = priv_sta;
 	struct minstrel_priv *mp = priv;
diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index 5b2d3012b983..52562973fbd1 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -836,6 +836,7 @@ minstrel_ht_update_cck(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
 
 static void
 minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
+			struct cfg80211_chan_def *chandef,
                         struct ieee80211_sta *sta, void *priv_sta)
 {
 	struct minstrel_priv *mp = priv;
@@ -931,22 +932,25 @@ use_legacy:
 	memset(&msp->legacy, 0, sizeof(msp->legacy));
 	msp->legacy.r = msp->ratelist;
 	msp->legacy.sample_table = msp->sample_table;
-	return mac80211_minstrel.rate_init(priv, sband, sta, &msp->legacy);
+	return mac80211_minstrel.rate_init(priv, sband, chandef, sta,
+					   &msp->legacy);
 }
 
 static void
 minstrel_ht_rate_init(void *priv, struct ieee80211_supported_band *sband,
+		      struct cfg80211_chan_def *chandef,
                       struct ieee80211_sta *sta, void *priv_sta)
 {
-	minstrel_ht_update_caps(priv, sband, sta, priv_sta);
+	minstrel_ht_update_caps(priv, sband, chandef, sta, priv_sta);
 }
 
 static void
 minstrel_ht_rate_update(void *priv, struct ieee80211_supported_band *sband,
+			struct cfg80211_chan_def *chandef,
                         struct ieee80211_sta *sta, void *priv_sta,
                         u32 changed)
 {
-	minstrel_ht_update_caps(priv, sband, sta, priv_sta);
+	minstrel_ht_update_caps(priv, sband, chandef, sta, priv_sta);
 }
 
 static void *
diff --git a/net/mac80211/rc80211_pid_algo.c b/net/mac80211/rc80211_pid_algo.c
index 502d3ecc4a79..958fad07b54c 100644
--- a/net/mac80211/rc80211_pid_algo.c
+++ b/net/mac80211/rc80211_pid_algo.c
@@ -293,6 +293,7 @@ rate_control_pid_get_rate(void *priv, struct ieee80211_sta *sta,
 
 static void
 rate_control_pid_rate_init(void *priv, struct ieee80211_supported_band *sband,
+			   struct cfg80211_chan_def *chandef,
 			   struct ieee80211_sta *sta, void *priv_sta)
 {
 	struct rc_pid_sta_info *spinfo = priv_sta;
-- 
cgit v1.2.3


From 438b61b77082e70d2a408cc77b8c5faac312e940 Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <simon.wunderlich@s2003.tu-chemnitz.de>
Date: Mon, 8 Jul 2013 16:55:51 +0200
Subject: mac80211: fix timing for 5 MHz and 10 MHz channels

according to IEEE 802.11-2012 section 18, various timings change
when using 5 MHz and 10 MHz. Reflect this by using a "shift" when
calculating durations.

Signed-off-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: Mathias Kretschmer <mathias.kretschmer@fokus.fraunhofer.de>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
---
 net/mac80211/ieee80211_i.h         | 31 +++++++++++++++++++++++++++++-
 net/mac80211/rc80211_minstrel.c    | 30 +++++++++++++++++++++++------
 net/mac80211/rc80211_minstrel_ht.c |  5 +++--
 net/mac80211/tx.c                  | 10 +++++++---
 net/mac80211/util.c                | 39 +++++++++++++++++++++++++-------------
 5 files changed, 90 insertions(+), 25 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 683751a1eceb..338e7ca21ea5 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -812,6 +812,34 @@ ieee80211_get_sdata_band(struct ieee80211_sub_if_data *sdata)
 	return band;
 }
 
+static inline int
+ieee80211_chandef_get_shift(struct cfg80211_chan_def *chandef)
+{
+	switch (chandef->width) {
+	case NL80211_CHAN_WIDTH_5:
+		return 2;
+	case NL80211_CHAN_WIDTH_10:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+static inline int
+ieee80211_vif_get_shift(struct ieee80211_vif *vif)
+{
+	struct ieee80211_chanctx_conf *chanctx_conf;
+	int shift = 0;
+
+	rcu_read_lock();
+	chanctx_conf = rcu_dereference(vif->chanctx_conf);
+	if (chanctx_conf)
+		shift = ieee80211_chandef_get_shift(&chanctx_conf->def);
+	rcu_read_unlock();
+
+	return shift;
+}
+
 enum sdata_queue_type {
 	IEEE80211_SDATA_QUEUE_TYPE_FRAME	= 0,
 	IEEE80211_SDATA_QUEUE_AGG_START		= 1,
@@ -1468,7 +1496,8 @@ extern void *mac80211_wiphy_privid; /* for wiphy privid */
 u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len,
 			enum nl80211_iftype type);
 int ieee80211_frame_duration(enum ieee80211_band band, size_t len,
-			     int rate, int erp, int short_preamble);
+			     int rate, int erp, int short_preamble,
+			     int shift);
 void mac80211_ev_michael_mic_failure(struct ieee80211_sub_if_data *sdata, int keyidx,
 				     struct ieee80211_hdr *hdr, const u8 *tsc,
 				     gfp_t gfp);
diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c
index 5b25966add10..a507376d5316 100644
--- a/net/mac80211/rc80211_minstrel.c
+++ b/net/mac80211/rc80211_minstrel.c
@@ -382,14 +382,18 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta,
 static void
 calc_rate_durations(enum ieee80211_band band,
 		    struct minstrel_rate *d,
-		    struct ieee80211_rate *rate)
+		    struct ieee80211_rate *rate,
+		    struct cfg80211_chan_def *chandef)
 {
 	int erp = !!(rate->flags & IEEE80211_RATE_ERP_G);
+	int shift = ieee80211_chandef_get_shift(chandef);
 
 	d->perfect_tx_time = ieee80211_frame_duration(band, 1200,
-			rate->bitrate, erp, 1);
+			DIV_ROUND_UP(rate->bitrate, 1 << shift), erp, 1,
+			shift);
 	d->ack_time = ieee80211_frame_duration(band, 10,
-			rate->bitrate, erp, 1);
+			DIV_ROUND_UP(rate->bitrate, 1 << shift), erp, 1,
+			shift);
 }
 
 static void
@@ -425,14 +429,17 @@ minstrel_rate_init(void *priv, struct ieee80211_supported_band *sband,
 	struct ieee80211_rate *ctl_rate;
 	unsigned int i, n = 0;
 	unsigned int t_slot = 9; /* FIXME: get real slot time */
+	u32 rate_flags;
 
 	mi->sta = sta;
 	mi->lowest_rix = rate_lowest_index(sband, sta);
 	ctl_rate = &sband->bitrates[mi->lowest_rix];
 	mi->sp_ack_dur = ieee80211_frame_duration(sband->band, 10,
 				ctl_rate->bitrate,
-				!!(ctl_rate->flags & IEEE80211_RATE_ERP_G), 1);
+				!!(ctl_rate->flags & IEEE80211_RATE_ERP_G), 1,
+				ieee80211_chandef_get_shift(chandef));
 
+	rate_flags = ieee80211_chandef_rate_flags(&mp->hw->conf.chandef);
 	memset(mi->max_tp_rate, 0, sizeof(mi->max_tp_rate));
 	mi->max_prob_rate = 0;
 
@@ -441,15 +448,22 @@ minstrel_rate_init(void *priv, struct ieee80211_supported_band *sband,
 		unsigned int tx_time = 0, tx_time_cts = 0, tx_time_rtscts = 0;
 		unsigned int tx_time_single;
 		unsigned int cw = mp->cw_min;
+		int shift;
 
 		if (!rate_supported(sta, sband->band, i))
 			continue;
+		if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
+			continue;
+
 		n++;
 		memset(mr, 0, sizeof(*mr));
 
 		mr->rix = i;
-		mr->bitrate = sband->bitrates[i].bitrate / 5;
-		calc_rate_durations(sband->band, mr, &sband->bitrates[i]);
+		shift = ieee80211_chandef_get_shift(chandef);
+		mr->bitrate = DIV_ROUND_UP(sband->bitrates[i].bitrate,
+					   (1 << shift) * 5);
+		calc_rate_durations(sband->band, mr, &sband->bitrates[i],
+				    chandef);
 
 		/* calculate maximum number of retransmissions before
 		 * fallback (based on maximum segment size) */
@@ -547,6 +561,7 @@ minstrel_init_cck_rates(struct minstrel_priv *mp)
 {
 	static const int bitrates[4] = { 10, 20, 55, 110 };
 	struct ieee80211_supported_band *sband;
+	u32 rate_flags = ieee80211_chandef_rate_flags(&mp->hw->conf.chandef);
 	int i, j;
 
 	sband = mp->hw->wiphy->bands[IEEE80211_BAND_2GHZ];
@@ -559,6 +574,9 @@ minstrel_init_cck_rates(struct minstrel_priv *mp)
 		if (rate->flags & IEEE80211_RATE_ERP_G)
 			continue;
 
+		if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
+			continue;
+
 		for (j = 0; j < ARRAY_SIZE(bitrates); j++) {
 			if (rate->bitrate != bitrates[j])
 				continue;
diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index 52562973fbd1..7475a7a33797 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -862,8 +862,9 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
 	mi->sta = sta;
 	mi->stats_update = jiffies;
 
-	ack_dur = ieee80211_frame_duration(sband->band, 10, 60, 1, 1);
-	mi->overhead = ieee80211_frame_duration(sband->band, 0, 60, 1, 1) + ack_dur;
+	ack_dur = ieee80211_frame_duration(sband->band, 10, 60, 1, 1, 0);
+	mi->overhead = ieee80211_frame_duration(sband->band, 0, 60, 1, 1, 0);
+	mi->overhead += ack_dur;
 	mi->overhead_rtscts = mi->overhead + 2 * ack_dur;
 
 	mi->avg_ampdu_len = MINSTREL_FRAC(1, 1);
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 4105d0ca963e..3523daa0b15c 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -40,7 +40,7 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx,
 				 struct sk_buff *skb, int group_addr,
 				 int next_frag_len)
 {
-	int rate, mrate, erp, dur, i;
+	int rate, mrate, erp, dur, i, shift;
 	struct ieee80211_rate *txrate;
 	struct ieee80211_local *local = tx->local;
 	struct ieee80211_supported_band *sband;
@@ -153,6 +153,8 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx,
 		rate = mrate;
 	}
 
+	shift = ieee80211_vif_get_shift(&tx->sdata->vif);
+
 	/* Don't calculate ACKs for QoS Frames with NoAck Policy set */
 	if (ieee80211_is_data_qos(hdr->frame_control) &&
 	    *(ieee80211_get_qos_ctl(hdr)) & IEEE80211_QOS_CTL_ACK_POLICY_NOACK)
@@ -162,7 +164,8 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx,
 		 * (10 bytes + 4-byte FCS = 112 bits) plus SIFS; rounded up
 		 * to closest integer */
 		dur = ieee80211_frame_duration(sband->band, 10, rate, erp,
-				tx->sdata->vif.bss_conf.use_short_preamble);
+				tx->sdata->vif.bss_conf.use_short_preamble,
+				shift);
 
 	if (next_frag_len) {
 		/* Frame is fragmented: duration increases with time needed to
@@ -171,7 +174,8 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx,
 		/* next fragment */
 		dur += ieee80211_frame_duration(sband->band, next_frag_len,
 				txrate->bitrate, erp,
-				tx->sdata->vif.bss_conf.use_short_preamble);
+				tx->sdata->vif.bss_conf.use_short_preamble,
+				shift);
 	}
 
 	return cpu_to_le16(dur);
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 22654452a561..61856e17a1e4 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -107,7 +107,8 @@ void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx)
 }
 
 int ieee80211_frame_duration(enum ieee80211_band band, size_t len,
-			     int rate, int erp, int short_preamble)
+			     int rate, int erp, int short_preamble,
+			     int shift)
 {
 	int dur;
 
@@ -118,6 +119,9 @@ int ieee80211_frame_duration(enum ieee80211_band band, size_t len,
 	 *
 	 * rate is in 100 kbps, so divident is multiplied by 10 in the
 	 * DIV_ROUND_UP() operations.
+	 *
+	 * shift may be 2 for 5 MHz channels or 1 for 10 MHz channels, and
+	 * is assumed to be 0 otherwise.
 	 */
 
 	if (band == IEEE80211_BAND_5GHZ || erp) {
@@ -130,15 +134,21 @@ int ieee80211_frame_duration(enum ieee80211_band band, size_t len,
 		 * TXTIME = T_PREAMBLE + T_SIGNAL + T_SYM x N_SYM + Signal Ext
 		 *
 		 * T_SYM = 4 usec
-		 * 802.11a - 17.5.2: aSIFSTime = 16 usec
+		 * 802.11a - 18.5.2: aSIFSTime = 16 usec
 		 * 802.11g - 19.8.4: aSIFSTime = 10 usec +
 		 *	signal ext = 6 usec
 		 */
 		dur = 16; /* SIFS + signal ext */
-		dur += 16; /* 17.3.2.3: T_PREAMBLE = 16 usec */
-		dur += 4; /* 17.3.2.3: T_SIGNAL = 4 usec */
+		dur += 16; /* IEEE 802.11-2012 18.3.2.4: T_PREAMBLE = 16 usec */
+		dur += 4; /* IEEE 802.11-2012 18.3.2.4: T_SIGNAL = 4 usec */
 		dur += 4 * DIV_ROUND_UP((16 + 8 * (len + 4) + 6) * 10,
 					4 * rate); /* T_SYM x N_SYM */
+
+		/* IEEE 802.11-2012 18.3.2.4: all values above are:
+		 *  * times 4 for 5 MHz
+		 *  * times 2 for 10 MHz
+		 */
+		dur *= 1 << shift;
 	} else {
 		/*
 		 * 802.11b or 802.11g with 802.11b compatibility:
@@ -168,7 +178,7 @@ __le16 ieee80211_generic_frame_duration(struct ieee80211_hw *hw,
 {
 	struct ieee80211_sub_if_data *sdata;
 	u16 dur;
-	int erp;
+	int erp, shift = 0;
 	bool short_preamble = false;
 
 	erp = 0;
@@ -177,10 +187,11 @@ __le16 ieee80211_generic_frame_duration(struct ieee80211_hw *hw,
 		short_preamble = sdata->vif.bss_conf.use_short_preamble;
 		if (sdata->flags & IEEE80211_SDATA_OPERATING_GMODE)
 			erp = rate->flags & IEEE80211_RATE_ERP_G;
+		shift = ieee80211_vif_get_shift(vif);
 	}
 
 	dur = ieee80211_frame_duration(band, frame_len, rate->bitrate, erp,
-				       short_preamble);
+				       short_preamble, shift);
 
 	return cpu_to_le16(dur);
 }
@@ -194,7 +205,7 @@ __le16 ieee80211_rts_duration(struct ieee80211_hw *hw,
 	struct ieee80211_rate *rate;
 	struct ieee80211_sub_if_data *sdata;
 	bool short_preamble;
-	int erp;
+	int erp, shift = 0;
 	u16 dur;
 	struct ieee80211_supported_band *sband;
 
@@ -210,17 +221,18 @@ __le16 ieee80211_rts_duration(struct ieee80211_hw *hw,
 		short_preamble = sdata->vif.bss_conf.use_short_preamble;
 		if (sdata->flags & IEEE80211_SDATA_OPERATING_GMODE)
 			erp = rate->flags & IEEE80211_RATE_ERP_G;
+		shift = ieee80211_vif_get_shift(vif);
 	}
 
 	/* CTS duration */
 	dur = ieee80211_frame_duration(sband->band, 10, rate->bitrate,
-				       erp, short_preamble);
+				       erp, short_preamble, shift);
 	/* Data frame duration */
 	dur += ieee80211_frame_duration(sband->band, frame_len, rate->bitrate,
-					erp, short_preamble);
+					erp, short_preamble, shift);
 	/* ACK duration */
 	dur += ieee80211_frame_duration(sband->band, 10, rate->bitrate,
-					erp, short_preamble);
+					erp, short_preamble, shift);
 
 	return cpu_to_le16(dur);
 }
@@ -235,7 +247,7 @@ __le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw,
 	struct ieee80211_rate *rate;
 	struct ieee80211_sub_if_data *sdata;
 	bool short_preamble;
-	int erp;
+	int erp, shift = 0;
 	u16 dur;
 	struct ieee80211_supported_band *sband;
 
@@ -250,15 +262,16 @@ __le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw,
 		short_preamble = sdata->vif.bss_conf.use_short_preamble;
 		if (sdata->flags & IEEE80211_SDATA_OPERATING_GMODE)
 			erp = rate->flags & IEEE80211_RATE_ERP_G;
+		shift = ieee80211_vif_get_shift(vif);
 	}
 
 	/* Data frame duration */
 	dur = ieee80211_frame_duration(sband->band, frame_len, rate->bitrate,
-				       erp, short_preamble);
+				       erp, short_preamble, shift);
 	if (!(frame_txctl->flags & IEEE80211_TX_CTL_NO_ACK)) {
 		/* ACK duration */
 		dur += ieee80211_frame_duration(sband->band, 10, rate->bitrate,
-						erp, short_preamble);
+						erp, short_preamble, shift);
 	}
 
 	return cpu_to_le16(dur);
-- 
cgit v1.2.3


From a5e70697d0c4836e69c60de92db27eac9ae71e05 Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <simon.wunderlich@s2003.tu-chemnitz.de>
Date: Mon, 8 Jul 2013 16:55:52 +0200
Subject: mac80211: add radiotap flag and handling for 5/10 MHz

Wireshark already defines radiotap channel flags for 5 and 10 MHz, so
just use them in Linux radiotap too. Furthermore, add rx status flags to
allow drivers to report when they received data on 5 or 10 MHz channels.

Signed-off-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: Mathias Kretschmer <mathias.kretschmer@fokus.fraunhofer.de>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
---
 include/net/ieee80211_radiotap.h |  4 ++++
 include/net/mac80211.h           |  4 ++++
 net/mac80211/rx.c                | 21 ++++++++++++---------
 3 files changed, 20 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h
index c6d07cb074bc..8b5b71433297 100644
--- a/include/net/ieee80211_radiotap.h
+++ b/include/net/ieee80211_radiotap.h
@@ -230,6 +230,10 @@ enum ieee80211_radiotap_type {
 #define	IEEE80211_CHAN_PASSIVE	0x0200	/* Only passive scan allowed */
 #define	IEEE80211_CHAN_DYN	0x0400	/* Dynamic CCK-OFDM channel */
 #define	IEEE80211_CHAN_GFSK	0x0800	/* GFSK channel (FHSS PHY) */
+#define	IEEE80211_CHAN_GSM	0x1000	/* GSM (900 MHz) */
+#define	IEEE80211_CHAN_STURBO	0x2000	/* Static Turbo */
+#define	IEEE80211_CHAN_HALF	0x4000	/* Half channel (10 MHz wide) */
+#define	IEEE80211_CHAN_QUARTER	0x8000	/* Quarter channel (5 MHz wide) */
 
 /* For IEEE80211_RADIOTAP_FLAGS */
 #define	IEEE80211_RADIOTAP_F_CFP	0x01	/* sent/received
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index fe5fee830aa8..3124036285eb 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -811,6 +811,8 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info)
  * @RX_FLAG_AMPDU_DELIM_CRC_KNOWN: The delimiter CRC field is known (the CRC
  *	is stored in the @ampdu_delimiter_crc field)
  * @RX_FLAG_STBC_MASK: STBC 2 bit bitmask. 1 - Nss=1, 2 - Nss=2, 3 - Nss=3
+ * @RX_FLAG_10MHZ: 10 MHz (half channel) was used
+ * @RX_FLAG_5MHZ: 5 MHz (quarter channel) was used
  */
 enum mac80211_rx_flags {
 	RX_FLAG_MMIC_ERROR		= BIT(0),
@@ -839,6 +841,8 @@ enum mac80211_rx_flags {
 	RX_FLAG_80P80MHZ		= BIT(24),
 	RX_FLAG_160MHZ			= BIT(25),
 	RX_FLAG_STBC_MASK		= BIT(26) | BIT(27),
+	RX_FLAG_10MHZ			= BIT(28),
+	RX_FLAG_5MHZ			= BIT(29),
 };
 
 #define RX_FLAG_STBC_SHIFT		26
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index c9f2ca43856a..deeeca1299bd 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -159,6 +159,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
 	__le32 *it_present;
 	u32 it_present_val;
 	u16 rx_flags = 0;
+	u16 channel_flags = 0;
 	int mpdulen, chain;
 	unsigned long chains = status->chains;
 
@@ -243,20 +244,22 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
 	/* IEEE80211_RADIOTAP_CHANNEL */
 	put_unaligned_le16(status->freq, pos);
 	pos += 2;
+	if (status->flag & RX_FLAG_10MHZ)
+		channel_flags |= IEEE80211_CHAN_HALF;
+	else if (status->flag & RX_FLAG_5MHZ)
+		channel_flags |= IEEE80211_CHAN_QUARTER;
+
 	if (status->band == IEEE80211_BAND_5GHZ)
-		put_unaligned_le16(IEEE80211_CHAN_OFDM | IEEE80211_CHAN_5GHZ,
-				   pos);
+		channel_flags |= IEEE80211_CHAN_OFDM | IEEE80211_CHAN_5GHZ;
 	else if (status->flag & (RX_FLAG_HT | RX_FLAG_VHT))
-		put_unaligned_le16(IEEE80211_CHAN_DYN | IEEE80211_CHAN_2GHZ,
-				   pos);
+		channel_flags |= IEEE80211_CHAN_DYN | IEEE80211_CHAN_2GHZ;
 	else if (rate && rate->flags & IEEE80211_RATE_ERP_G)
-		put_unaligned_le16(IEEE80211_CHAN_OFDM | IEEE80211_CHAN_2GHZ,
-				   pos);
+		channel_flags |= IEEE80211_CHAN_OFDM | IEEE80211_CHAN_2GHZ;
 	else if (rate)
-		put_unaligned_le16(IEEE80211_CHAN_CCK | IEEE80211_CHAN_2GHZ,
-				   pos);
+		channel_flags |= IEEE80211_CHAN_OFDM | IEEE80211_CHAN_2GHZ;
 	else
-		put_unaligned_le16(IEEE80211_CHAN_2GHZ, pos);
+		channel_flags |= IEEE80211_CHAN_2GHZ;
+	put_unaligned_le16(channel_flags, pos);
 	pos += 2;
 
 	/* IEEE80211_RADIOTAP_DBM_ANTSIGNAL */
-- 
cgit v1.2.3


From 2103dec14792be2c2194a454630b01120d30e5cb Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <simon.wunderlich@s2003.tu-chemnitz.de>
Date: Mon, 8 Jul 2013 16:55:53 +0200
Subject: mac80211: select and adjust bitrates according to channel mode

The various components accessing the bitrates table must use consider
the used channel bandwidth to select only available rates or calculate
the bitrate correctly.

There are some rates in reduced bandwidth modes which can't be
represented as multiples of 500kbps, like 2.25 MBit/s in 5 MHz mode. The
standard suggests to round up to the next multiple of 500kbps, just do
that in mac80211 as well.

Signed-off-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: Mathias Kretschmer <mathias.kretschmer@fokus.fraunhofer.de>
[make rate unsigned in ieee80211_add_tx_radiotap_header(), squash fix]
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
---
 net/mac80211/cfg.c         |  45 +++++--------
 net/mac80211/ibss.c        |  81 +++++++++++++++++------
 net/mac80211/ieee80211_i.h |   7 +-
 net/mac80211/mesh.c        |   3 +-
 net/mac80211/mesh_plink.c  |   2 +-
 net/mac80211/mlme.c        |  95 +++++++++++++++------------
 net/mac80211/rate.c        |  46 +++++++------
 net/mac80211/rx.c          |   7 +-
 net/mac80211/scan.c        |  27 +++++++-
 net/mac80211/status.c      |  18 +++--
 net/mac80211/tx.c          |  21 ++++--
 net/mac80211/util.c        | 159 ++++++++++++++++++++++++++++++++++++---------
 12 files changed, 352 insertions(+), 159 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 8184d121ff09..b82fff6c0b30 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -395,9 +395,13 @@ void sta_set_rate_info_tx(struct sta_info *sta,
 		rinfo->nss = ieee80211_rate_get_vht_nss(rate);
 	} else {
 		struct ieee80211_supported_band *sband;
+		int shift = ieee80211_vif_get_shift(&sta->sdata->vif);
+		u16 brate;
+
 		sband = sta->local->hw.wiphy->bands[
 				ieee80211_get_sdata_band(sta->sdata)];
-		rinfo->legacy = sband->bitrates[rate->idx].bitrate;
+		brate = sband->bitrates[rate->idx].bitrate;
+		rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift);
 	}
 	if (rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH)
 		rinfo->flags |= RATE_INFO_FLAGS_40_MHZ_WIDTH;
@@ -422,11 +426,13 @@ void sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo)
 		rinfo->mcs = sta->last_rx_rate_idx;
 	} else {
 		struct ieee80211_supported_band *sband;
+		int shift = ieee80211_vif_get_shift(&sta->sdata->vif);
+		u16 brate;
 
 		sband = sta->local->hw.wiphy->bands[
 				ieee80211_get_sdata_band(sta->sdata)];
-		rinfo->legacy =
-			sband->bitrates[sta->last_rx_rate_idx].bitrate;
+		brate = sband->bitrates[sta->last_rx_rate_idx].bitrate;
+		rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift);
 	}
 
 	if (sta->last_rx_rate_flag & RX_FLAG_40MHZ)
@@ -1190,8 +1196,6 @@ static int sta_apply_parameters(struct ieee80211_local *local,
 				struct station_parameters *params)
 {
 	int ret = 0;
-	u32 rates;
-	int i, j;
 	struct ieee80211_supported_band *sband;
 	struct ieee80211_sub_if_data *sdata = sta->sdata;
 	enum ieee80211_band band = ieee80211_get_sdata_band(sdata);
@@ -1284,16 +1288,10 @@ static int sta_apply_parameters(struct ieee80211_local *local,
 		sta->listen_interval = params->listen_interval;
 
 	if (params->supported_rates) {
-		rates = 0;
-
-		for (i = 0; i < params->supported_rates_len; i++) {
-			int rate = (params->supported_rates[i] & 0x7f) * 5;
-			for (j = 0; j < sband->n_bitrates; j++) {
-				if (sband->bitrates[j].bitrate == rate)
-					rates |= BIT(j);
-			}
-		}
-		sta->sta.supp_rates[band] = rates;
+		ieee80211_parse_bitrates(&sdata->vif.bss_conf.chandef,
+					 sband, params->supported_rates,
+					 params->supported_rates_len,
+					 &sta->sta.supp_rates[band]);
 	}
 
 	if (params->ht_capa)
@@ -1956,18 +1954,11 @@ static int ieee80211_change_bss(struct wiphy *wiphy,
 	}
 
 	if (params->basic_rates) {
-		int i, j;
-		u32 rates = 0;
-		struct ieee80211_supported_band *sband = wiphy->bands[band];
-
-		for (i = 0; i < params->basic_rates_len; i++) {
-			int rate = (params->basic_rates[i] & 0x7f) * 5;
-			for (j = 0; j < sband->n_bitrates; j++) {
-				if (sband->bitrates[j].bitrate == rate)
-					rates |= BIT(j);
-			}
-		}
-		sdata->vif.bss_conf.basic_rates = rates;
+		ieee80211_parse_bitrates(&sdata->vif.bss_conf.chandef,
+					 wiphy->bands[band],
+					 params->basic_rates,
+					 params->basic_rates_len,
+					 &sdata->vif.bss_conf.basic_rates);
 		changed |= BSS_CHANGED_BASIC_RATES;
 	}
 
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index 7f290a8562ef..272a3b37615c 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -43,16 +43,17 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 {
 	struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
 	struct ieee80211_local *local = sdata->local;
-	int rates, i;
+	int rates_n = 0, i, ri;
 	struct ieee80211_mgmt *mgmt;
 	u8 *pos;
 	struct ieee80211_supported_band *sband;
 	struct cfg80211_bss *bss;
-	u32 bss_change;
+	u32 bss_change, rate_flags, rates = 0, rates_added = 0;
 	u8 supp_rates[IEEE80211_MAX_SUPP_RATES];
 	struct cfg80211_chan_def chandef;
 	struct beacon_data *presp;
 	int frame_len;
+	int shift;
 
 	sdata_assert_lock(sdata);
 
@@ -99,6 +100,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	memcpy(ifibss->bssid, bssid, ETH_ALEN);
 
 	sband = local->hw.wiphy->bands[chan->band];
+	shift = ieee80211_vif_get_shift(&sdata->vif);
 
 	/* Build IBSS probe response */
 	frame_len = sizeof(struct ieee80211_hdr_3addr) +
@@ -134,15 +136,29 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	memcpy(pos, ifibss->ssid, ifibss->ssid_len);
 	pos += ifibss->ssid_len;
 
-	rates = min_t(int, 8, sband->n_bitrates);
+	rate_flags = ieee80211_chandef_rate_flags(&chandef);
+	for (i = 0; i < sband->n_bitrates; i++) {
+		if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
+			continue;
+
+		rates |= BIT(i);
+		rates_n++;
+	}
+
 	*pos++ = WLAN_EID_SUPP_RATES;
-	*pos++ = rates;
-	for (i = 0; i < rates; i++) {
-		int rate = sband->bitrates[i].bitrate;
+	*pos++ = min_t(int, 8, rates_n);
+	for (ri = 0; ri < sband->n_bitrates; ri++) {
+		int rate = DIV_ROUND_UP(sband->bitrates[ri].bitrate,
+					5 * (1 << shift));
 		u8 basic = 0;
-		if (basic_rates & BIT(i))
+		if (!(rates & BIT(ri)))
+			continue;
+
+		if (basic_rates & BIT(ri))
 			basic = 0x80;
-		*pos++ = basic | (u8) (rate / 5);
+		*pos++ = basic | (u8) rate;
+		if (++rates_added == 8)
+			break;
 	}
 
 	if (sband->band == IEEE80211_BAND_2GHZ) {
@@ -157,15 +173,20 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	*pos++ = 0;
 	*pos++ = 0;
 
-	if (sband->n_bitrates > 8) {
+	/* put the remaining rates in WLAN_EID_EXT_SUPP_RATES */
+	if (rates_n > 8) {
 		*pos++ = WLAN_EID_EXT_SUPP_RATES;
-		*pos++ = sband->n_bitrates - 8;
-		for (i = 8; i < sband->n_bitrates; i++) {
-			int rate = sband->bitrates[i].bitrate;
+		*pos++ = rates_n - 8;
+		for (; ri < sband->n_bitrates; ri++) {
+			int rate = DIV_ROUND_UP(sband->bitrates[ri].bitrate,
+						5 * (1 << shift));
 			u8 basic = 0;
-			if (basic_rates & BIT(i))
+			if (!(rates & BIT(ri)))
+				continue;
+
+			if (basic_rates & BIT(ri))
 				basic = 0x80;
-			*pos++ = basic | (u8) (rate / 5);
+			*pos++ = basic | (u8) rate;
 		}
 	}
 
@@ -244,7 +265,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	sdata->vif.bss_conf.ibss_creator = creator;
 	ieee80211_bss_info_change_notify(sdata, bss_change);
 
-	ieee80211_sta_def_wmm_params(sdata, sband->n_bitrates, supp_rates);
+	ieee80211_sta_def_wmm_params(sdata, rates, supp_rates);
 
 	ifibss->state = IEEE80211_IBSS_MLME_JOINED;
 	mod_timer(&ifibss->timer,
@@ -268,6 +289,8 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	u16 beacon_int = cbss->beacon_interval;
 	const struct cfg80211_bss_ies *ies;
 	u64 tsf;
+	u32 rate_flags;
+	int shift;
 
 	sdata_assert_lock(sdata);
 
@@ -275,15 +298,24 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 		beacon_int = 10;
 
 	sband = sdata->local->hw.wiphy->bands[cbss->channel->band];
+	rate_flags = ieee80211_chandef_rate_flags(&sdata->u.ibss.chandef);
+	shift = ieee80211_vif_get_shift(&sdata->vif);
 
 	basic_rates = 0;
 
 	for (i = 0; i < bss->supp_rates_len; i++) {
-		int rate = (bss->supp_rates[i] & 0x7f) * 5;
+		int rate = bss->supp_rates[i] & 0x7f;
 		bool is_basic = !!(bss->supp_rates[i] & 0x80);
 
 		for (j = 0; j < sband->n_bitrates; j++) {
-			if (sband->bitrates[j].bitrate == rate) {
+			int brate;
+			if ((rate_flags & sband->bitrates[j].flags)
+			    != rate_flags)
+				continue;
+
+			brate = DIV_ROUND_UP(sband->bitrates[j].bitrate,
+					     5 * (1 << shift));
+			if (brate == rate) {
 				if (is_basic)
 					basic_rates |= BIT(j);
 				break;
@@ -465,7 +497,7 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 		sta = sta_info_get(sdata, mgmt->sa);
 
 		if (elems->supp_rates) {
-			supp_rates = ieee80211_sta_get_rates(local, elems,
+			supp_rates = ieee80211_sta_get_rates(sdata, elems,
 							     band, NULL);
 			if (sta) {
 				u32 prev_rates;
@@ -589,7 +621,7 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 			 "beacon TSF higher than local TSF - IBSS merge with BSSID %pM\n",
 			 mgmt->bssid);
 		ieee80211_sta_join_ibss(sdata, bss);
-		supp_rates = ieee80211_sta_get_rates(local, elems, band, NULL);
+		supp_rates = ieee80211_sta_get_rates(sdata, elems, band, NULL);
 		ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa,
 				       supp_rates);
 		rcu_read_unlock();
@@ -1024,6 +1056,9 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata,
 			struct cfg80211_ibss_params *params)
 {
 	u32 changed = 0;
+	u32 rate_flags;
+	struct ieee80211_supported_band *sband;
+	int i;
 
 	if (params->bssid) {
 		memcpy(sdata->u.ibss.bssid, params->bssid, ETH_ALEN);
@@ -1034,6 +1069,14 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata,
 	sdata->u.ibss.privacy = params->privacy;
 	sdata->u.ibss.control_port = params->control_port;
 	sdata->u.ibss.basic_rates = params->basic_rates;
+
+	/* fix basic_rates if channel does not support these rates */
+	rate_flags = ieee80211_chandef_rate_flags(&params->chandef);
+	sband = sdata->local->hw.wiphy->bands[params->chandef.chan->band];
+	for (i = 0; i < sband->n_bitrates; i++) {
+		if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
+			sdata->u.ibss.basic_rates &= ~BIT(i);
+	}
 	memcpy(sdata->vif.bss_conf.mcast_rate, params->mcast_rate,
 	       sizeof(params->mcast_rate));
 
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 338e7ca21ea5..6596da66bfaf 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1601,7 +1601,7 @@ void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata,
 int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer,
 			     size_t buffer_len, const u8 *ie, size_t ie_len,
 			     enum ieee80211_band band, u32 rate_mask,
-			     u8 channel);
+			     struct cfg80211_chan_def *chandef);
 struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata,
 					  u8 *dst, u32 ratemask,
 					  struct ieee80211_channel *chan,
@@ -1617,7 +1617,7 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
 void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata,
 				  const size_t supp_rates_len,
 				  const u8 *supp_rates);
-u32 ieee80211_sta_get_rates(struct ieee80211_local *local,
+u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata,
 			    struct ieee802_11_elems *elems,
 			    enum ieee80211_band band, u32 *basic_rates);
 int __ieee80211_request_smps(struct ieee80211_sub_if_data *sdata,
@@ -1634,6 +1634,9 @@ u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap,
 			       u16 prot_mode);
 u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
 			       u32 cap);
+int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef,
+			     const struct ieee80211_supported_band *sband,
+			     const u8 *srates, int srates_len, u32 *rates);
 int ieee80211_add_srates_ie(struct ieee80211_sub_if_data *sdata,
 			    struct sk_buff *skb, bool need_basic,
 			    enum ieee80211_band band);
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 447f41bbe744..e536f22fbeca 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -62,7 +62,6 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata,
 			struct ieee802_11_elems *ie)
 {
 	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
-	struct ieee80211_local *local = sdata->local;
 	u32 basic_rates = 0;
 	struct cfg80211_chan_def sta_chan_def;
 
@@ -85,7 +84,7 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata,
 	     (ifmsh->mesh_auth_id == ie->mesh_config->meshconf_auth)))
 		return false;
 
-	ieee80211_sta_get_rates(local, ie, ieee80211_get_sdata_band(sdata),
+	ieee80211_sta_get_rates(sdata, ie, ieee80211_get_sdata_band(sdata),
 				&basic_rates);
 
 	if (sdata->vif.bss_conf.basic_rates != basic_rates)
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index 02c05fa15c20..6b65d5055f5b 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -379,7 +379,7 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata,
 	u32 rates, basic_rates = 0, changed = 0;
 
 	sband = local->hw.wiphy->bands[band];
-	rates = ieee80211_sta_get_rates(local, elems, band, &basic_rates);
+	rates = ieee80211_sta_get_rates(sdata, elems, band, &basic_rates);
 
 	spin_lock_bh(&sta->lock);
 	sta->last_rx = jiffies;
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index ae31968d42d3..f7552c2b74eb 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -478,27 +478,6 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata,
 
 /* frame sending functions */
 
-static int ieee80211_compatible_rates(const u8 *supp_rates, int supp_rates_len,
-				      struct ieee80211_supported_band *sband,
-				      u32 *rates)
-{
-	int i, j, count;
-	*rates = 0;
-	count = 0;
-	for (i = 0; i < supp_rates_len; i++) {
-		int rate = (supp_rates[i] & 0x7F) * 5;
-
-		for (j = 0; j < sband->n_bitrates; j++)
-			if (sband->bitrates[j].bitrate == rate) {
-				*rates |= BIT(j);
-				count++;
-				break;
-			}
-	}
-
-	return count;
-}
-
 static void ieee80211_add_ht_ie(struct ieee80211_sub_if_data *sdata,
 				struct sk_buff *skb, u8 ap_ht_param,
 				struct ieee80211_supported_band *sband,
@@ -617,12 +596,12 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
 	struct ieee80211_mgmt *mgmt;
 	u8 *pos, qos_info;
 	size_t offset = 0, noffset;
-	int i, count, rates_len, supp_rates_len;
+	int i, count, rates_len, supp_rates_len, shift;
 	u16 capab;
 	struct ieee80211_supported_band *sband;
 	struct ieee80211_chanctx_conf *chanctx_conf;
 	struct ieee80211_channel *chan;
-	u32 rates = 0;
+	u32 rate_flags, rates = 0;
 
 	sdata_assert_lock(sdata);
 
@@ -633,8 +612,10 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
 		return;
 	}
 	chan = chanctx_conf->def.chan;
+	rate_flags = ieee80211_chandef_rate_flags(&chanctx_conf->def);
 	rcu_read_unlock();
 	sband = local->hw.wiphy->bands[chan->band];
+	shift = ieee80211_vif_get_shift(&sdata->vif);
 
 	if (assoc_data->supp_rates_len) {
 		/*
@@ -643,17 +624,24 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
 		 * in the association request (e.g. D-Link DAP 1353 in
 		 * b-only mode)...
 		 */
-		rates_len = ieee80211_compatible_rates(assoc_data->supp_rates,
-						       assoc_data->supp_rates_len,
-						       sband, &rates);
+		rates_len = ieee80211_parse_bitrates(&chanctx_conf->def, sband,
+						     assoc_data->supp_rates,
+						     assoc_data->supp_rates_len,
+						     &rates);
 	} else {
 		/*
 		 * In case AP not provide any supported rates information
 		 * before association, we send information element(s) with
 		 * all rates that we support.
 		 */
-		rates = ~0;
-		rates_len = sband->n_bitrates;
+		rates_len = 0;
+		for (i = 0; i < sband->n_bitrates; i++) {
+			if ((rate_flags & sband->bitrates[i].flags)
+			    != rate_flags)
+				continue;
+			rates |= BIT(i);
+			rates_len++;
+		}
 	}
 
 	skb = alloc_skb(local->hw.extra_tx_headroom +
@@ -730,8 +718,9 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
 	count = 0;
 	for (i = 0; i < sband->n_bitrates; i++) {
 		if (BIT(i) & rates) {
-			int rate = sband->bitrates[i].bitrate;
-			*pos++ = (u8) (rate / 5);
+			int rate = DIV_ROUND_UP(sband->bitrates[i].bitrate,
+						5 * (1 << shift));
+			*pos++ = (u8) rate;
 			if (++count == 8)
 				break;
 		}
@@ -744,8 +733,10 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
 
 		for (i++; i < sband->n_bitrates; i++) {
 			if (BIT(i) & rates) {
-				int rate = sband->bitrates[i].bitrate;
-				*pos++ = (u8) (rate / 5);
+				int rate;
+				rate = DIV_ROUND_UP(sband->bitrates[i].bitrate,
+						    5 * (1 << shift));
+				*pos++ = (u8) rate;
 			}
 		}
 	}
@@ -2432,15 +2423,16 @@ static void ieee80211_get_rates(struct ieee80211_supported_band *sband,
 				u8 *supp_rates, unsigned int supp_rates_len,
 				u32 *rates, u32 *basic_rates,
 				bool *have_higher_than_11mbit,
-				int *min_rate, int *min_rate_index)
+				int *min_rate, int *min_rate_index,
+				int shift, u32 rate_flags)
 {
 	int i, j;
 
 	for (i = 0; i < supp_rates_len; i++) {
-		int rate = (supp_rates[i] & 0x7f) * 5;
+		int rate = supp_rates[i] & 0x7f;
 		bool is_basic = !!(supp_rates[i] & 0x80);
 
-		if (rate > 110)
+		if ((rate * 5 * (1 << shift)) > 110)
 			*have_higher_than_11mbit = true;
 
 		/*
@@ -2456,12 +2448,20 @@ static void ieee80211_get_rates(struct ieee80211_supported_band *sband,
 			continue;
 
 		for (j = 0; j < sband->n_bitrates; j++) {
-			if (sband->bitrates[j].bitrate == rate) {
+			struct ieee80211_rate *br;
+			int brate;
+
+			br = &sband->bitrates[j];
+			if ((rate_flags & br->flags) != rate_flags)
+				continue;
+
+			brate = DIV_ROUND_UP(br->bitrate, (1 << shift) * 5);
+			if (brate == rate) {
 				*rates |= BIT(j);
 				if (is_basic)
 					*basic_rates |= BIT(j);
-				if (rate < *min_rate) {
-					*min_rate = rate;
+				if ((rate * 5) < *min_rate) {
+					*min_rate = rate * 5;
 					*min_rate_index = j;
 				}
 				break;
@@ -3884,27 +3884,40 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata,
 		if (!new_sta)
 			return -ENOMEM;
 	}
-
 	if (new_sta) {
 		u32 rates = 0, basic_rates = 0;
 		bool have_higher_than_11mbit;
 		int min_rate = INT_MAX, min_rate_index = -1;
+		struct ieee80211_chanctx_conf *chanctx_conf;
 		struct ieee80211_supported_band *sband;
 		const struct cfg80211_bss_ies *ies;
+		int shift;
+		u32 rate_flags;
 
 		sband = local->hw.wiphy->bands[cbss->channel->band];
 
 		err = ieee80211_prep_channel(sdata, cbss);
 		if (err) {
 			sta_info_free(local, new_sta);
-			return err;
+			return -EINVAL;
 		}
+		shift = ieee80211_vif_get_shift(&sdata->vif);
+
+		rcu_read_lock();
+		chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+		if (WARN_ON(!chanctx_conf)) {
+			rcu_read_unlock();
+			return -EINVAL;
+		}
+		rate_flags = ieee80211_chandef_rate_flags(&chanctx_conf->def);
+		rcu_read_unlock();
 
 		ieee80211_get_rates(sband, bss->supp_rates,
 				    bss->supp_rates_len,
 				    &rates, &basic_rates,
 				    &have_higher_than_11mbit,
-				    &min_rate, &min_rate_index);
+				    &min_rate, &min_rate_index,
+				    shift, rate_flags);
 
 		/*
 		 * This used to be a workaround for basic rates missing
diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c
index 30d58d2d13e2..ba63ac851c2b 100644
--- a/net/mac80211/rate.c
+++ b/net/mac80211/rate.c
@@ -232,37 +232,28 @@ static void rc_send_low_broadcast(s8 *idx, u32 basic_rates,
 	/* could not find a basic rate; use original selection */
 }
 
-static inline s8
-rate_lowest_non_cck_index(struct ieee80211_supported_band *sband,
-			  struct ieee80211_sta *sta)
+static void __rate_control_send_low(struct ieee80211_hw *hw,
+				    struct ieee80211_supported_band *sband,
+				    struct ieee80211_sta *sta,
+				    struct ieee80211_tx_info *info)
 {
 	int i;
+	u32 rate_flags =
+		ieee80211_chandef_rate_flags(&hw->conf.chandef);
 
+	if ((sband->band == IEEE80211_BAND_2GHZ) &&
+	    (info->flags & IEEE80211_TX_CTL_NO_CCK_RATE))
+		rate_flags |= IEEE80211_RATE_ERP_G;
+
+	info->control.rates[0].idx = 0;
 	for (i = 0; i < sband->n_bitrates; i++) {
-		struct ieee80211_rate *srate = &sband->bitrates[i];
-		if ((srate->bitrate == 10) || (srate->bitrate == 20) ||
-		    (srate->bitrate == 55) || (srate->bitrate == 110))
+		if (!rate_supported(sta, sband->band, i))
 			continue;
 
-		if (rate_supported(sta, sband->band, i))
-			return i;
+		info->control.rates[0].idx = i;
+		break;
 	}
-
-	/* No matching rate found */
-	return 0;
-}
-
-static void __rate_control_send_low(struct ieee80211_hw *hw,
-				    struct ieee80211_supported_band *sband,
-				    struct ieee80211_sta *sta,
-				    struct ieee80211_tx_info *info)
-{
-	if ((sband->band != IEEE80211_BAND_2GHZ) ||
-	    !(info->flags & IEEE80211_TX_CTL_NO_CCK_RATE))
-		info->control.rates[0].idx = rate_lowest_index(sband, sta);
-	else
-		info->control.rates[0].idx =
-			rate_lowest_non_cck_index(sband, sta);
+	WARN_ON_ONCE(i == sband->n_bitrates);
 
 	info->control.rates[0].count =
 		(info->flags & IEEE80211_TX_CTL_NO_ACK) ?
@@ -585,6 +576,7 @@ static void rate_control_apply_mask(struct ieee80211_sub_if_data *sdata,
 	u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN];
 	bool has_mcs_mask;
 	u32 mask;
+	u32 rate_flags;
 	int i;
 
 	/*
@@ -594,6 +586,12 @@ static void rate_control_apply_mask(struct ieee80211_sub_if_data *sdata,
 	 */
 	mask = sdata->rc_rateidx_mask[info->band];
 	has_mcs_mask = sdata->rc_has_mcs_mask[info->band];
+	rate_flags =
+		ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chandef);
+	for (i = 0; i < sband->n_bitrates; i++)
+		if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
+			mask &= ~BIT(i);
+
 	if (mask == (1 << sband->n_bitrates) - 1 && !has_mcs_mask)
 		return;
 
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index deeeca1299bd..0ac75127b7d2 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -236,8 +236,13 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
 		 */
 		*pos = 0;
 	} else {
+		int shift = 0;
 		rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_RATE);
-		*pos = rate->bitrate / 5;
+		if (status->flag & RX_FLAG_10MHZ)
+			shift = 1;
+		else if (status->flag & RX_FLAG_5MHZ)
+			shift = 2;
+		*pos = DIV_ROUND_UP(rate->bitrate, 5 * (1 << shift));
 	}
 	pos++;
 
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 1b122a79b0d8..819d0956eb3b 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -204,10 +204,29 @@ void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb)
 		ieee80211_rx_bss_put(local, bss);
 }
 
+static void
+ieee80211_prepare_scan_chandef(struct cfg80211_chan_def *chandef,
+			       enum nl80211_bss_scan_width scan_width)
+{
+	memset(chandef, 0, sizeof(*chandef));
+	switch (scan_width) {
+	case NL80211_BSS_CHAN_WIDTH_5:
+		chandef->width = NL80211_CHAN_WIDTH_5;
+		break;
+	case NL80211_BSS_CHAN_WIDTH_10:
+		chandef->width = NL80211_CHAN_WIDTH_10;
+		break;
+	default:
+		chandef->width = NL80211_CHAN_WIDTH_20_NOHT;
+		break;
+	}
+}
+
 /* return false if no more work */
 static bool ieee80211_prep_hw_scan(struct ieee80211_local *local)
 {
 	struct cfg80211_scan_request *req = local->scan_req;
+	struct cfg80211_chan_def chandef;
 	enum ieee80211_band band;
 	int i, ielen, n_chans;
 
@@ -229,11 +248,12 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local)
 	} while (!n_chans);
 
 	local->hw_scan_req->n_channels = n_chans;
+	ieee80211_prepare_scan_chandef(&chandef, req->scan_width);
 
 	ielen = ieee80211_build_preq_ies(local, (u8 *)local->hw_scan_req->ie,
 					 local->hw_scan_ies_bufsize,
 					 req->ie, req->ie_len, band,
-					 req->rates[band], 0);
+					 req->rates[band], &chandef);
 	local->hw_scan_req->ie_len = ielen;
 	local->hw_scan_req->no_cck = req->no_cck;
 
@@ -912,6 +932,7 @@ int ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
 {
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_sched_scan_ies sched_scan_ies = {};
+	struct cfg80211_chan_def chandef;
 	int ret, i, iebufsz;
 
 	iebufsz = 2 + IEEE80211_MAX_SSID_LEN +
@@ -939,10 +960,12 @@ int ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
 			goto out_free;
 		}
 
+		ieee80211_prepare_scan_chandef(&chandef, req->scan_width);
+
 		sched_scan_ies.len[i] =
 			ieee80211_build_preq_ies(local, sched_scan_ies.ie[i],
 						 iebufsz, req->ie, req->ie_len,
-						 i, (u32) -1, 0);
+						 i, (u32) -1, &chandef);
 	}
 
 	ret = drv_sched_scan_start(local, sdata, req, &sched_scan_ies);
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 43439203f4e4..6ad4c14385ef 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -252,9 +252,10 @@ static int ieee80211_tx_radiotap_len(struct ieee80211_tx_info *info)
 	return len;
 }
 
-static void ieee80211_add_tx_radiotap_header(struct ieee80211_supported_band
-					     *sband, struct sk_buff *skb,
-					     int retry_count, int rtap_len)
+static void
+ieee80211_add_tx_radiotap_header(struct ieee80211_supported_band *sband,
+				 struct sk_buff *skb, int retry_count,
+				 int rtap_len, int shift)
 {
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
@@ -280,8 +281,11 @@ static void ieee80211_add_tx_radiotap_header(struct ieee80211_supported_band
 	/* IEEE80211_RADIOTAP_RATE */
 	if (info->status.rates[0].idx >= 0 &&
 	    !(info->status.rates[0].flags & IEEE80211_TX_RC_MCS)) {
+		u16 rate;
+
 		rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_RATE);
-		*pos = sband->bitrates[info->status.rates[0].idx].bitrate / 5;
+		rate = sband->bitrates[info->status.rates[0].idx].bitrate;
+		*pos = DIV_ROUND_UP(rate, 5 * (1 << shift));
 		/* padding for tx flags */
 		pos += 2;
 	}
@@ -424,6 +428,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
 	bool acked;
 	struct ieee80211_bar *bar;
 	int rtap_len;
+	int shift = 0;
 
 	for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) {
 		if ((info->flags & IEEE80211_TX_CTL_AMPDU) &&
@@ -458,6 +463,8 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
 		if (!ether_addr_equal(hdr->addr2, sta->sdata->vif.addr))
 			continue;
 
+		shift = ieee80211_vif_get_shift(&sta->sdata->vif);
+
 		if (info->flags & IEEE80211_TX_STATUS_EOSP)
 			clear_sta_flag(sta, WLAN_STA_SP);
 
@@ -624,7 +631,8 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
 		dev_kfree_skb(skb);
 		return;
 	}
-	ieee80211_add_tx_radiotap_header(sband, skb, retry_count, rtap_len);
+	ieee80211_add_tx_radiotap_header(sband, skb, retry_count, rtap_len,
+					 shift);
 
 	/* XXX: is this sufficient for BPF? */
 	skb_set_mac_header(skb, 0);
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 3523daa0b15c..f82301b6cef2 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -40,12 +40,22 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx,
 				 struct sk_buff *skb, int group_addr,
 				 int next_frag_len)
 {
-	int rate, mrate, erp, dur, i, shift;
+	int rate, mrate, erp, dur, i, shift = 0;
 	struct ieee80211_rate *txrate;
 	struct ieee80211_local *local = tx->local;
 	struct ieee80211_supported_band *sband;
 	struct ieee80211_hdr *hdr;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_chanctx_conf *chanctx_conf;
+	u32 rate_flags = 0;
+
+	rcu_read_lock();
+	chanctx_conf = rcu_dereference(tx->sdata->vif.chanctx_conf);
+	if (chanctx_conf) {
+		shift = ieee80211_chandef_get_shift(&chanctx_conf->def);
+		rate_flags = ieee80211_chandef_rate_flags(&chanctx_conf->def);
+	}
+	rcu_read_unlock();
 
 	/* assume HW handles this */
 	if (tx->rate.flags & IEEE80211_TX_RC_MCS)
@@ -122,8 +132,11 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx,
 		if (r->bitrate > txrate->bitrate)
 			break;
 
+		if ((rate_flags & r->flags) != rate_flags)
+			continue;
+
 		if (tx->sdata->vif.bss_conf.basic_rates & BIT(i))
-			rate = r->bitrate;
+			rate = DIV_ROUND_UP(r->bitrate, 1 << shift);
 
 		switch (sband->band) {
 		case IEEE80211_BAND_2GHZ: {
@@ -150,11 +163,9 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx,
 	if (rate == -1) {
 		/* No matching basic rate found; use highest suitable mandatory
 		 * PHY rate */
-		rate = mrate;
+		rate = DIV_ROUND_UP(mrate, 1 << shift);
 	}
 
-	shift = ieee80211_vif_get_shift(&tx->sdata->vif);
-
 	/* Don't calculate ACKs for QoS Frames with NoAck Policy set */
 	if (ieee80211_is_data_qos(hdr->frame_control) &&
 	    *(ieee80211_get_qos_ctl(hdr)) & IEEE80211_QOS_CTL_ACK_POLICY_NOACK)
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 61856e17a1e4..1e45891ca219 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -141,14 +141,18 @@ int ieee80211_frame_duration(enum ieee80211_band band, size_t len,
 		dur = 16; /* SIFS + signal ext */
 		dur += 16; /* IEEE 802.11-2012 18.3.2.4: T_PREAMBLE = 16 usec */
 		dur += 4; /* IEEE 802.11-2012 18.3.2.4: T_SIGNAL = 4 usec */
-		dur += 4 * DIV_ROUND_UP((16 + 8 * (len + 4) + 6) * 10,
-					4 * rate); /* T_SYM x N_SYM */
 
 		/* IEEE 802.11-2012 18.3.2.4: all values above are:
 		 *  * times 4 for 5 MHz
 		 *  * times 2 for 10 MHz
 		 */
 		dur *= 1 << shift;
+
+		/* rates should already consider the channel bandwidth,
+		 * don't apply divisor again.
+		 */
+		dur += 4 * DIV_ROUND_UP((16 + 8 * (len + 4) + 6) * 10,
+					4 * rate); /* T_SYM x N_SYM */
 	} else {
 		/*
 		 * 802.11b or 802.11g with 802.11b compatibility:
@@ -205,7 +209,7 @@ __le16 ieee80211_rts_duration(struct ieee80211_hw *hw,
 	struct ieee80211_rate *rate;
 	struct ieee80211_sub_if_data *sdata;
 	bool short_preamble;
-	int erp, shift = 0;
+	int erp, shift = 0, bitrate;
 	u16 dur;
 	struct ieee80211_supported_band *sband;
 
@@ -224,14 +228,16 @@ __le16 ieee80211_rts_duration(struct ieee80211_hw *hw,
 		shift = ieee80211_vif_get_shift(vif);
 	}
 
+	bitrate = DIV_ROUND_UP(rate->bitrate, 1 << shift);
+
 	/* CTS duration */
-	dur = ieee80211_frame_duration(sband->band, 10, rate->bitrate,
+	dur = ieee80211_frame_duration(sband->band, 10, bitrate,
 				       erp, short_preamble, shift);
 	/* Data frame duration */
-	dur += ieee80211_frame_duration(sband->band, frame_len, rate->bitrate,
+	dur += ieee80211_frame_duration(sband->band, frame_len, bitrate,
 					erp, short_preamble, shift);
 	/* ACK duration */
-	dur += ieee80211_frame_duration(sband->band, 10, rate->bitrate,
+	dur += ieee80211_frame_duration(sband->band, 10, bitrate,
 					erp, short_preamble, shift);
 
 	return cpu_to_le16(dur);
@@ -247,7 +253,7 @@ __le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw,
 	struct ieee80211_rate *rate;
 	struct ieee80211_sub_if_data *sdata;
 	bool short_preamble;
-	int erp, shift = 0;
+	int erp, shift = 0, bitrate;
 	u16 dur;
 	struct ieee80211_supported_band *sband;
 
@@ -265,12 +271,14 @@ __le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw,
 		shift = ieee80211_vif_get_shift(vif);
 	}
 
+	bitrate = DIV_ROUND_UP(rate->bitrate, 1 << shift);
+
 	/* Data frame duration */
-	dur = ieee80211_frame_duration(sband->band, frame_len, rate->bitrate,
+	dur = ieee80211_frame_duration(sband->band, frame_len, bitrate,
 				       erp, short_preamble, shift);
 	if (!(frame_txctl->flags & IEEE80211_TX_CTL_NO_ACK)) {
 		/* ACK duration */
-		dur += ieee80211_frame_duration(sband->band, 10, rate->bitrate,
+		dur += ieee80211_frame_duration(sband->band, 10, bitrate,
 						erp, short_preamble, shift);
 	}
 
@@ -1175,7 +1183,7 @@ void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata,
 int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer,
 			     size_t buffer_len, const u8 *ie, size_t ie_len,
 			     enum ieee80211_band band, u32 rate_mask,
-			     u8 channel)
+			     struct cfg80211_chan_def *chandef)
 {
 	struct ieee80211_supported_band *sband;
 	u8 *pos = buffer, *end = buffer + buffer_len;
@@ -1184,16 +1192,26 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer,
 	u8 rates[32];
 	int num_rates;
 	int ext_rates_len;
+	int shift;
+	u32 rate_flags;
 
 	sband = local->hw.wiphy->bands[band];
 	if (WARN_ON_ONCE(!sband))
 		return 0;
 
+	rate_flags = ieee80211_chandef_rate_flags(chandef);
+	shift = ieee80211_chandef_get_shift(chandef);
+
 	num_rates = 0;
 	for (i = 0; i < sband->n_bitrates; i++) {
 		if ((BIT(i) & rate_mask) == 0)
 			continue; /* skip rate */
-		rates[num_rates++] = (u8) (sband->bitrates[i].bitrate / 5);
+		if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
+			continue;
+
+		rates[num_rates++] =
+			(u8) DIV_ROUND_UP(sband->bitrates[i].bitrate,
+					  (1 << shift) * 5);
 	}
 
 	supp_rates_len = min_t(int, num_rates, 8);
@@ -1233,12 +1251,13 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer,
 		pos += ext_rates_len;
 	}
 
-	if (channel && sband->band == IEEE80211_BAND_2GHZ) {
+	if (chandef->chan && sband->band == IEEE80211_BAND_2GHZ) {
 		if (end - pos < 3)
 			goto out_err;
 		*pos++ = WLAN_EID_DS_PARAMS;
 		*pos++ = 1;
-		*pos++ = channel;
+		*pos++ = ieee80211_frequency_to_channel(
+				chandef->chan->center_freq);
 	}
 
 	/* insert custom IEs that go before HT */
@@ -1303,9 +1322,9 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata,
 					  bool directed)
 {
 	struct ieee80211_local *local = sdata->local;
+	struct cfg80211_chan_def chandef;
 	struct sk_buff *skb;
 	struct ieee80211_mgmt *mgmt;
-	u8 chan_no;
 	int ies_len;
 
 	/*
@@ -1313,10 +1332,11 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata,
 	 * in order to maximize the chance that we get a response.  Some
 	 * badly-behaved APs don't respond when this parameter is included.
 	 */
+	chandef.width = sdata->vif.bss_conf.chandef.width;
 	if (directed)
-		chan_no = 0;
+		chandef.chan = NULL;
 	else
-		chan_no = ieee80211_frequency_to_channel(chan->center_freq);
+		chandef.chan = chan;
 
 	skb = ieee80211_probereq_get(&local->hw, &sdata->vif,
 				     ssid, ssid_len, 100 + ie_len);
@@ -1326,7 +1346,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata,
 	ies_len = ieee80211_build_preq_ies(local, skb_tail_pointer(skb),
 					   skb_tailroom(skb),
 					   ie, ie_len, chan->band,
-					   ratemask, chan_no);
+					   ratemask, &chandef);
 	skb_put(skb, ies_len);
 
 	if (dst) {
@@ -1360,16 +1380,19 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
 	}
 }
 
-u32 ieee80211_sta_get_rates(struct ieee80211_local *local,
+u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata,
 			    struct ieee802_11_elems *elems,
 			    enum ieee80211_band band, u32 *basic_rates)
 {
 	struct ieee80211_supported_band *sband;
 	struct ieee80211_rate *bitrates;
 	size_t num_rates;
-	u32 supp_rates;
-	int i, j;
-	sband = local->hw.wiphy->bands[band];
+	u32 supp_rates, rate_flags;
+	int i, j, shift;
+	sband = sdata->local->hw.wiphy->bands[band];
+
+	rate_flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chandef);
+	shift = ieee80211_vif_get_shift(&sdata->vif);
 
 	if (WARN_ON(!sband))
 		return 1;
@@ -1394,7 +1417,15 @@ u32 ieee80211_sta_get_rates(struct ieee80211_local *local,
 			continue;
 
 		for (j = 0; j < num_rates; j++) {
-			if (bitrates[j].bitrate == own_rate) {
+			int brate;
+			if ((rate_flags & sband->bitrates[j].flags)
+			    != rate_flags)
+				continue;
+
+			brate = DIV_ROUND_UP(sband->bitrates[j].bitrate,
+					     1 << shift);
+
+			if (brate == own_rate) {
 				supp_rates |= BIT(j);
 				if (basic_rates && is_basic)
 					*basic_rates |= BIT(j);
@@ -2017,18 +2048,56 @@ void ieee80211_ht_oper_to_chandef(struct ieee80211_channel *control_chan,
 	cfg80211_chandef_create(chandef, control_chan, channel_type);
 }
 
+int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef,
+			     const struct ieee80211_supported_band *sband,
+			     const u8 *srates, int srates_len, u32 *rates)
+{
+	u32 rate_flags = ieee80211_chandef_rate_flags(chandef);
+	int shift = ieee80211_chandef_get_shift(chandef);
+	struct ieee80211_rate *br;
+	int brate, rate, i, j, count = 0;
+
+	*rates = 0;
+
+	for (i = 0; i < srates_len; i++) {
+		rate = srates[i] & 0x7f;
+
+		for (j = 0; j < sband->n_bitrates; j++) {
+			br = &sband->bitrates[j];
+			if ((rate_flags & br->flags) != rate_flags)
+				continue;
+
+			brate = DIV_ROUND_UP(br->bitrate, (1 << shift) * 5);
+			if (brate == rate) {
+				*rates |= BIT(j);
+				count++;
+				break;
+			}
+		}
+	}
+	return count;
+}
+
 int ieee80211_add_srates_ie(struct ieee80211_sub_if_data *sdata,
 			    struct sk_buff *skb, bool need_basic,
 			    enum ieee80211_band band)
 {
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_supported_band *sband;
-	int rate;
+	int rate, shift;
 	u8 i, rates, *pos;
 	u32 basic_rates = sdata->vif.bss_conf.basic_rates;
+	u32 rate_flags;
 
+	shift = ieee80211_vif_get_shift(&sdata->vif);
+	rate_flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chandef);
 	sband = local->hw.wiphy->bands[band];
-	rates = sband->n_bitrates;
+	rates = 0;
+	for (i = 0; i < sband->n_bitrates; i++) {
+		if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
+			continue;
+		rates++;
+	}
 	if (rates > 8)
 		rates = 8;
 
@@ -2040,10 +2109,15 @@ int ieee80211_add_srates_ie(struct ieee80211_sub_if_data *sdata,
 	*pos++ = rates;
 	for (i = 0; i < rates; i++) {
 		u8 basic = 0;
+		if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
+			continue;
+
 		if (need_basic && basic_rates & BIT(i))
 			basic = 0x80;
 		rate = sband->bitrates[i].bitrate;
-		*pos++ = basic | (u8) (rate / 5);
+		rate = DIV_ROUND_UP(sband->bitrates[i].bitrate,
+				    5 * (1 << shift));
+		*pos++ = basic | (u8) rate;
 	}
 
 	return 0;
@@ -2055,12 +2129,22 @@ int ieee80211_add_ext_srates_ie(struct ieee80211_sub_if_data *sdata,
 {
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_supported_band *sband;
-	int rate;
+	int rate, skip, shift;
 	u8 i, exrates, *pos;
 	u32 basic_rates = sdata->vif.bss_conf.basic_rates;
+	u32 rate_flags;
+
+	rate_flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chandef);
+	shift = ieee80211_vif_get_shift(&sdata->vif);
 
 	sband = local->hw.wiphy->bands[band];
-	exrates = sband->n_bitrates;
+	exrates = 0;
+	for (i = 0; i < sband->n_bitrates; i++) {
+		if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
+			continue;
+		exrates++;
+	}
+
 	if (exrates > 8)
 		exrates -= 8;
 	else
@@ -2073,12 +2157,19 @@ int ieee80211_add_ext_srates_ie(struct ieee80211_sub_if_data *sdata,
 		pos = skb_put(skb, exrates + 2);
 		*pos++ = WLAN_EID_EXT_SUPP_RATES;
 		*pos++ = exrates;
+		skip = 0;
 		for (i = 8; i < sband->n_bitrates; i++) {
 			u8 basic = 0;
+			if ((rate_flags & sband->bitrates[i].flags)
+			    != rate_flags)
+				continue;
+			if (skip++ < 8)
+				continue;
 			if (need_basic && basic_rates & BIT(i))
 				basic = 0x80;
-			rate = sband->bitrates[i].bitrate;
-			*pos++ = basic | (u8) (rate / 5);
+			rate = DIV_ROUND_UP(sband->bitrates[i].bitrate,
+					    5 * (1 << shift));
+			*pos++ = basic | (u8) rate;
 		}
 	}
 	return 0;
@@ -2162,9 +2253,17 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
 			ri.flags |= RATE_INFO_FLAGS_SHORT_GI;
 	} else {
 		struct ieee80211_supported_band *sband;
+		int shift = 0;
+		int bitrate;
+
+		if (status->flag & RX_FLAG_10MHZ)
+			shift = 1;
+		if (status->flag & RX_FLAG_5MHZ)
+			shift = 2;
 
 		sband = local->hw.wiphy->bands[status->band];
-		ri.legacy = sband->bitrates[status->rate_idx].bitrate;
+		bitrate = sband->bitrates[status->rate_idx].bitrate;
+		ri.legacy = DIV_ROUND_UP(bitrate, (1 << shift));
 	}
 
 	rate = cfg80211_calculate_bitrate(&ri);
-- 
cgit v1.2.3


From 74608aca4d82e2855b1a6a2cd60331d1a98f2d6a Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <simon.wunderlich@s2003.tu-chemnitz.de>
Date: Mon, 8 Jul 2013 16:55:54 +0200
Subject: cfg80211/mac80211: get mandatory rates based on scan width

Mandatory rates for 5 and 10 MHz are different from the rates used for
20 MHz in 2.4 GHz mode, as they use OFDM only.

Signed-off-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: Mathias Kretschmer <mathias.kretschmer@fokus.fraunhofer.de>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
---
 include/net/cfg80211.h |  4 +++-
 net/mac80211/ibss.c    | 19 +++++++++++++++----
 net/wireless/mesh.c    |  5 ++++-
 net/wireless/util.c    | 14 ++++++++++----
 4 files changed, 32 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index bae8614a450f..88f15631701a 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -3128,11 +3128,13 @@ ieee80211_get_response_rate(struct ieee80211_supported_band *sband,
 /**
  * ieee80211_mandatory_rates - get mandatory rates for a given band
  * @sband: the band to look for rates in
+ * @scan_width: width of the control channel
  *
  * This function returns a bitmap of the mandatory rates for the given
  * band, bits are set according to the rate position in the bitrates array.
  */
-u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband);
+u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband,
+			      enum nl80211_bss_scan_width scan_width);
 
 /*
  * Radiotap parsing functions -- for controlled injection support
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index 272a3b37615c..299603e4939f 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -371,6 +371,7 @@ ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid,
 	struct sta_info *sta;
 	struct ieee80211_chanctx_conf *chanctx_conf;
 	struct ieee80211_supported_band *sband;
+	enum nl80211_bss_scan_width scan_width;
 	int band;
 
 	/*
@@ -399,6 +400,7 @@ ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid,
 	if (WARN_ON_ONCE(!chanctx_conf))
 		return NULL;
 	band = chanctx_conf->def.chan->band;
+	scan_width = cfg80211_chandef_to_scan_width(&chanctx_conf->def);
 	rcu_read_unlock();
 
 	sta = sta_info_alloc(sdata, addr, GFP_KERNEL);
@@ -412,7 +414,7 @@ ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid,
 	/* make sure mandatory rates are always added */
 	sband = local->hw.wiphy->bands[band];
 	sta->sta.supp_rates[band] = supp_rates |
-			ieee80211_mandatory_rates(sband);
+			ieee80211_mandatory_rates(sband, scan_width);
 
 	return ieee80211_ibss_finish_sta(sta);
 }
@@ -476,6 +478,7 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 	u64 beacon_timestamp, rx_timestamp;
 	u32 supp_rates = 0;
 	enum ieee80211_band band = rx_status->band;
+	enum nl80211_bss_scan_width scan_width;
 	struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band];
 	bool rates_updated = false;
 
@@ -504,9 +507,15 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 
 				prev_rates = sta->sta.supp_rates[band];
 				/* make sure mandatory rates are always added */
-				sta->sta.supp_rates[band] = supp_rates |
-					ieee80211_mandatory_rates(sband);
+				scan_width = NL80211_BSS_CHAN_WIDTH_20;
+				if (rx_status->flag & RX_FLAG_5MHZ)
+					scan_width = NL80211_BSS_CHAN_WIDTH_5;
+				if (rx_status->flag & RX_FLAG_10MHZ)
+					scan_width = NL80211_BSS_CHAN_WIDTH_10;
 
+				sta->sta.supp_rates[band] = supp_rates |
+					ieee80211_mandatory_rates(sband,
+								  scan_width);
 				if (sta->sta.supp_rates[band] != prev_rates) {
 					ibss_dbg(sdata,
 						 "updated supp_rates set for %pM based on beacon/probe_resp (0x%x -> 0x%x)\n",
@@ -640,6 +649,7 @@ void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata,
 	struct sta_info *sta;
 	struct ieee80211_chanctx_conf *chanctx_conf;
 	struct ieee80211_supported_band *sband;
+	enum nl80211_bss_scan_width scan_width;
 	int band;
 
 	/*
@@ -665,6 +675,7 @@ void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata,
 		return;
 	}
 	band = chanctx_conf->def.chan->band;
+	scan_width = cfg80211_chandef_to_scan_width(&chanctx_conf->def);
 	rcu_read_unlock();
 
 	sta = sta_info_alloc(sdata, addr, GFP_ATOMIC);
@@ -676,7 +687,7 @@ void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata,
 	/* make sure mandatory rates are always added */
 	sband = local->hw.wiphy->bands[band];
 	sta->sta.supp_rates[band] = supp_rates |
-			ieee80211_mandatory_rates(sband);
+			ieee80211_mandatory_rates(sband, scan_width);
 
 	spin_lock(&ifibss->incomplete_lock);
 	list_add(&sta->list, &ifibss->incomplete_stations);
diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c
index 30c49202ee4d..0553fd4d85ae 100644
--- a/net/wireless/mesh.c
+++ b/net/wireless/mesh.c
@@ -167,9 +167,12 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev,
 	 * basic rates
 	 */
 	if (!setup->basic_rates) {
+		enum nl80211_bss_scan_width scan_width;
 		struct ieee80211_supported_band *sband =
 				rdev->wiphy.bands[setup->chandef.chan->band];
-		setup->basic_rates = ieee80211_mandatory_rates(sband);
+		scan_width = cfg80211_chandef_to_scan_width(&setup->chandef);
+		setup->basic_rates = ieee80211_mandatory_rates(sband,
+							       scan_width);
 	}
 
 	if (!cfg80211_reg_can_beacon(&rdev->wiphy, &setup->chandef))
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 74458b7f61eb..ce090c1c5e4f 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -33,7 +33,8 @@ ieee80211_get_response_rate(struct ieee80211_supported_band *sband,
 }
 EXPORT_SYMBOL(ieee80211_get_response_rate);
 
-u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband)
+u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband,
+			      enum nl80211_bss_scan_width scan_width)
 {
 	struct ieee80211_rate *bitrates;
 	u32 mandatory_rates = 0;
@@ -43,10 +44,15 @@ u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband)
 	if (WARN_ON(!sband))
 		return 1;
 
-	if (sband->band == IEEE80211_BAND_2GHZ)
-		mandatory_flag = IEEE80211_RATE_MANDATORY_B;
-	else
+	if (sband->band == IEEE80211_BAND_2GHZ) {
+		if (scan_width == NL80211_BSS_CHAN_WIDTH_5 ||
+		    scan_width == NL80211_BSS_CHAN_WIDTH_10)
+			mandatory_flag = IEEE80211_RATE_MANDATORY_G;
+		else
+			mandatory_flag = IEEE80211_RATE_MANDATORY_B;
+	} else {
 		mandatory_flag = IEEE80211_RATE_MANDATORY_A;
+	}
 
 	bitrates = sband->bitrates;
 	for (i = 0; i < sband->n_bitrates; i++)
-- 
cgit v1.2.3


From 0430c883470d0c9a23661ea9f02c56b1d91cf93c Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <simon.wunderlich@s2003.tu-chemnitz.de>
Date: Mon, 8 Jul 2013 16:55:55 +0200
Subject: cfg80211/mac80211: use reduced txpower for 5 and 10 MHz

Some regulations (like germany, but also FCC) express their transmission
power limit in dBm/MHz or mW/MHz. To cope with that and be on the safe
side, reduce the maximum power to half (10 MHz) or quarter (5 MHz)
when operating on these reduced bandwidth channels.

Signed-off-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: Mathias Kretschmer <mathias.kretschmer@fokus.fraunhofer.de>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
---
 include/net/cfg80211.h | 27 +++++++++++++++++++++++++++
 net/mac80211/iface.c   |  2 +-
 net/mac80211/main.c    |  2 +-
 net/mac80211/mlme.c    |  3 ++-
 4 files changed, 31 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 88f15631701a..aeaf6dff6e05 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -460,6 +460,33 @@ ieee80211_chandef_rate_flags(struct cfg80211_chan_def *chandef)
 	return 0;
 }
 
+/**
+ * ieee80211_chandef_max_power - maximum transmission power for the chandef
+ *
+ * In some regulations, the transmit power may depend on the configured channel
+ * bandwidth which may be defined as dBm/MHz. This function returns the actual
+ * max_power for non-standard (20 MHz) channels.
+ *
+ * @chandef: channel definition for the channel
+ *
+ * Returns: maximum allowed transmission power in dBm for the chandef
+ */
+static inline int
+ieee80211_chandef_max_power(struct cfg80211_chan_def *chandef)
+{
+	switch (chandef->width) {
+	case NL80211_CHAN_WIDTH_5:
+		return min(chandef->chan->max_reg_power - 6,
+			   chandef->chan->max_power);
+	case NL80211_CHAN_WIDTH_10:
+		return min(chandef->chan->max_reg_power - 3,
+			   chandef->chan->max_power);
+	default:
+		break;
+	}
+	return chandef->chan->max_power;
+}
+
 /**
  * enum survey_info_flags - survey information flags
  *
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index cc117591f678..4c41c11958c8 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -54,7 +54,7 @@ bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata)
 		return false;
 	}
 
-	power = chanctx_conf->def.chan->max_power;
+	power = ieee80211_chandef_max_power(&chanctx_conf->def);
 	rcu_read_unlock();
 
 	if (sdata->user_power_level != IEEE80211_UNSET_POWER_LEVEL)
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 091088ac7890..6da98c77496b 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -151,7 +151,7 @@ static u32 ieee80211_hw_conf_chan(struct ieee80211_local *local)
 		changed |= IEEE80211_CONF_CHANGE_SMPS;
 	}
 
-	power = chandef.chan->max_power;
+	power = ieee80211_chandef_max_power(&chandef);
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index f7552c2b74eb..211246b46819 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -747,7 +747,8 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
 		*pos++ = WLAN_EID_PWR_CAPABILITY;
 		*pos++ = 2;
 		*pos++ = 0; /* min tx power */
-		*pos++ = chan->max_power; /* max tx power */
+		 /* max tx power */
+		*pos++ = ieee80211_chandef_max_power(&chanctx_conf->def);
 
 		/* 2. supported channels */
 		/* TODO: get this in reg domain format */
-- 
cgit v1.2.3


From 7ca15a0ae865067aac8d36e27e0acbe4a8f1e70a Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <simon.wunderlich@s2003.tu-chemnitz.de>
Date: Mon, 8 Jul 2013 16:55:56 +0200
Subject: mac80211: allow scanning for 5/10 MHz channels in IBSS

Use a chandef instead of just the channel for scanning, and enable
5/10 Mhz scanning for IBSS mode. Also reporting is changed to the new
inform_bss functions.

Signed-off-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: Mathias Kretschmer <mathias.kretschmer@fokus.fraunhofer.de>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
---
 net/mac80211/ibss.c        | 16 ++++++++++++----
 net/mac80211/ieee80211_i.h |  5 +++--
 net/mac80211/main.c        | 13 ++-----------
 net/mac80211/scan.c        | 45 +++++++++++++++++++++++++++++++++++++++------
 4 files changed, 56 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index 299603e4939f..8fa1c27bdfe3 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -51,6 +51,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	u32 bss_change, rate_flags, rates = 0, rates_added = 0;
 	u8 supp_rates[IEEE80211_MAX_SUPP_RATES];
 	struct cfg80211_chan_def chandef;
+	enum nl80211_bss_scan_width scan_width;
 	struct beacon_data *presp;
 	int frame_len;
 	int shift;
@@ -271,8 +272,10 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	mod_timer(&ifibss->timer,
 		  round_jiffies(jiffies + IEEE80211_IBSS_MERGE_INTERVAL));
 
-	bss = cfg80211_inform_bss_frame(local->hw.wiphy, chan,
-					mgmt, presp->head_len, 0, GFP_KERNEL);
+	scan_width = cfg80211_chandef_to_scan_width(&chandef);
+	bss = cfg80211_inform_bss_width_frame(local->hw.wiphy, chan,
+					      scan_width, mgmt,
+					      presp->head_len, 0, GFP_KERNEL);
 	cfg80211_put_bss(local->hw.wiphy, bss);
 	netif_carrier_on(sdata->dev);
 	cfg80211_ibss_joined(sdata->dev, ifibss->bssid, GFP_KERNEL);
@@ -726,6 +729,7 @@ static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata)
 static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata)
 {
 	struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+	enum nl80211_bss_scan_width scan_width;
 
 	sdata_assert_lock(sdata);
 
@@ -747,8 +751,9 @@ static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata)
 	sdata_info(sdata,
 		   "No active IBSS STAs - trying to scan for other IBSS networks with same SSID (merge)\n");
 
+	scan_width = cfg80211_chandef_to_scan_width(&ifibss->chandef);
 	ieee80211_request_ibss_scan(sdata, ifibss->ssid, ifibss->ssid_len,
-				    NULL);
+				    NULL, scan_width);
 }
 
 static void ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata)
@@ -798,6 +803,7 @@ static void ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata)
 	struct cfg80211_bss *cbss;
 	struct ieee80211_channel *chan = NULL;
 	const u8 *bssid = NULL;
+	enum nl80211_bss_scan_width scan_width;
 	int active_ibss;
 	u16 capability;
 
@@ -846,8 +852,10 @@ static void ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata)
 					IEEE80211_SCAN_INTERVAL)) {
 		sdata_info(sdata, "Trigger new scan to find an IBSS to join\n");
 
+		scan_width = cfg80211_chandef_to_scan_width(&ifibss->chandef);
 		ieee80211_request_ibss_scan(sdata, ifibss->ssid,
-					    ifibss->ssid_len, chan);
+					    ifibss->ssid_len, chan,
+					    scan_width);
 	} else {
 		int interval = IEEE80211_SCAN_INTERVAL;
 
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 6596da66bfaf..23a191b03bbf 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1057,7 +1057,7 @@ struct ieee80211_local {
 	struct cfg80211_ssid scan_ssid;
 	struct cfg80211_scan_request *int_scan_req;
 	struct cfg80211_scan_request *scan_req, *hw_scan_req;
-	struct ieee80211_channel *scan_channel;
+	struct cfg80211_chan_def scan_chandef;
 	enum ieee80211_band hw_scan_band;
 	int scan_channel_idx;
 	int scan_ies_len;
@@ -1337,7 +1337,8 @@ void ieee80211_mesh_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
 void ieee80211_scan_work(struct work_struct *work);
 int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata,
 				const u8 *ssid, u8 ssid_len,
-				struct ieee80211_channel *chan);
+				struct ieee80211_channel *chan,
+				enum nl80211_bss_scan_width scan_width);
 int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata,
 			   struct cfg80211_scan_request *req);
 void ieee80211_scan_cancel(struct ieee80211_local *local);
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 6da98c77496b..25eb35b01938 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -102,17 +102,8 @@ static u32 ieee80211_hw_conf_chan(struct ieee80211_local *local)
 
 	offchannel_flag = local->hw.conf.flags & IEEE80211_CONF_OFFCHANNEL;
 
-	if (local->scan_channel) {
-		chandef.chan = local->scan_channel;
-		/* If scanning on oper channel, use whatever channel-type
-		 * is currently in use.
-		 */
-		if (chandef.chan == local->_oper_chandef.chan) {
-			chandef = local->_oper_chandef;
-		} else {
-			chandef.width = NL80211_CHAN_WIDTH_20_NOHT;
-			chandef.center_freq1 = chandef.chan->center_freq;
-		}
+	if (local->scan_chandef.chan) {
+		chandef = local->scan_chandef;
 	} else if (local->tmp_channel) {
 		chandef.chan = local->tmp_channel;
 		chandef.width = NL80211_CHAN_WIDTH_20_NOHT;
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 819d0956eb3b..08afe74b98f4 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -66,6 +66,7 @@ ieee80211_bss_info_update(struct ieee80211_local *local,
 	struct cfg80211_bss *cbss;
 	struct ieee80211_bss *bss;
 	int clen, srlen;
+	enum nl80211_bss_scan_width scan_width;
 	s32 signal = 0;
 
 	if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
@@ -73,8 +74,15 @@ ieee80211_bss_info_update(struct ieee80211_local *local,
 	else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)
 		signal = (rx_status->signal * 100) / local->hw.max_signal;
 
-	cbss = cfg80211_inform_bss_frame(local->hw.wiphy, channel,
-					 mgmt, len, signal, GFP_ATOMIC);
+	scan_width = NL80211_BSS_CHAN_WIDTH_20;
+	if (rx_status->flag & RX_FLAG_5MHZ)
+		scan_width = NL80211_BSS_CHAN_WIDTH_5;
+	if (rx_status->flag & RX_FLAG_10MHZ)
+		scan_width = NL80211_BSS_CHAN_WIDTH_10;
+
+	cbss = cfg80211_inform_bss_width_frame(local->hw.wiphy, channel,
+					       scan_width, mgmt, len, signal,
+					       GFP_ATOMIC);
 	if (!cbss)
 		return NULL;
 
@@ -300,7 +308,7 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted,
 	rcu_assign_pointer(local->scan_sdata, NULL);
 
 	local->scanning = 0;
-	local->scan_channel = NULL;
+	local->scan_chandef.chan = NULL;
 
 	/* Set power back to normal operating levels. */
 	ieee80211_hw_config(local, 0);
@@ -635,11 +643,34 @@ static void ieee80211_scan_state_set_channel(struct ieee80211_local *local,
 {
 	int skip;
 	struct ieee80211_channel *chan;
+	enum nl80211_bss_scan_width oper_scan_width;
 
 	skip = 0;
 	chan = local->scan_req->channels[local->scan_channel_idx];
 
-	local->scan_channel = chan;
+	local->scan_chandef.chan = chan;
+	local->scan_chandef.center_freq1 = chan->center_freq;
+	local->scan_chandef.center_freq2 = 0;
+	switch (local->scan_req->scan_width) {
+	case NL80211_BSS_CHAN_WIDTH_5:
+		local->scan_chandef.width = NL80211_CHAN_WIDTH_5;
+		break;
+	case NL80211_BSS_CHAN_WIDTH_10:
+		local->scan_chandef.width = NL80211_CHAN_WIDTH_10;
+		break;
+	case NL80211_BSS_CHAN_WIDTH_20:
+		/* If scanning on oper channel, use whatever channel-type
+		 * is currently in use.
+		 */
+		oper_scan_width = cfg80211_chandef_to_scan_width(
+					&local->_oper_chandef);
+		if (chan == local->_oper_chandef.chan &&
+		    oper_scan_width == local->scan_req->scan_width)
+			local->scan_chandef = local->_oper_chandef;
+		else
+			local->scan_chandef.width = NL80211_CHAN_WIDTH_20_NOHT;
+		break;
+	}
 
 	if (ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL))
 		skip = 1;
@@ -679,7 +710,7 @@ static void ieee80211_scan_state_suspend(struct ieee80211_local *local,
 					 unsigned long *next_delay)
 {
 	/* switch back to the operating channel */
-	local->scan_channel = NULL;
+	local->scan_chandef.chan = NULL;
 	ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL);
 
 	/* disable PS */
@@ -821,7 +852,8 @@ int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata,
 
 int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata,
 				const u8 *ssid, u8 ssid_len,
-				struct ieee80211_channel *chan)
+				struct ieee80211_channel *chan,
+				enum nl80211_bss_scan_width scan_width)
 {
 	struct ieee80211_local *local = sdata->local;
 	int ret = -EBUSY;
@@ -871,6 +903,7 @@ int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata,
 
 	local->int_scan_req->ssids = &local->scan_ssid;
 	local->int_scan_req->n_ssids = 1;
+	local->int_scan_req->scan_width = scan_width;
 	memcpy(local->int_scan_req->ssids[0].ssid, ssid, IEEE80211_MAX_SSID_LEN);
 	local->int_scan_req->ssids[0].ssid_len = ssid_len;
 
-- 
cgit v1.2.3


From 4b42aab1e8fcac2e9b835a7d93bf30dc9a032faa Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <simon.wunderlich@s2003.tu-chemnitz.de>
Date: Mon, 8 Jul 2013 16:55:57 +0200
Subject: mac80211: return if IBSS chandef can not be used

This was originally designed to fail when a 40+/40- mode can not be
used, but basic modes (such as 5/10/20 MHz) must be handled
with an error.

Signed-off-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
---
 net/mac80211/ibss.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index 8fa1c27bdfe3..449702df24ab 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -85,6 +85,14 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 
 	chandef = ifibss->chandef;
 	if (!cfg80211_reg_can_beacon(local->hw.wiphy, &chandef)) {
+		if (chandef.width == NL80211_CHAN_WIDTH_5 ||
+		    chandef.width == NL80211_CHAN_WIDTH_10 ||
+		    chandef.width == NL80211_CHAN_WIDTH_20_NOHT ||
+		    chandef.width == NL80211_CHAN_WIDTH_20) {
+			sdata_info(sdata,
+				   "Failed to join IBSS, beacons forbidden\n");
+			return;
+		}
 		chandef.width = NL80211_CHAN_WIDTH_20;
 		chandef.center_freq1 = chan->center_freq;
 	}
-- 
cgit v1.2.3


From bf3726457276c8773ec97da30c6459caf512b22f Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <simon.wunderlich@s2003.tu-chemnitz.de>
Date: Mon, 8 Jul 2013 16:55:58 +0200
Subject: nl80211: allow 5 and 10 MHz channels for IBSS

Whether the wiphy supports it or not is already checked, so what is left
is to enable these channel types.

Signed-off-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: Mathias Kretschmer <mathias.kretschmer@fokus.fraunhofer.de>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
---
 net/wireless/nl80211.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index ef4c312cc92c..03d4ef95292e 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -6355,6 +6355,8 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
 		return -EINVAL;
 
 	switch (ibss.chandef.width) {
+	case NL80211_CHAN_WIDTH_5:
+	case NL80211_CHAN_WIDTH_10:
 	case NL80211_CHAN_WIDTH_20_NOHT:
 		break;
 	case NL80211_CHAN_WIDTH_20:
-- 
cgit v1.2.3


From 2ec9c1f67ab1f58b5bf5ac19e4b61b9f75c83a04 Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <simon.wunderlich@s2003.tu-chemnitz.de>
Date: Thu, 11 Jul 2013 18:07:46 +0200
Subject: mac80211: fix regression when initializing ibss wmm params

There appear to be two regressions in ibss.c when calling
ieee80211_sta_def_wmm_params():
 * the second argument should be a rate length, not a rate array. This
   was introduced by my commit "mac80211: select and adjust bitrates
   according to channel mode"
 * the third argument is not initialized (anymore), making further
   checks within this function useless.

Since ieee80211_sta_def_wmm_params() is only used by ibss anyway,
remove the function entirely and handle the operating mode decision
immediately.

Signed-off-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/ibss.c        | 12 ++++++++++--
 net/mac80211/ieee80211_i.h |  3 ---
 net/mac80211/util.c        | 26 --------------------------
 3 files changed, 10 insertions(+), 31 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index 449702df24ab..83197c3ae5f1 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -49,9 +49,9 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_supported_band *sband;
 	struct cfg80211_bss *bss;
 	u32 bss_change, rate_flags, rates = 0, rates_added = 0;
-	u8 supp_rates[IEEE80211_MAX_SUPP_RATES];
 	struct cfg80211_chan_def chandef;
 	enum nl80211_bss_scan_width scan_width;
+	bool have_higher_than_11mbit = false;
 	struct beacon_data *presp;
 	int frame_len;
 	int shift;
@@ -149,6 +149,8 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	for (i = 0; i < sband->n_bitrates; i++) {
 		if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
 			continue;
+		if (sband->bitrates[i].bitrate > 110)
+			have_higher_than_11mbit = true;
 
 		rates |= BIT(i);
 		rates_n++;
@@ -270,11 +272,17 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	sdata->vif.bss_conf.use_short_slot = chan->band == IEEE80211_BAND_5GHZ;
 	bss_change |= BSS_CHANGED_ERP_SLOT;
 
+	/* cf. IEEE 802.11 9.2.12 */
+	if (chan->band == IEEE80211_BAND_2GHZ && have_higher_than_11mbit)
+		sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE;
+	else
+		sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE;
+
 	sdata->vif.bss_conf.ibss_joined = true;
 	sdata->vif.bss_conf.ibss_creator = creator;
 	ieee80211_bss_info_change_notify(sdata, bss_change);
 
-	ieee80211_sta_def_wmm_params(sdata, rates, supp_rates);
+	ieee80211_set_wmm_default(sdata, true);
 
 	ifibss->state = IEEE80211_IBSS_MLME_JOINED;
 	mod_timer(&ifibss->timer,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 23a191b03bbf..3d32df1fbc6d 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1615,9 +1615,6 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
 			      u32 ratemask, bool directed, u32 tx_flags,
 			      struct ieee80211_channel *channel, bool scan);
 
-void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata,
-				  const size_t supp_rates_len,
-				  const u8 *supp_rates);
 u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata,
 			    struct ieee802_11_elems *elems,
 			    enum ieee80211_band band, u32 *basic_rates);
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 1e45891ca219..d23c5a705a68 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -1073,32 +1073,6 @@ void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata,
 	}
 }
 
-void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata,
-				  const size_t supp_rates_len,
-				  const u8 *supp_rates)
-{
-	struct ieee80211_chanctx_conf *chanctx_conf;
-	int i, have_higher_than_11mbit = 0;
-
-	/* cf. IEEE 802.11 9.2.12 */
-	for (i = 0; i < supp_rates_len; i++)
-		if ((supp_rates[i] & 0x7f) * 5 > 110)
-			have_higher_than_11mbit = 1;
-
-	rcu_read_lock();
-	chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
-
-	if (chanctx_conf &&
-	    chanctx_conf->def.chan->band == IEEE80211_BAND_2GHZ &&
-	    have_higher_than_11mbit)
-		sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE;
-	else
-		sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE;
-	rcu_read_unlock();
-
-	ieee80211_set_wmm_default(sdata, true);
-}
-
 void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
 			 u16 transaction, u16 auth_alg, u16 status,
 			 const u8 *extra, size_t extra_len, const u8 *da,
-- 
cgit v1.2.3


From 93c75786e8d8e8e245a479209a84b19292d08e7e Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <simon.wunderlich@s2003.tu-chemnitz.de>
Date: Thu, 11 Jul 2013 20:29:49 +0200
Subject: mac80211: fix off-by-one regression in ibss beacon generation

There is an off-by-one error in the beacon generation for the ibss mode,
falsely a rate the extended supported rates which was already added to
supported rates, messing up the beacon. This was introduced by commit
"mac80211: select and adjust bitrates according to channel mode".

Signed-off-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/ibss.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index 83197c3ae5f1..5e6836c3aa4c 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -168,8 +168,10 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 		if (basic_rates & BIT(ri))
 			basic = 0x80;
 		*pos++ = basic | (u8) rate;
-		if (++rates_added == 8)
+		if (++rates_added == 8) {
+			ri++; /* continue at next rate for EXT_SUPP_RATES */
 			break;
+		}
 	}
 
 	if (sband->band == IEEE80211_BAND_2GHZ) {
-- 
cgit v1.2.3


From b60e527a729cbb9c88b49cc10738a13328cd1ed2 Mon Sep 17 00:00:00 2001
From: Chun-Yeow Yeoh <yeohchunyeow@gmail.com>
Date: Fri, 12 Jul 2013 18:55:22 +0800
Subject: mac80211: set forwarding in mesh capability info

Set the Forwarding bit in Mesh Capability Info according
to dot11MeshForwarding as defined in IEEE 802.11-2012
section 8.4.2.100.8.

Signed-off-by: Chun-Yeow Yeoh <yeohchunyeow@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/mesh.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index e536f22fbeca..885a5f6e2c21 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -273,7 +273,9 @@ int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata,
 	neighbors = min_t(int, neighbors, IEEE80211_MAX_MESH_PEERINGS);
 	*pos++ = neighbors << 1;
 	/* Mesh capability */
-	*pos = IEEE80211_MESHCONF_CAPAB_FORWARDING;
+	*pos = 0x00;
+	*pos |= ifmsh->mshcfg.dot11MeshForwarding ?
+			IEEE80211_MESHCONF_CAPAB_FORWARDING : 0x00;
 	*pos |= ifmsh->accepting_plinks ?
 			IEEE80211_MESHCONF_CAPAB_ACCEPT_PLINKS : 0x00;
 	/* Mesh PS mode. See IEEE802.11-2012 8.4.2.100.8 */
-- 
cgit v1.2.3


From c82b5a74cc739385db6e4275fe504a0e9469bf01 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Sun, 14 Jul 2013 23:39:20 +0300
Subject: mac80211: make active monitor injection work w/ HW queue

When a driver (like hwsim) uses HW queue control an
active monitor vif needs to be used for the queues,
make the code do that. Otherwise we'd bail out and
drop the frames.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/tx.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index f82301b6cef2..be4d3caf4879 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1272,6 +1272,10 @@ static bool __ieee80211_tx(struct ieee80211_local *local,
 
 	switch (sdata->vif.type) {
 	case NL80211_IFTYPE_MONITOR:
+		if (sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE) {
+			vif = &sdata->vif;
+			break;
+		}
 		sdata = rcu_dereference(local->monitor_sdata);
 		if (sdata) {
 			vif = &sdata->vif;
-- 
cgit v1.2.3


From aafee33423e78e86becda986b24a5819a7d16ff7 Mon Sep 17 00:00:00 2001
From: Dragos Foianu <dragos.foianu@gmail.com>
Date: Wed, 17 Jul 2013 12:25:38 +0100
Subject: net/irda: fixed style issues in irttp

Applied error fixes suggested by checpatch.pl

Signed-off-by: Dragos Foianu <dragos.foianu@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/irda/irttp.c | 50 +++++++++++++++++++++++---------------------------
 1 file changed, 23 insertions(+), 27 deletions(-)

(limited to 'net')

diff --git a/net/irda/irttp.c b/net/irda/irttp.c
index ae43c62f9045..85372cfa7b9f 100644
--- a/net/irda/irttp.c
+++ b/net/irda/irttp.c
@@ -75,7 +75,7 @@ static pi_minor_info_t pi_minor_call_table[] = {
 	{ NULL, 0 },                                             /* 0x00 */
 	{ irttp_param_max_sdu_size, PV_INTEGER | PV_BIG_ENDIAN } /* 0x01 */
 };
-static pi_major_info_t pi_major_call_table[] = {{ pi_minor_call_table, 2 }};
+static pi_major_info_t pi_major_call_table[] = { { pi_minor_call_table, 2 } };
 static pi_param_info_t param_info = { pi_major_call_table, 1, 0x0f, 4 };
 
 /************************ GLOBAL PROCEDURES ************************/
@@ -205,7 +205,7 @@ static void irttp_todo_expired(unsigned long data)
  */
 static void irttp_flush_queues(struct tsap_cb *self)
 {
-	struct sk_buff* skb;
+	struct sk_buff *skb;
 
 	IRDA_DEBUG(4, "%s()\n", __func__);
 
@@ -400,7 +400,7 @@ struct tsap_cb *irttp_open_tsap(__u8 stsap_sel, int credit, notify_t *notify)
 	/* The IrLMP spec (IrLMP 1.1 p10) says that we have the right to
 	 * use only 0x01-0x6F. Of course, we can use LSAP_ANY as well.
 	 * JeanII */
-	if((stsap_sel != LSAP_ANY) &&
+	if ((stsap_sel != LSAP_ANY) &&
 	   ((stsap_sel < 0x01) || (stsap_sel >= 0x70))) {
 		IRDA_DEBUG(0, "%s(), invalid tsap!\n", __func__);
 		return NULL;
@@ -427,7 +427,7 @@ struct tsap_cb *irttp_open_tsap(__u8 stsap_sel, int credit, notify_t *notify)
 	ttp_notify.data_indication = irttp_data_indication;
 	ttp_notify.udata_indication = irttp_udata_indication;
 	ttp_notify.flow_indication = irttp_flow_indication;
-	if(notify->status_indication != NULL)
+	if (notify->status_indication != NULL)
 		ttp_notify.status_indication = irttp_status_indication;
 	ttp_notify.instance = self;
 	strncpy(ttp_notify.name, notify->name, NOTIFY_MAX_NAME);
@@ -639,8 +639,7 @@ int irttp_data_request(struct tsap_cb *self, struct sk_buff *skb)
 	 */
 	if ((self->tx_max_sdu_size != 0) &&
 	    (self->tx_max_sdu_size != TTP_SAR_UNBOUND) &&
-	    (skb->len > self->tx_max_sdu_size))
-	{
+	    (skb->len > self->tx_max_sdu_size)) {
 		IRDA_ERROR("%s: SAR enabled, but data is larger than TxMaxSduSize!\n",
 			   __func__);
 		ret = -EMSGSIZE;
@@ -733,8 +732,7 @@ static void irttp_run_tx_queue(struct tsap_cb *self)
 	 * poll us through irttp_flow_indication() - Jean II */
 	while ((self->send_credit > 0) &&
 	       (!irlmp_lap_tx_queue_full(self->lsap)) &&
-	       (skb = skb_dequeue(&self->tx_queue)))
-	{
+	       (skb = skb_dequeue(&self->tx_queue))) {
 		/*
 		 *  Since we can transmit and receive frames concurrently,
 		 *  the code below is a critical region and we must assure that
@@ -798,8 +796,7 @@ static void irttp_run_tx_queue(struct tsap_cb *self)
 	 * where we can spend a bit of time doing stuff. - Jean II */
 	if ((self->tx_sdu_busy) &&
 	    (skb_queue_len(&self->tx_queue) < TTP_TX_LOW_THRESHOLD) &&
-	    (!self->close_pend))
-	{
+	    (!self->close_pend)) {
 		if (self->notify.flow_indication)
 			self->notify.flow_indication(self->notify.instance,
 						     self, FLOW_START);
@@ -892,7 +889,7 @@ static int irttp_udata_indication(void *instance, void *sap,
 	/* Just pass data to layer above */
 	if (self->notify.udata_indication) {
 		err = self->notify.udata_indication(self->notify.instance,
-						    self,skb);
+						    self, skb);
 		/* Same comment as in irttp_do_data_indication() */
 		if (!err)
 			return 0;
@@ -1057,7 +1054,7 @@ static void irttp_flow_indication(void *instance, void *sap, LOCAL_FLOW flow)
 	 * to do that. Jean II */
 
 	/* If we need to send disconnect. try to do it now */
-	if(self->disconnect_pend)
+	if (self->disconnect_pend)
 		irttp_start_todo_timer(self, 0);
 }
 
@@ -1116,7 +1113,7 @@ int irttp_connect_request(struct tsap_cb *self, __u8 dtsap_sel,
 	IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -EBADR;);
 
 	if (self->connected) {
-		if(userdata)
+		if (userdata)
 			dev_kfree_skb(userdata);
 		return -EISCONN;
 	}
@@ -1137,7 +1134,7 @@ int irttp_connect_request(struct tsap_cb *self, __u8 dtsap_sel,
 		 *  headers
 		 */
 		IRDA_ASSERT(skb_headroom(userdata) >= TTP_MAX_HEADER,
-			{ dev_kfree_skb(userdata); return -1; } );
+			{ dev_kfree_skb(userdata); return -1; });
 	}
 
 	/* Initialize connection parameters */
@@ -1157,7 +1154,7 @@ int irttp_connect_request(struct tsap_cb *self, __u8 dtsap_sel,
 	 *  Give away max 127 credits for now
 	 */
 	if (n > 127) {
-		self->avail_credit=n-127;
+		self->avail_credit = n - 127;
 		n = 127;
 	}
 
@@ -1166,10 +1163,10 @@ int irttp_connect_request(struct tsap_cb *self, __u8 dtsap_sel,
 	/* SAR enabled? */
 	if (max_sdu_size > 0) {
 		IRDA_ASSERT(skb_headroom(tx_skb) >= (TTP_MAX_HEADER + TTP_SAR_HEADER),
-			{ dev_kfree_skb(tx_skb); return -1; } );
+			{ dev_kfree_skb(tx_skb); return -1; });
 
 		/* Insert SAR parameters */
-		frame = skb_push(tx_skb, TTP_HEADER+TTP_SAR_HEADER);
+		frame = skb_push(tx_skb, TTP_HEADER + TTP_SAR_HEADER);
 
 		frame[0] = TTP_PARAMETERS | n;
 		frame[1] = 0x04; /* Length */
@@ -1386,7 +1383,7 @@ int irttp_connect_response(struct tsap_cb *self, __u32 max_sdu_size,
 		 *  headers
 		 */
 		IRDA_ASSERT(skb_headroom(userdata) >= TTP_MAX_HEADER,
-			{ dev_kfree_skb(userdata); return -1; } );
+			{ dev_kfree_skb(userdata); return -1; });
 	}
 
 	self->avail_credit = 0;
@@ -1409,10 +1406,10 @@ int irttp_connect_response(struct tsap_cb *self, __u32 max_sdu_size,
 	/* SAR enabled? */
 	if (max_sdu_size > 0) {
 		IRDA_ASSERT(skb_headroom(tx_skb) >= (TTP_MAX_HEADER + TTP_SAR_HEADER),
-			{ dev_kfree_skb(tx_skb); return -1; } );
+			{ dev_kfree_skb(tx_skb); return -1; });
 
 		/* Insert TTP header with SAR parameters */
-		frame = skb_push(tx_skb, TTP_HEADER+TTP_SAR_HEADER);
+		frame = skb_push(tx_skb, TTP_HEADER + TTP_SAR_HEADER);
 
 		frame[0] = TTP_PARAMETERS | n;
 		frame[1] = 0x04; /* Length */
@@ -1522,7 +1519,7 @@ int irttp_disconnect_request(struct tsap_cb *self, struct sk_buff *userdata,
 	 * function may be called from various context, like user, timer
 	 * for following a disconnect_indication() (i.e. net_bh).
 	 * Jean II */
-	if(test_and_set_bit(0, &self->disconnect_pend)) {
+	if (test_and_set_bit(0, &self->disconnect_pend)) {
 		IRDA_DEBUG(0, "%s(), disconnect already pending\n",
 			   __func__);
 		if (userdata)
@@ -1627,7 +1624,7 @@ static void irttp_disconnect_indication(void *instance, void *sap,
 	 * Jean II */
 
 	/* No need to notify the client if has already tried to disconnect */
-	if(self->notify.disconnect_indication)
+	if (self->notify.disconnect_indication)
 		self->notify.disconnect_indication(self->notify.instance, self,
 						   reason, skb);
 	else
@@ -1738,8 +1735,7 @@ static void irttp_run_rx_queue(struct tsap_cb *self)
 		 *  This is the last fragment, so time to reassemble!
 		 */
 		if ((self->rx_sdu_size <= self->rx_max_sdu_size) ||
-		    (self->rx_max_sdu_size == TTP_SAR_UNBOUND))
-		{
+		    (self->rx_max_sdu_size == TTP_SAR_UNBOUND)) {
 			/*
 			 * A little optimizing. Only queue the fragment if
 			 * there are other fragments. Since if this is the
@@ -1860,7 +1856,7 @@ static int irttp_seq_show(struct seq_file *seq, void *v)
 	seq_printf(seq, "dtsap_sel: %02x\n",
 		   self->dtsap_sel);
 	seq_printf(seq, "  connected: %s, ",
-		   self->connected? "TRUE":"FALSE");
+		   self->connected ? "TRUE" : "FALSE");
 	seq_printf(seq, "avail credit: %d, ",
 		   self->avail_credit);
 	seq_printf(seq, "remote credit: %d, ",
@@ -1876,9 +1872,9 @@ static int irttp_seq_show(struct seq_file *seq, void *v)
 	seq_printf(seq, "rx_queue len: %u\n",
 		   skb_queue_len(&self->rx_queue));
 	seq_printf(seq, "  tx_sdu_busy: %s, ",
-		   self->tx_sdu_busy? "TRUE":"FALSE");
+		   self->tx_sdu_busy ? "TRUE" : "FALSE");
 	seq_printf(seq, "rx_sdu_busy: %s\n",
-		   self->rx_sdu_busy? "TRUE":"FALSE");
+		   self->rx_sdu_busy ? "TRUE" : "FALSE");
 	seq_printf(seq, "  max_seg_size: %u, ",
 		   self->max_seg_size);
 	seq_printf(seq, "tx_max_sdu_size: %u, ",
-- 
cgit v1.2.3


From cb820f8e4b7f73d1a32175e6591735b25bb5398d Mon Sep 17 00:00:00 2001
From: Richard Cochran <richardcochran@gmail.com>
Date: Fri, 19 Jul 2013 19:40:09 +0200
Subject: net: Provide a generic socket error queue delivery method for Tx time
 stamps.

This patch moves the private error queue delivery function from the
af_packet code to the core socket method. In this way, network layers
only needing the error queue for transmit time stamping can share common
code.

Signed-off-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h     |  2 ++
 net/core/sock.c        | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 net/packet/af_packet.c | 48 ++----------------------------------------------
 3 files changed, 51 insertions(+), 46 deletions(-)

(limited to 'net')

diff --git a/include/net/sock.h b/include/net/sock.h
index 95a5a2c6925a..e0473f61693f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2249,6 +2249,8 @@ static inline struct sock *skb_steal_sock(struct sk_buff *skb)
 extern void sock_enable_timestamp(struct sock *sk, int flag);
 extern int sock_get_timestamp(struct sock *, struct timeval __user *);
 extern int sock_get_timestampns(struct sock *, struct timespec __user *);
+extern int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
+			      int level, int type);
 
 /*
  *	Enable debug/info messages
diff --git a/net/core/sock.c b/net/core/sock.c
index 548d716c5f62..85e8de1bc7fd 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -93,6 +93,7 @@
 
 #include <linux/capability.h>
 #include <linux/errno.h>
+#include <linux/errqueue.h>
 #include <linux/types.h>
 #include <linux/socket.h>
 #include <linux/in.h>
@@ -2425,6 +2426,52 @@ void sock_enable_timestamp(struct sock *sk, int flag)
 	}
 }
 
+int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
+		       int level, int type)
+{
+	struct sock_exterr_skb *serr;
+	struct sk_buff *skb, *skb2;
+	int copied, err;
+
+	err = -EAGAIN;
+	skb = skb_dequeue(&sk->sk_error_queue);
+	if (skb == NULL)
+		goto out;
+
+	copied = skb->len;
+	if (copied > len) {
+		msg->msg_flags |= MSG_TRUNC;
+		copied = len;
+	}
+	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	if (err)
+		goto out_free_skb;
+
+	sock_recv_timestamp(msg, sk, skb);
+
+	serr = SKB_EXT_ERR(skb);
+	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
+
+	msg->msg_flags |= MSG_ERRQUEUE;
+	err = copied;
+
+	/* Reset and regenerate socket error */
+	spin_lock_bh(&sk->sk_error_queue.lock);
+	sk->sk_err = 0;
+	if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
+		sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
+		spin_unlock_bh(&sk->sk_error_queue.lock);
+		sk->sk_error_report(sk);
+	} else
+		spin_unlock_bh(&sk->sk_error_queue.lock);
+
+out_free_skb:
+	kfree_skb(skb);
+out:
+	return err;
+}
+EXPORT_SYMBOL(sock_recv_errqueue);
+
 /*
  *	Get a socket option on an socket.
  *
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 4b66c752eae5..4cb28a7f639b 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2638,51 +2638,6 @@ out:
 	return err;
 }
 
-static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
-{
-	struct sock_exterr_skb *serr;
-	struct sk_buff *skb, *skb2;
-	int copied, err;
-
-	err = -EAGAIN;
-	skb = skb_dequeue(&sk->sk_error_queue);
-	if (skb == NULL)
-		goto out;
-
-	copied = skb->len;
-	if (copied > len) {
-		msg->msg_flags |= MSG_TRUNC;
-		copied = len;
-	}
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
-	if (err)
-		goto out_free_skb;
-
-	sock_recv_timestamp(msg, sk, skb);
-
-	serr = SKB_EXT_ERR(skb);
-	put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
-		 sizeof(serr->ee), &serr->ee);
-
-	msg->msg_flags |= MSG_ERRQUEUE;
-	err = copied;
-
-	/* Reset and regenerate socket error */
-	spin_lock_bh(&sk->sk_error_queue.lock);
-	sk->sk_err = 0;
-	if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
-		sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
-		spin_unlock_bh(&sk->sk_error_queue.lock);
-		sk->sk_error_report(sk);
-	} else
-		spin_unlock_bh(&sk->sk_error_queue.lock);
-
-out_free_skb:
-	kfree_skb(skb);
-out:
-	return err;
-}
-
 /*
  *	Pull a packet from our receive queue and hand it to the user.
  *	If necessary we block.
@@ -2708,7 +2663,8 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 #endif
 
 	if (flags & MSG_ERRQUEUE) {
-		err = packet_recv_error(sk, msg, len);
+		err = sock_recv_errqueue(sk, msg, len,
+					 SOL_PACKET, PACKET_TX_TIMESTAMP);
 		goto out;
 	}
 
-- 
cgit v1.2.3


From 375fe02c91792917aa26d68a87ab110d1937f44e Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Mon, 22 Jul 2013 16:20:45 -0700
Subject: tcp: consolidate SYNACK RTT sampling

The first patch consolidates SYNACK and other RTT measurement to use a
central function tcp_ack_update_rtt(). A (small) bonus is now SYNACK
RTT measurement happens after PAWS check, potentially reducing the
impact of RTO seeding on bad TCP timestamps values.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h        |  9 ---------
 net/ipv4/tcp_input.c     | 14 ++++++++++++--
 net/ipv4/tcp_ipv4.c      |  2 --
 net/ipv4/tcp_minisocks.c |  8 ++------
 net/ipv6/tcp_ipv6.c      |  2 --
 5 files changed, 14 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index d1980054ec75..f9777dbede75 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1094,15 +1094,6 @@ static inline void tcp_openreq_init(struct request_sock *req,
 	ireq->loc_port = tcp_hdr(skb)->dest;
 }
 
-/* Compute time elapsed between SYNACK and the ACK completing 3WHS */
-static inline void tcp_synack_rtt_meas(struct sock *sk,
-				       struct request_sock *req)
-{
-	if (tcp_rsk(req)->snt_synack)
-		tcp_valid_rtt_meas(sk,
-		    tcp_time_stamp - tcp_rsk(req)->snt_synack);
-}
-
 extern void tcp_enter_memory_pressure(struct sock *sk);
 
 static inline int keepalive_intvl_when(const struct tcp_sock *tp)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 28af45abe062..b531710596ec 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2853,6 +2853,17 @@ static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
 		tcp_ack_no_tstamp(sk, seq_rtt, flag);
 }
 
+/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
+static void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	s32 seq_rtt = -1;
+
+	if (tp->lsndtime && !tp->total_retrans)
+		seq_rtt = tcp_time_stamp - tp->lsndtime;
+	tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt);
+}
+
 static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -5624,9 +5635,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		 * so release it.
 		 */
 		if (req) {
-			tcp_synack_rtt_meas(sk, req);
 			tp->total_retrans = req->num_retrans;
-
 			reqsk_fastopen_remove(sk, req, false);
 		} else {
 			/* Make sure socket is routed, for correct metrics. */
@@ -5651,6 +5660,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
 		tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
 		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
+		tcp_synack_rtt_meas(sk, req);
 
 		if (tp->rx_opt.tstamp_ok)
 			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b299da5ff499..2e3f129df0eb 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1671,8 +1671,6 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
 
 	tcp_initialize_rcv_mss(newsk);
-	tcp_synack_rtt_meas(newsk, req);
-	newtp->total_retrans = req->num_retrans;
 
 #ifdef CONFIG_TCP_MD5SIG
 	/* Copy over the MD5 key from the original socket */
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index ab1c08658528..58a3e69aef64 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -411,6 +411,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
 		tcp_enable_early_retrans(newtp);
 		newtp->tlp_high_seq = 0;
+		newtp->lsndtime = treq->snt_synack;
+		newtp->total_retrans = req->num_retrans;
 
 		/* So many TCP implementations out there (incorrectly) count the
 		 * initial SYN frame in their delayed-ACK and congestion control
@@ -666,12 +668,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 	if (!(flg & TCP_FLAG_ACK))
 		return NULL;
 
-	/* Got ACK for our SYNACK, so update baseline for SYNACK RTT sample. */
-	if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
-		tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
-	else if (req->num_retrans) /* don't take RTT sample if retrans && ~TS */
-		tcp_rsk(req)->snt_synack = 0;
-
 	/* For Fast Open no more processing is needed (sk is the
 	 * child socket).
 	 */
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 6e1649d58533..80fe69ef2188 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1237,8 +1237,6 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
 
 	tcp_initialize_rcv_mss(newsk);
-	tcp_synack_rtt_meas(newsk, req);
-	newtp->total_retrans = req->num_retrans;
 
 	newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6;
 	newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
-- 
cgit v1.2.3


From 5b08e47caf1f2034a3a5b566bbccc8b0be3961ca Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Mon, 22 Jul 2013 16:20:46 -0700
Subject: tcp: prefer packet timing to TS-ECR for RTT

Prefer packet timings to TS-ecr for RTT measurements when both
sources are available. That's because broken middle-boxes and remote
peer can return packets with corrupted TS ECR fields. Similarly most
congestion controls that require RTT signals favor timing-based
sources as well. Also check for bad TS ECR values to avoid RTT
blow-ups. It has happened on production Web servers.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h    |  1 -
 net/ipv4/tcp_input.c | 67 ++++++++++++++--------------------------------------
 2 files changed, 18 insertions(+), 50 deletions(-)

(limited to 'net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index f9777dbede75..c5868471abae 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -591,7 +591,6 @@ extern void tcp_initialize_rcv_mss(struct sock *sk);
 extern int tcp_mtu_to_mss(struct sock *sk, int pmtu);
 extern int tcp_mss_to_mtu(struct sock *sk, int mss);
 extern void tcp_mtup_init(struct sock *sk);
-extern void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt);
 extern void tcp_init_buffer_space(struct sock *sk);
 
 static inline void tcp_bound_rto(const struct sock *sk)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b531710596ec..c7398f05d12b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2792,65 +2792,36 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 	tcp_xmit_retransmit_queue(sk);
 }
 
-void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt)
+static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
+				      s32 seq_rtt)
 {
-	tcp_rtt_estimator(sk, seq_rtt);
-	tcp_set_rto(sk);
-	inet_csk(sk)->icsk_backoff = 0;
-}
-EXPORT_SYMBOL(tcp_valid_rtt_meas);
+	const struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Prefer RTT measured from ACK's timing to TS-ECR. This is because
+	 * broken middle-boxes or peers may corrupt TS-ECR fields. But
+	 * Karn's algorithm forbids taking RTT if some retransmitted data
+	 * is acked (RFC6298).
+	 */
+	if (flag & FLAG_RETRANS_DATA_ACKED)
+		seq_rtt = -1;
 
-/* Read draft-ietf-tcplw-high-performance before mucking
- * with this code. (Supersedes RFC1323)
- */
-static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
-{
 	/* RTTM Rule: A TSecr value received in a segment is used to
 	 * update the averaged RTT measurement only if the segment
 	 * acknowledges some new data, i.e., only if it advances the
 	 * left edge of the send window.
-	 *
 	 * See draft-ietf-tcplw-high-performance-00, section 3.3.
-	 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
-	 *
-	 * Changed: reset backoff as soon as we see the first valid sample.
-	 * If we do not, we get strongly overestimated rto. With timestamps
-	 * samples are accepted even from very old segments: f.e., when rtt=1
-	 * increases to 8, we retransmit 5 times and after 8 seconds delayed
-	 * answer arrives rto becomes 120 seconds! If at least one of segments
-	 * in window is lost... Voila.	 			--ANK (010210)
 	 */
-	struct tcp_sock *tp = tcp_sk(sk);
+	if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
+		seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
 
-	tcp_valid_rtt_meas(sk, tcp_time_stamp - tp->rx_opt.rcv_tsecr);
-}
-
-static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
-{
-	/* We don't have a timestamp. Can only use
-	 * packets that are not retransmitted to determine
-	 * rtt estimates. Also, we must not reset the
-	 * backoff for rto until we get a non-retransmitted
-	 * packet. This allows us to deal with a situation
-	 * where the network delay has increased suddenly.
-	 * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
-	 */
-
-	if (flag & FLAG_RETRANS_DATA_ACKED)
+	if (seq_rtt < 0)
 		return;
 
-	tcp_valid_rtt_meas(sk, seq_rtt);
-}
+	tcp_rtt_estimator(sk, seq_rtt);
+	tcp_set_rto(sk);
 
-static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
-				      const s32 seq_rtt)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-	/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
-	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
-		tcp_ack_saw_tstamp(sk, flag);
-	else if (seq_rtt >= 0)
-		tcp_ack_no_tstamp(sk, seq_rtt, flag);
+	/* RFC6298: only reset backoff on valid RTT measurement. */
+	inet_csk(sk)->icsk_backoff = 0;
 }
 
 /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
@@ -2989,8 +2960,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			if (sacked & TCPCB_SACKED_RETRANS)
 				tp->retrans_out -= acked_pcount;
 			flag |= FLAG_RETRANS_DATA_ACKED;
-			ca_seq_rtt = -1;
-			seq_rtt = -1;
 		} else {
 			ca_seq_rtt = now - scb->when;
 			last_ackt = skb->tstamp;
-- 
cgit v1.2.3


From 59c9af4234b0c21a1ed05cf65bf014d0c1a67bfd Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Mon, 22 Jul 2013 16:20:47 -0700
Subject: tcp: measure RTT from new SACK

Take RTT sample if an ACK selectively acks some sequences that
have never been retransmitted. The Karn's algorithm does not apply
even if that ACK (s)acks other retransmitted sequences, because it
must been generated by an original but perhaps out-of-order packet.
There is no ambiguity. In case when multiple blocks are newly
sacked because of ACK losses the earliest block is used to
measure RTT, similar to cummulative ACKs.

Such RTT samples allow the sender to estimate the RTO during loss
recovery and packet reordering events. It is still useful even with
TCP timestamps. That's because during these events the SND.UNA may
not advance preventing RTT samples from TS ECR (thus the FLAG_ACKED
check before calling tcp_ack_update_rtt()).  Therefore this new
RTT source is complementary to existing ACK and TS RTT mechanisms.

This patch does not update the RTO. It is done in the next patch.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c7398f05d12b..b85bc7c3736a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1048,6 +1048,7 @@ struct tcp_sacktag_state {
 	int reord;
 	int fack_count;
 	int flag;
+	s32 rtt; /* RTT measured by SACKing never-retransmitted data */
 };
 
 /* Check if skb is fully within the SACK block. In presence of GSO skbs,
@@ -1108,7 +1109,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
 static u8 tcp_sacktag_one(struct sock *sk,
 			  struct tcp_sacktag_state *state, u8 sacked,
 			  u32 start_seq, u32 end_seq,
-			  bool dup_sack, int pcount)
+			  int dup_sack, int pcount, u32 xmit_time)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int fack_count = state->fack_count;
@@ -1148,6 +1149,9 @@ static u8 tcp_sacktag_one(struct sock *sk,
 							   state->reord);
 				if (!after(end_seq, tp->high_seq))
 					state->flag |= FLAG_ORIG_SACK_ACKED;
+				/* Pick the earliest sequence sacked for RTT */
+				if (state->rtt < 0)
+					state->rtt = tcp_time_stamp - xmit_time;
 			}
 
 			if (sacked & TCPCB_LOST) {
@@ -1205,7 +1209,8 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 	 * tcp_highest_sack_seq() when skb is highest_sack.
 	 */
 	tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
-			start_seq, end_seq, dup_sack, pcount);
+			start_seq, end_seq, dup_sack, pcount,
+			TCP_SKB_CB(skb)->when);
 
 	if (skb == tp->lost_skb_hint)
 		tp->lost_cnt_hint += pcount;
@@ -1479,7 +1484,8 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
 						TCP_SKB_CB(skb)->seq,
 						TCP_SKB_CB(skb)->end_seq,
 						dup_sack,
-						tcp_skb_pcount(skb));
+						tcp_skb_pcount(skb),
+						TCP_SKB_CB(skb)->when);
 
 			if (!before(TCP_SKB_CB(skb)->seq,
 				    tcp_highest_sack_seq(tp)))
@@ -1536,7 +1542,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl
 
 static int
 tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
-			u32 prior_snd_una)
+			u32 prior_snd_una, s32 *sack_rtt)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	const unsigned char *ptr = (skb_transport_header(ack_skb) +
@@ -1554,6 +1560,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
 
 	state.flag = 0;
 	state.reord = tp->packets_out;
+	state.rtt = -1;
 
 	if (!tp->sacked_out) {
 		if (WARN_ON(tp->fackets_out))
@@ -1737,6 +1744,7 @@ out:
 	WARN_ON((int)tp->retrans_out < 0);
 	WARN_ON((int)tcp_packets_in_flight(tp) < 0);
 #endif
+	*sack_rtt = state.rtt;
 	return state.flag;
 }
 
@@ -3254,6 +3262,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	int prior_packets = tp->packets_out;
 	const int prior_unsacked = tp->packets_out - tp->sacked_out;
 	int acked = 0; /* Number of packets newly acked */
+	s32 sack_rtt = -1;
 
 	/* If the ack is older than previous acks
 	 * then we can probably ignore it.
@@ -3310,7 +3319,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
 
 		if (TCP_SKB_CB(skb)->sacked)
-			flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
+			flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
+							&sack_rtt);
 
 		if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
 			flag |= FLAG_ECE;
@@ -3382,7 +3392,8 @@ old_ack:
 	 * If data was DSACKed, see if we can undo a cwnd reduction.
 	 */
 	if (TCP_SKB_CB(skb)->sacked) {
-		flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
+		flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
+						&sack_rtt);
 		tcp_fastretrans_alert(sk, acked, prior_unsacked,
 				      is_dupack, flag);
 	}
-- 
cgit v1.2.3


From ed08495c31bb991de636d2488abaa50b39f2ff4a Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Mon, 22 Jul 2013 16:20:48 -0700
Subject: tcp: use RTT from SACK for RTO

If RTT is not available because Karn's check has failed or no
new packet is acked, use the RTT measured from SACK to estimate
the RTO. The sender can continue to estimate the RTO during loss
recovery or reordering event upon receiving non-partial ACKs.

This also changes when the RTO is re-armed. Previously it is
only re-armed when some data is cummulatively acknowledged (i.e.,
SND.UNA advances), but now it is re-armed whenever RTT estimator
is updated. This feature is particularly useful to reduce spurious
timeout for buffer bloat including cellular carriers [1], and
RTT estimation on reordering events.

[1] "An In-depth Study of LTE: Effect of Network Protocol and
 Application Behavior on Performance", In Proc. of SIGCOMM 2013

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b85bc7c3736a..b61274b666f6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2800,8 +2800,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 	tcp_xmit_retransmit_queue(sk);
 }
 
-static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
-				      s32 seq_rtt)
+static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
+				      s32 seq_rtt, s32 sack_rtt)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 
@@ -2813,6 +2813,9 @@ static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
 	if (flag & FLAG_RETRANS_DATA_ACKED)
 		seq_rtt = -1;
 
+	if (seq_rtt < 0)
+		seq_rtt = sack_rtt;
+
 	/* RTTM Rule: A TSecr value received in a segment is used to
 	 * update the averaged RTT measurement only if the segment
 	 * acknowledges some new data, i.e., only if it advances the
@@ -2823,13 +2826,14 @@ static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
 		seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
 
 	if (seq_rtt < 0)
-		return;
+		return false;
 
 	tcp_rtt_estimator(sk, seq_rtt);
 	tcp_set_rto(sk);
 
 	/* RFC6298: only reset backoff on valid RTT measurement. */
 	inet_csk(sk)->icsk_backoff = 0;
+	return true;
 }
 
 /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
@@ -2840,7 +2844,7 @@ static void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
 
 	if (tp->lsndtime && !tp->total_retrans)
 		seq_rtt = tcp_time_stamp - tp->lsndtime;
-	tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt);
+	tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1);
 }
 
 static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
@@ -2929,7 +2933,7 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
  * arrived at the other end.
  */
 static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
-			       u32 prior_snd_una)
+			       u32 prior_snd_una, s32 sack_rtt)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -3019,6 +3023,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 	if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
 		flag |= FLAG_SACK_RENEGING;
 
+	if (tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt) ||
+	    (flag & FLAG_ACKED))
+		tcp_rearm_rto(sk);
+
 	if (flag & FLAG_ACKED) {
 		const struct tcp_congestion_ops *ca_ops
 			= inet_csk(sk)->icsk_ca_ops;
@@ -3028,9 +3036,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			tcp_mtup_probe_success(sk);
 		}
 
-		tcp_ack_update_rtt(sk, flag, seq_rtt);
-		tcp_rearm_rto(sk);
-
 		if (tcp_is_reno(tp)) {
 			tcp_remove_reno_sacks(sk, pkts_acked);
 		} else {
@@ -3339,7 +3344,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	/* See if we can take anything off of the retransmit queue. */
 	acked = tp->packets_out;
-	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
+	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt);
 	acked -= tp->packets_out;
 
 	if (tcp_ack_is_dubious(sk, flag)) {
-- 
cgit v1.2.3


From 4aa5dee4d9997879adff858514844efab5a15a01 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Sat, 20 Jul 2013 12:13:53 +0200
Subject: net: convert resend IGMP to notifier event

Until now, bond_resend_igmp_join_requests() looks for vlans attached to
bonding device, bridge where bonding act as port manually. It does not
care of other scenarios, like stacked bonds or team device above. Make
this more generic and use netdev notifier to propagate the event to
upper devices and to actually call ip_mc_rejoin_groups().

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Acked-by: Veaceslav Falico <vfalico@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c | 44 ++++++++-------------------------------
 drivers/net/team/team.c         |  4 ++++
 include/linux/igmp.h            |  1 -
 include/linux/netdevice.h       |  1 +
 net/8021q/vlan.c                |  1 +
 net/bridge/br_notify.c          |  5 +++++
 net/ipv4/igmp.c                 | 46 +++++++++++++++++++++++++++++++++++------
 7 files changed, 60 insertions(+), 42 deletions(-)

(limited to 'net')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 07f257d44a1e..ae9864c9fa38 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -715,15 +715,6 @@ static int bond_set_allmulti(struct bonding *bond, int inc)
 	return err;
 }
 
-static void __bond_resend_igmp_join_requests(struct net_device *dev)
-{
-	struct in_device *in_dev;
-
-	in_dev = __in_dev_get_rcu(dev);
-	if (in_dev)
-		ip_mc_rejoin_groups(in_dev);
-}
-
 /*
  * Retrieve the list of registered multicast addresses for the bonding
  * device and retransmit an IGMP JOIN request to the current active
@@ -731,33 +722,12 @@ static void __bond_resend_igmp_join_requests(struct net_device *dev)
  */
 static void bond_resend_igmp_join_requests(struct bonding *bond)
 {
-	struct net_device *bond_dev, *vlan_dev, *upper_dev;
-	struct vlan_entry *vlan;
-
-	read_lock(&bond->lock);
-	rcu_read_lock();
-
-	bond_dev = bond->dev;
-
-	/* rejoin all groups on bond device */
-	__bond_resend_igmp_join_requests(bond_dev);
-
-	/*
-	 * if bond is enslaved to a bridge,
-	 * then rejoin all groups on its master
-	 */
-	upper_dev = netdev_master_upper_dev_get_rcu(bond_dev);
-	if (upper_dev && upper_dev->priv_flags & IFF_EBRIDGE)
-		__bond_resend_igmp_join_requests(upper_dev);
-
-	/* rejoin all groups on vlan devices */
-	list_for_each_entry(vlan, &bond->vlan_list, vlan_list) {
-		vlan_dev = __vlan_find_dev_deep(bond_dev, htons(ETH_P_8021Q),
-						vlan->vlan_id);
-		if (vlan_dev)
-			__bond_resend_igmp_join_requests(vlan_dev);
+	if (!rtnl_trylock()) {
+		queue_delayed_work(bond->wq, &bond->mcast_work, 0);
+		return;
 	}
-	rcu_read_unlock();
+	call_netdevice_notifiers(NETDEV_RESEND_IGMP, bond->dev);
+	rtnl_unlock();
 
 	/* We use curr_slave_lock to protect against concurrent access to
 	 * igmp_retrans from multiple running instances of this function and
@@ -3234,6 +3204,10 @@ static int bond_slave_netdev_event(unsigned long event,
 	case NETDEV_FEAT_CHANGE:
 		bond_compute_features(bond);
 		break;
+	case NETDEV_RESEND_IGMP:
+		/* Propagate to master device */
+		call_netdevice_notifiers(event, slave->bond->dev);
+		break;
 	default:
 		break;
 	}
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 0433ee994f8c..2587dc86d533 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -2785,6 +2785,10 @@ static int team_device_event(struct notifier_block *unused,
 	case NETDEV_PRE_TYPE_CHANGE:
 		/* Forbid to change type of underlaying device */
 		return NOTIFY_BAD;
+	case NETDEV_RESEND_IGMP:
+		/* Propagate to master device */
+		call_netdevice_notifiers(event, port->team->dev);
+		break;
 	}
 	return NOTIFY_DONE;
 }
diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index e3362b5f13e8..f47550d75f85 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -129,6 +129,5 @@ extern void ip_mc_unmap(struct in_device *);
 extern void ip_mc_remap(struct in_device *);
 extern void ip_mc_dec_group(struct in_device *in_dev, __be32 addr);
 extern void ip_mc_inc_group(struct in_device *in_dev, __be32 addr);
-extern void ip_mc_rejoin_groups(struct in_device *in_dev);
 
 #endif
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0741a1e919a5..2bb2357d83bb 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1633,6 +1633,7 @@ struct packet_offload {
 #define NETDEV_NOTIFY_PEERS	0x0013
 #define NETDEV_JOIN		0x0014
 #define NETDEV_CHANGEUPPER	0x0015
+#define NETDEV_RESEND_IGMP	0x0016
 
 extern int register_netdevice_notifier(struct notifier_block *nb);
 extern int unregister_netdevice_notifier(struct notifier_block *nb);
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 2fb2d88e8c2e..03a92e117657 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -459,6 +459,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 
 	case NETDEV_NOTIFY_PEERS:
 	case NETDEV_BONDING_FAILOVER:
+	case NETDEV_RESEND_IGMP:
 		/* Propagate to vlan devices */
 		vlan_group_for_each_dev(grp, i, vlandev)
 			call_netdevice_notifiers(event, vlandev);
diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c
index 3a3f371b2841..2998dd1769a0 100644
--- a/net/bridge/br_notify.c
+++ b/net/bridge/br_notify.c
@@ -102,6 +102,11 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
 	case NETDEV_PRE_TYPE_CHANGE:
 		/* Forbid underlaying device to change its type. */
 		return NOTIFY_BAD;
+
+	case NETDEV_RESEND_IGMP:
+		/* Propagate to master device */
+		call_netdevice_notifiers(event, br->dev);
+		break;
 	}
 
 	/* Events that may cause spanning tree to refresh */
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index cd71190d2962..375aca372250 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1323,16 +1323,17 @@ out:
 EXPORT_SYMBOL(ip_mc_inc_group);
 
 /*
- *	Resend IGMP JOIN report; used for bonding.
- *	Called with rcu_read_lock()
+ *	Resend IGMP JOIN report; used by netdev notifier.
  */
-void ip_mc_rejoin_groups(struct in_device *in_dev)
+static void ip_mc_rejoin_groups(struct in_device *in_dev)
 {
 #ifdef CONFIG_IP_MULTICAST
 	struct ip_mc_list *im;
 	int type;
 
-	for_each_pmc_rcu(in_dev, im) {
+	ASSERT_RTNL();
+
+	for_each_pmc_rtnl(in_dev, im) {
 		if (im->multiaddr == IGMP_ALL_HOSTS)
 			continue;
 
@@ -1349,7 +1350,6 @@ void ip_mc_rejoin_groups(struct in_device *in_dev)
 	}
 #endif
 }
-EXPORT_SYMBOL(ip_mc_rejoin_groups);
 
 /*
  *	A socket has left a multicast group on device dev
@@ -2735,8 +2735,42 @@ static struct pernet_operations igmp_net_ops = {
 	.exit = igmp_net_exit,
 };
 
+static int igmp_netdev_event(struct notifier_block *this,
+			     unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct in_device *in_dev;
+
+	switch (event) {
+	case NETDEV_RESEND_IGMP:
+		in_dev = __in_dev_get_rtnl(dev);
+		if (in_dev)
+			ip_mc_rejoin_groups(in_dev);
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block igmp_notifier = {
+	.notifier_call = igmp_netdev_event,
+};
+
 int __init igmp_mc_proc_init(void)
 {
-	return register_pernet_subsys(&igmp_net_ops);
+	int err;
+
+	err = register_pernet_subsys(&igmp_net_ops);
+	if (err)
+		return err;
+	err = register_netdevice_notifier(&igmp_notifier);
+	if (err)
+		goto reg_notif_fail;
+	return 0;
+
+reg_notif_fail:
+	unregister_pernet_subsys(&igmp_net_ops);
+	return err;
 }
 #endif
-- 
cgit v1.2.3


From c4854ec8c483fb8deea7400c11cd7fd7c03e306c Mon Sep 17 00:00:00 2001
From: Rami Rosen <ramirose@gmail.com>
Date: Sat, 20 Jul 2013 15:09:28 +0300
Subject: ipmr: change the prototype of ip_mr_forward().

This patch changes the prototpye of the ip_mr_forward() method to return void
instead of int.

The ip_mr_forward() method always returns 0; moreover, the return value of this
method is not checked anywhere.

Signed-off-by: Rami Rosen <ramirose@gmail.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipmr.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 132a09664704..bacc0bcf48ce 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -127,9 +127,9 @@ static struct kmem_cache *mrt_cachep __read_mostly;
 static struct mr_table *ipmr_new_table(struct net *net, u32 id);
 static void ipmr_free_table(struct mr_table *mrt);
 
-static int ip_mr_forward(struct net *net, struct mr_table *mrt,
-			 struct sk_buff *skb, struct mfc_cache *cache,
-			 int local);
+static void ip_mr_forward(struct net *net, struct mr_table *mrt,
+			  struct sk_buff *skb, struct mfc_cache *cache,
+			  int local);
 static int ipmr_cache_report(struct mr_table *mrt,
 			     struct sk_buff *pkt, vifi_t vifi, int assert);
 static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
@@ -1795,9 +1795,9 @@ static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
 
 /* "local" means that we should preserve one skb (for local delivery) */
 
-static int ip_mr_forward(struct net *net, struct mr_table *mrt,
-			 struct sk_buff *skb, struct mfc_cache *cache,
-			 int local)
+static void ip_mr_forward(struct net *net, struct mr_table *mrt,
+			  struct sk_buff *skb, struct mfc_cache *cache,
+			  int local)
 {
 	int psend = -1;
 	int vif, ct;
@@ -1903,14 +1903,13 @@ last_forward:
 				ipmr_queue_xmit(net, mrt, skb2, cache, psend);
 		} else {
 			ipmr_queue_xmit(net, mrt, skb, cache, psend);
-			return 0;
+			return;
 		}
 	}
 
 dont_forward:
 	if (!local)
 		kfree_skb(skb);
-	return 0;
 }
 
 static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
-- 
cgit v1.2.3


From 2b52c3ada8f4391d82d8088d9169ac2a8c4d0411 Mon Sep 17 00:00:00 2001
From: Rami Rosen <ramirose@gmail.com>
Date: Sun, 21 Jul 2013 03:00:31 +0300
Subject: ip6mr: change the prototype of ip6_mr_forward().

This patch changes the prototpye of the ip6_mr_forward() method to return void
instead of int.

The ip6_mr_forward() method always returns 0; moreover, the return value of this
method is not checked anywhere.

Signed-off-by: Rami Rosen <ramirose@gmail.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6mr.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 583e8d435f9a..6d78615f6726 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -110,8 +110,8 @@ static struct kmem_cache *mrt_cachep __read_mostly;
 static struct mr6_table *ip6mr_new_table(struct net *net, u32 id);
 static void ip6mr_free_table(struct mr6_table *mrt);
 
-static int ip6_mr_forward(struct net *net, struct mr6_table *mrt,
-			  struct sk_buff *skb, struct mfc6_cache *cache);
+static void ip6_mr_forward(struct net *net, struct mr6_table *mrt,
+			   struct sk_buff *skb, struct mfc6_cache *cache);
 static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt,
 			      mifi_t mifi, int assert);
 static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
@@ -2069,8 +2069,8 @@ static int ip6mr_find_vif(struct mr6_table *mrt, struct net_device *dev)
 	return ct;
 }
 
-static int ip6_mr_forward(struct net *net, struct mr6_table *mrt,
-			  struct sk_buff *skb, struct mfc6_cache *cache)
+static void ip6_mr_forward(struct net *net, struct mr6_table *mrt,
+			   struct sk_buff *skb, struct mfc6_cache *cache)
 {
 	int psend = -1;
 	int vif, ct;
@@ -2151,12 +2151,11 @@ forward:
 last_forward:
 	if (psend != -1) {
 		ip6mr_forward2(net, mrt, skb, cache, psend);
-		return 0;
+		return;
 	}
 
 dont_forward:
 	kfree_skb(skb);
-	return 0;
 }
 
 
-- 
cgit v1.2.3


From b9959fd3b0fa8ee5f45012ae5258d87ee6852baa Mon Sep 17 00:00:00 2001
From: Amerigo Wang <amwang@redhat.com>
Date: Sun, 21 Jul 2013 10:46:25 +0800
Subject: vti: switch to new ip tunnel code

GRE tunnel and IPIP tunnel already switched to the new
ip tunnel code, VTI tunnel can use it too.

Cc: Pravin B Shelar <pshelar@nicira.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Saurabh Mohan <saurabh.mohan@vyatta.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Cong Wang <amwang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_vti.c | 528 ++++++------------------------------------------------
 1 file changed, 52 insertions(+), 476 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 17cc0ffa8c0d..79b263da4168 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -44,176 +44,10 @@
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 
-#define HASH_SIZE  16
-#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&(HASH_SIZE-1))
-
 static struct rtnl_link_ops vti_link_ops __read_mostly;
 
 static int vti_net_id __read_mostly;
-struct vti_net {
-	struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
-	struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
-	struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
-	struct ip_tunnel __rcu *tunnels_wc[1];
-	struct ip_tunnel __rcu **tunnels[4];
-
-	struct net_device *fb_tunnel_dev;
-};
-
-static int vti_fb_tunnel_init(struct net_device *dev);
 static int vti_tunnel_init(struct net_device *dev);
-static void vti_tunnel_setup(struct net_device *dev);
-static void vti_dev_free(struct net_device *dev);
-static int vti_tunnel_bind_dev(struct net_device *dev);
-
-#define VTI_XMIT(stats1, stats2) do {				\
-	int err;						\
-	int pkt_len = skb->len;					\
-	err = dst_output(skb);					\
-	if (net_xmit_eval(err) == 0) {				\
-		u64_stats_update_begin(&(stats1)->syncp);	\
-		(stats1)->tx_bytes += pkt_len;			\
-		(stats1)->tx_packets++;				\
-		u64_stats_update_end(&(stats1)->syncp);		\
-	} else {						\
-		(stats2)->tx_errors++;				\
-		(stats2)->tx_aborted_errors++;			\
-	}							\
-} while (0)
-
-
-static struct ip_tunnel *vti_tunnel_lookup(struct net *net,
-					   __be32 remote, __be32 local)
-{
-	unsigned h0 = HASH(remote);
-	unsigned h1 = HASH(local);
-	struct ip_tunnel *t;
-	struct vti_net *ipn = net_generic(net, vti_net_id);
-
-	for_each_ip_tunnel_rcu(t, ipn->tunnels_r_l[h0 ^ h1])
-		if (local == t->parms.iph.saddr &&
-		    remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
-			return t;
-	for_each_ip_tunnel_rcu(t, ipn->tunnels_r[h0])
-		if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
-			return t;
-
-	for_each_ip_tunnel_rcu(t, ipn->tunnels_l[h1])
-		if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
-			return t;
-
-	for_each_ip_tunnel_rcu(t, ipn->tunnels_wc[0])
-		if (t && (t->dev->flags&IFF_UP))
-			return t;
-	return NULL;
-}
-
-static struct ip_tunnel __rcu **__vti_bucket(struct vti_net *ipn,
-					     struct ip_tunnel_parm *parms)
-{
-	__be32 remote = parms->iph.daddr;
-	__be32 local = parms->iph.saddr;
-	unsigned h = 0;
-	int prio = 0;
-
-	if (remote) {
-		prio |= 2;
-		h ^= HASH(remote);
-	}
-	if (local) {
-		prio |= 1;
-		h ^= HASH(local);
-	}
-	return &ipn->tunnels[prio][h];
-}
-
-static inline struct ip_tunnel __rcu **vti_bucket(struct vti_net *ipn,
-						  struct ip_tunnel *t)
-{
-	return __vti_bucket(ipn, &t->parms);
-}
-
-static void vti_tunnel_unlink(struct vti_net *ipn, struct ip_tunnel *t)
-{
-	struct ip_tunnel __rcu **tp;
-	struct ip_tunnel *iter;
-
-	for (tp = vti_bucket(ipn, t);
-	     (iter = rtnl_dereference(*tp)) != NULL;
-	     tp = &iter->next) {
-		if (t == iter) {
-			rcu_assign_pointer(*tp, t->next);
-			break;
-		}
-	}
-}
-
-static void vti_tunnel_link(struct vti_net *ipn, struct ip_tunnel *t)
-{
-	struct ip_tunnel __rcu **tp = vti_bucket(ipn, t);
-
-	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
-	rcu_assign_pointer(*tp, t);
-}
-
-static struct ip_tunnel *vti_tunnel_locate(struct net *net,
-					   struct ip_tunnel_parm *parms,
-					   int create)
-{
-	__be32 remote = parms->iph.daddr;
-	__be32 local = parms->iph.saddr;
-	struct ip_tunnel *t, *nt;
-	struct ip_tunnel __rcu **tp;
-	struct net_device *dev;
-	char name[IFNAMSIZ];
-	struct vti_net *ipn = net_generic(net, vti_net_id);
-
-	for (tp = __vti_bucket(ipn, parms);
-	     (t = rtnl_dereference(*tp)) != NULL;
-	     tp = &t->next) {
-		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
-			return t;
-	}
-	if (!create)
-		return NULL;
-
-	if (parms->name[0])
-		strlcpy(name, parms->name, IFNAMSIZ);
-	else
-		strcpy(name, "vti%d");
-
-	dev = alloc_netdev(sizeof(*t), name, vti_tunnel_setup);
-	if (dev == NULL)
-		return NULL;
-
-	dev_net_set(dev, net);
-
-	nt = netdev_priv(dev);
-	nt->parms = *parms;
-	dev->rtnl_link_ops = &vti_link_ops;
-
-	vti_tunnel_bind_dev(dev);
-
-	if (register_netdevice(dev) < 0)
-		goto failed_free;
-
-	dev_hold(dev);
-	vti_tunnel_link(ipn, nt);
-	return nt;
-
-failed_free:
-	free_netdev(dev);
-	return NULL;
-}
-
-static void vti_tunnel_uninit(struct net_device *dev)
-{
-	struct net *net = dev_net(dev);
-	struct vti_net *ipn = net_generic(net, vti_net_id);
-
-	vti_tunnel_unlink(ipn, netdev_priv(dev));
-	dev_put(dev);
-}
 
 static int vti_err(struct sk_buff *skb, u32 info)
 {
@@ -222,6 +56,8 @@ static int vti_err(struct sk_buff *skb, u32 info)
 	 * 8 bytes of packet payload. It means, that precise relaying of
 	 * ICMP in the real Internet is absolutely infeasible.
 	 */
+	struct net *net = dev_net(skb->dev);
+	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
 	struct iphdr *iph = (struct iphdr *)skb->data;
 	const int type = icmp_hdr(skb)->type;
 	const int code = icmp_hdr(skb)->code;
@@ -252,7 +88,8 @@ static int vti_err(struct sk_buff *skb, u32 info)
 
 	err = -ENOENT;
 
-	t = vti_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
+	t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+			     iph->daddr, iph->saddr, 0);
 	if (t == NULL)
 		goto out;
 
@@ -281,8 +118,11 @@ static int vti_rcv(struct sk_buff *skb)
 {
 	struct ip_tunnel *tunnel;
 	const struct iphdr *iph = ip_hdr(skb);
+	struct net *net = dev_net(skb->dev);
+	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
 
-	tunnel = vti_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
+	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+				  iph->saddr, iph->daddr, 0);
 	if (tunnel != NULL) {
 		struct pcpu_tstats *tstats;
 
@@ -311,7 +151,6 @@ static int vti_rcv(struct sk_buff *skb)
 static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
-	struct pcpu_tstats *tstats;
 	struct iphdr  *tiph = &tunnel->parms.iph;
 	u8     tos;
 	struct rtable *rt;		/* Route to the other host */
@@ -319,6 +158,7 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct iphdr  *old_iph = ip_hdr(skb);
 	__be32 dst = tiph->daddr;
 	struct flowi4 fl4;
+	int err;
 
 	if (skb->protocol != htons(ETH_P_IP))
 		goto tx_error;
@@ -367,8 +207,10 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	nf_reset(skb);
 	skb->dev = skb_dst(skb)->dev;
 
-	tstats = this_cpu_ptr(dev->tstats);
-	VTI_XMIT(tstats, &dev->stats);
+	err = dst_output(skb);
+	if (net_xmit_eval(err) == 0)
+		err = skb->len;
+	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
 	return NETDEV_TX_OK;
 
 tx_error_icmp:
@@ -379,198 +221,57 @@ tx_error:
 	return NETDEV_TX_OK;
 }
 
-static int vti_tunnel_bind_dev(struct net_device *dev)
-{
-	struct net_device *tdev = NULL;
-	struct ip_tunnel *tunnel;
-	struct iphdr *iph;
-
-	tunnel = netdev_priv(dev);
-	iph = &tunnel->parms.iph;
-
-	if (iph->daddr) {
-		struct rtable *rt;
-		struct flowi4 fl4;
-		memset(&fl4, 0, sizeof(fl4));
-		flowi4_init_output(&fl4, tunnel->parms.link,
-				   be32_to_cpu(tunnel->parms.i_key),
-				   RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
-				   IPPROTO_IPIP, 0,
-				   iph->daddr, iph->saddr, 0, 0);
-		rt = ip_route_output_key(dev_net(dev), &fl4);
-		if (!IS_ERR(rt)) {
-			tdev = rt->dst.dev;
-			ip_rt_put(rt);
-		}
-		dev->flags |= IFF_POINTOPOINT;
-	}
-
-	if (!tdev && tunnel->parms.link)
-		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
-
-	if (tdev) {
-		dev->hard_header_len = tdev->hard_header_len +
-				       sizeof(struct iphdr);
-		dev->mtu = tdev->mtu;
-	}
-	dev->iflink = tunnel->parms.link;
-	return dev->mtu;
-}
-
 static int
 vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 {
 	int err = 0;
 	struct ip_tunnel_parm p;
-	struct ip_tunnel *t;
-	struct net *net = dev_net(dev);
-	struct vti_net *ipn = net_generic(net, vti_net_id);
-
-	switch (cmd) {
-	case SIOCGETTUNNEL:
-		t = NULL;
-		if (dev == ipn->fb_tunnel_dev) {
-			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data,
-					   sizeof(p))) {
-				err = -EFAULT;
-				break;
-			}
-			t = vti_tunnel_locate(net, &p, 0);
-		}
-		if (t == NULL)
-			t = netdev_priv(dev);
-		memcpy(&p, &t->parms, sizeof(p));
-		p.i_flags |= GRE_KEY | VTI_ISVTI;
-		p.o_flags |= GRE_KEY;
-		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
-			err = -EFAULT;
-		break;
-
-	case SIOCADDTUNNEL:
-	case SIOCCHGTUNNEL:
-		err = -EPERM;
-		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
-			goto done;
 
-		err = -EFAULT;
-		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
-			goto done;
+	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+		return -EFAULT;
 
-		err = -EINVAL;
+	if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
 		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
 		    p.iph.ihl != 5)
-			goto done;
-
-		t = vti_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
-
-		if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
-			if (t != NULL) {
-				if (t->dev != dev) {
-					err = -EEXIST;
-					break;
-				}
-			} else {
-				if (((dev->flags&IFF_POINTOPOINT) &&
-				    !p.iph.daddr) ||
-				    (!(dev->flags&IFF_POINTOPOINT) &&
-				    p.iph.daddr)) {
-					err = -EINVAL;
-					break;
-				}
-				t = netdev_priv(dev);
-				vti_tunnel_unlink(ipn, t);
-				synchronize_net();
-				t->parms.iph.saddr = p.iph.saddr;
-				t->parms.iph.daddr = p.iph.daddr;
-				t->parms.i_key = p.i_key;
-				t->parms.o_key = p.o_key;
-				t->parms.iph.protocol = IPPROTO_IPIP;
-				memcpy(dev->dev_addr, &p.iph.saddr, 4);
-				memcpy(dev->broadcast, &p.iph.daddr, 4);
-				vti_tunnel_link(ipn, t);
-				netdev_state_change(dev);
-			}
-		}
-
-		if (t) {
-			err = 0;
-			if (cmd == SIOCCHGTUNNEL) {
-				t->parms.i_key = p.i_key;
-				t->parms.o_key = p.o_key;
-				if (t->parms.link != p.link) {
-					t->parms.link = p.link;
-					vti_tunnel_bind_dev(dev);
-					netdev_state_change(dev);
-				}
-			}
-			p.i_flags |= GRE_KEY | VTI_ISVTI;
-			p.o_flags |= GRE_KEY;
-			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms,
-					 sizeof(p)))
-				err = -EFAULT;
-		} else
-			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
-		break;
+			return -EINVAL;
+	}
 
-	case SIOCDELTUNNEL:
-		err = -EPERM;
-		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
-			goto done;
-
-		if (dev == ipn->fb_tunnel_dev) {
-			err = -EFAULT;
-			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data,
-					   sizeof(p)))
-				goto done;
-			err = -ENOENT;
-
-			t = vti_tunnel_locate(net, &p, 0);
-			if (t == NULL)
-				goto done;
-			err = -EPERM;
-			if (t->dev == ipn->fb_tunnel_dev)
-				goto done;
-			dev = t->dev;
-		}
-		unregister_netdevice(dev);
-		err = 0;
-		break;
+	err = ip_tunnel_ioctl(dev, &p, cmd);
+	if (err)
+		return err;
 
-	default:
-		err = -EINVAL;
+	if (cmd != SIOCDELTUNNEL) {
+		p.i_flags |= GRE_KEY | VTI_ISVTI;
+		p.o_flags |= GRE_KEY;
 	}
 
-done:
-	return err;
-}
-
-static int vti_tunnel_change_mtu(struct net_device *dev, int new_mtu)
-{
-	if (new_mtu < 68 || new_mtu > 0xFFF8)
-		return -EINVAL;
-	dev->mtu = new_mtu;
+	if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+		return -EFAULT;
 	return 0;
 }
 
 static const struct net_device_ops vti_netdev_ops = {
 	.ndo_init	= vti_tunnel_init,
-	.ndo_uninit	= vti_tunnel_uninit,
+	.ndo_uninit	= ip_tunnel_uninit,
 	.ndo_start_xmit	= vti_tunnel_xmit,
 	.ndo_do_ioctl	= vti_tunnel_ioctl,
-	.ndo_change_mtu	= vti_tunnel_change_mtu,
+	.ndo_change_mtu	= ip_tunnel_change_mtu,
 	.ndo_get_stats64 = ip_tunnel_get_stats64,
 };
 
-static void vti_dev_free(struct net_device *dev)
+static void vti_tunnel_setup(struct net_device *dev)
 {
-	free_percpu(dev->tstats);
-	free_netdev(dev);
+	dev->netdev_ops		= &vti_netdev_ops;
+	ip_tunnel_setup(dev, vti_net_id);
 }
 
-static void vti_tunnel_setup(struct net_device *dev)
+static int vti_tunnel_init(struct net_device *dev)
 {
-	dev->netdev_ops		= &vti_netdev_ops;
-	dev->destructor		= vti_dev_free;
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct iphdr *iph = &tunnel->parms.iph;
+
+	memcpy(dev->dev_addr, &iph->saddr, 4);
+	memcpy(dev->broadcast, &iph->daddr, 4);
 
 	dev->type		= ARPHRD_TUNNEL;
 	dev->hard_header_len	= LL_MAX_HEADER + sizeof(struct iphdr);
@@ -581,38 +282,18 @@ static void vti_tunnel_setup(struct net_device *dev)
 	dev->features		|= NETIF_F_NETNS_LOCAL;
 	dev->features		|= NETIF_F_LLTX;
 	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
-}
 
-static int vti_tunnel_init(struct net_device *dev)
-{
-	struct ip_tunnel *tunnel = netdev_priv(dev);
-
-	tunnel->dev = dev;
-	strcpy(tunnel->parms.name, dev->name);
-
-	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
-	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
-
-	dev->tstats = alloc_percpu(struct pcpu_tstats);
-	if (!dev->tstats)
-		return -ENOMEM;
-
-	return 0;
+	return ip_tunnel_init(dev);
 }
 
-static int __net_init vti_fb_tunnel_init(struct net_device *dev)
+static void __net_init vti_fb_tunnel_init(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 	struct iphdr *iph = &tunnel->parms.iph;
-	struct vti_net *ipn = net_generic(dev_net(dev), vti_net_id);
 
 	iph->version		= 4;
 	iph->protocol		= IPPROTO_IPIP;
 	iph->ihl		= 5;
-
-	dev_hold(dev);
-	rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
-	return 0;
 }
 
 static struct xfrm_tunnel vti_handler __read_mostly = {
@@ -621,76 +302,30 @@ static struct xfrm_tunnel vti_handler __read_mostly = {
 	.priority	=	1,
 };
 
-static void vti_destroy_tunnels(struct vti_net *ipn, struct list_head *head)
-{
-	int prio;
-
-	for (prio = 1; prio < 4; prio++) {
-		int h;
-		for (h = 0; h < HASH_SIZE; h++) {
-			struct ip_tunnel *t;
-
-			t = rtnl_dereference(ipn->tunnels[prio][h]);
-			while (t != NULL) {
-				unregister_netdevice_queue(t->dev, head);
-				t = rtnl_dereference(t->next);
-			}
-		}
-	}
-}
-
 static int __net_init vti_init_net(struct net *net)
 {
 	int err;
-	struct vti_net *ipn = net_generic(net, vti_net_id);
-
-	ipn->tunnels[0] = ipn->tunnels_wc;
-	ipn->tunnels[1] = ipn->tunnels_l;
-	ipn->tunnels[2] = ipn->tunnels_r;
-	ipn->tunnels[3] = ipn->tunnels_r_l;
-
-	ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel),
-					  "ip_vti0",
-					  vti_tunnel_setup);
-	if (!ipn->fb_tunnel_dev) {
-		err = -ENOMEM;
-		goto err_alloc_dev;
-	}
-	dev_net_set(ipn->fb_tunnel_dev, net);
-
-	err = vti_fb_tunnel_init(ipn->fb_tunnel_dev);
-	if (err)
-		goto err_reg_dev;
-	ipn->fb_tunnel_dev->rtnl_link_ops = &vti_link_ops;
+	struct ip_tunnel_net *itn;
 
-	err = register_netdev(ipn->fb_tunnel_dev);
+	err = ip_tunnel_init_net(net, vti_net_id, &vti_link_ops, "ip_vti0");
 	if (err)
-		goto err_reg_dev;
+		return err;
+	itn = net_generic(net, vti_net_id);
+	vti_fb_tunnel_init(itn->fb_tunnel_dev);
 	return 0;
-
-err_reg_dev:
-	vti_dev_free(ipn->fb_tunnel_dev);
-err_alloc_dev:
-	/* nothing */
-	return err;
 }
 
 static void __net_exit vti_exit_net(struct net *net)
 {
-	struct vti_net *ipn = net_generic(net, vti_net_id);
-	LIST_HEAD(list);
-
-	rtnl_lock();
-	vti_destroy_tunnels(ipn, &list);
-	unregister_netdevice_many(&list);
-	rtnl_unlock();
+	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+	ip_tunnel_delete_net(itn);
 }
 
 static struct pernet_operations vti_net_ops = {
 	.init = vti_init_net,
 	.exit = vti_exit_net,
 	.id   = &vti_net_id,
-	.size = sizeof(struct vti_net),
+	.size = sizeof(struct ip_tunnel_net),
 };
 
 static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
@@ -728,78 +363,19 @@ static void vti_netlink_parms(struct nlattr *data[],
 static int vti_newlink(struct net *src_net, struct net_device *dev,
 		       struct nlattr *tb[], struct nlattr *data[])
 {
-	struct ip_tunnel *nt;
-	struct net *net = dev_net(dev);
-	struct vti_net *ipn = net_generic(net, vti_net_id);
-	int mtu;
-	int err;
-
-	nt = netdev_priv(dev);
-	vti_netlink_parms(data, &nt->parms);
-
-	if (vti_tunnel_locate(net, &nt->parms, 0))
-		return -EEXIST;
+	struct ip_tunnel_parm parms;
 
-	mtu = vti_tunnel_bind_dev(dev);
-	if (!tb[IFLA_MTU])
-		dev->mtu = mtu;
-
-	err = register_netdevice(dev);
-	if (err)
-		goto out;
-
-	dev_hold(dev);
-	vti_tunnel_link(ipn, nt);
-
-out:
-	return err;
+	vti_netlink_parms(data, &parms);
+	return ip_tunnel_newlink(dev, tb, &parms);
 }
 
 static int vti_changelink(struct net_device *dev, struct nlattr *tb[],
 			  struct nlattr *data[])
 {
-	struct ip_tunnel *t, *nt;
-	struct net *net = dev_net(dev);
-	struct vti_net *ipn = net_generic(net, vti_net_id);
 	struct ip_tunnel_parm p;
-	int mtu;
-
-	if (dev == ipn->fb_tunnel_dev)
-		return -EINVAL;
 
-	nt = netdev_priv(dev);
 	vti_netlink_parms(data, &p);
-
-	t = vti_tunnel_locate(net, &p, 0);
-
-	if (t) {
-		if (t->dev != dev)
-			return -EEXIST;
-	} else {
-		t = nt;
-
-		vti_tunnel_unlink(ipn, t);
-		t->parms.iph.saddr = p.iph.saddr;
-		t->parms.iph.daddr = p.iph.daddr;
-		t->parms.i_key = p.i_key;
-		t->parms.o_key = p.o_key;
-		if (dev->type != ARPHRD_ETHER) {
-			memcpy(dev->dev_addr, &p.iph.saddr, 4);
-			memcpy(dev->broadcast, &p.iph.daddr, 4);
-		}
-		vti_tunnel_link(ipn, t);
-		netdev_state_change(dev);
-	}
-
-	if (t->parms.link != p.link) {
-		t->parms.link = p.link;
-		mtu = vti_tunnel_bind_dev(dev);
-		if (!tb[IFLA_MTU])
-			dev->mtu = mtu;
-		netdev_state_change(dev);
-	}
-
-	return 0;
+	return ip_tunnel_changelink(dev, tb, &p);
 }
 
 static size_t vti_get_size(const struct net_device *dev)
@@ -865,7 +441,7 @@ static int __init vti_init(void)
 	err = xfrm4_mode_tunnel_input_register(&vti_handler);
 	if (err < 0) {
 		unregister_pernet_device(&vti_net_ops);
-		pr_info(KERN_INFO "vti init: can't register tunnel\n");
+		pr_info("vti init: can't register tunnel\n");
 	}
 
 	err = rtnl_link_register(&vti_link_ops);
-- 
cgit v1.2.3


From 86a37def2bba8d22f8b512e7c8a0f6f6e197728c Mon Sep 17 00:00:00 2001
From: "fan.du" <fan.du@windriver.com>
Date: Mon, 22 Jul 2013 14:20:12 +0800
Subject: net ipv6: Remove rebundant rt6i_nsiblings initialization

Seting rt->rt6i_nsiblings to zero is rebundant, because above memset
zeroed the rest of rt excluding the first dst memember.

Signed-off-by: Fan Du <fan.du@windriver.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: James Morris <jmorris@namei.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index a8c891aa2464..74ab1f74abcd 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -285,7 +285,6 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net,
 		rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
 		rt->rt6i_genid = rt_genid(net);
 		INIT_LIST_HEAD(&rt->rt6i_siblings);
-		rt->rt6i_nsiblings = 0;
 	}
 	return rt;
 }
-- 
cgit v1.2.3


From 9225b23057aeba13957e8033f69a554dc246a80e Mon Sep 17 00:00:00 2001
From: "fan.du" <fan.du@windriver.com>
Date: Mon, 22 Jul 2013 14:21:09 +0800
Subject: net: ipv6 eliminate parameter "int addrlen" in function fib6_add_1

The "int addrlen" in fib6_add_1 is rebundant, as we can get it from
parameter "struct in6_addr *addr" once we modified its type.
And also fix some coding style issues in fib6_add_1

Signed-off-by: Fan Du <fan.du@windriver.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 5fc9c7a68d8d..8b6c77389a04 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -425,8 +425,8 @@ out:
  *	node.
  */
 
-static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr,
-				     int addrlen, int plen,
+static struct fib6_node *fib6_add_1(struct fib6_node *root,
+				     struct in6_addr *addr, int plen,
 				     int offset, int allow_create,
 				     int replace_required)
 {
@@ -543,7 +543,7 @@ insert_above:
 	   but if it is >= plen, the value is ignored in any case.
 	 */
 
-	bit = __ipv6_addr_diff(addr, &key->addr, addrlen);
+	bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr));
 
 	/*
 	 *		(intermediate)[in]
@@ -822,9 +822,9 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info)
 	if (!allow_create && !replace_required)
 		pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");
 
-	fn = fib6_add_1(root, &rt->rt6i_dst.addr, sizeof(struct in6_addr),
-			rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst),
-			allow_create, replace_required);
+	fn = fib6_add_1(root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen,
+			offsetof(struct rt6_info, rt6i_dst), allow_create,
+			replace_required);
 
 	if (IS_ERR(fn)) {
 		err = PTR_ERR(fn);
@@ -863,7 +863,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info)
 			/* Now add the first leaf node to new subtree */
 
 			sn = fib6_add_1(sfn, &rt->rt6i_src.addr,
-					sizeof(struct in6_addr), rt->rt6i_src.plen,
+					rt->rt6i_src.plen,
 					offsetof(struct rt6_info, rt6i_src),
 					allow_create, replace_required);
 
@@ -882,7 +882,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info)
 			fn->subtree = sfn;
 		} else {
 			sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr,
-					sizeof(struct in6_addr), rt->rt6i_src.plen,
+					rt->rt6i_src.plen,
 					offsetof(struct rt6_info, rt6i_src),
 					allow_create, replace_required);
 
-- 
cgit v1.2.3


From 59ea52dc461ebb05b78545064604d92faf8bb16f Mon Sep 17 00:00:00 2001
From: Andi Shyti <andi@etezian.org>
Date: Mon, 22 Jul 2013 14:59:16 +0200
Subject: net: trans_rdma: remove unused function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch gets rid of the following warning:

net/9p/trans_rdma.c:594:12: warning: ‘rdma_cancelled’ defined but not used [-Wunused-function]
 static int rdma_cancelled(struct p9_client *client, struct p9_req_t *req)

The rdma_cancelled function is not called anywhere in the kernel

Signed-off-by: Andi Shyti <andi@etezian.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/9p/trans_rdma.c | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'net')

diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 928f2bb9bf8d..8f68df5d2973 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -588,17 +588,6 @@ static int rdma_cancel(struct p9_client *client, struct p9_req_t *req)
 	return 1;
 }
 
-/* A request has been fully flushed without a reply.
- * That means we have posted one buffer in excess.
- */
-static int rdma_cancelled(struct p9_client *client, struct p9_req_t *req)
-{
-	struct p9_trans_rdma *rdma = client->trans;
-
-	atomic_inc(&rdma->excess_rc);
-	return 0;
-}
-
 /**
  * trans_create_rdma - Transport method for creating atransport instance
  * @client: client instance
-- 
cgit v1.2.3


From 91705c61b52029ab5da67a15a23eef08667bf40e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Tue, 23 Jul 2013 14:51:47 +0200
Subject: net: sctp: trivial: update mailing list address

The SCTP mailing list address to send patches or questions
to is linux-sctp@vger.kernel.org and not
lksctp-developers@lists.sourceforge.net anymore. Therefore,
update all occurences.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Acked-by: Vlad Yasevich <vyasevich@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/sctp.txt | 5 +----
 include/net/sctp/auth.h           | 2 +-
 include/net/sctp/checksum.h       | 2 +-
 include/net/sctp/constants.h      | 2 +-
 include/net/sctp/sctp.h           | 2 +-
 include/net/sctp/sm.h             | 2 +-
 include/net/sctp/structs.h        | 2 +-
 include/net/sctp/tsnmap.h         | 2 +-
 include/net/sctp/ulpevent.h       | 2 +-
 include/net/sctp/ulpqueue.h       | 2 +-
 include/uapi/linux/sctp.h         | 2 +-
 net/sctp/associola.c              | 2 +-
 net/sctp/auth.c                   | 2 +-
 net/sctp/bind_addr.c              | 2 +-
 net/sctp/chunk.c                  | 2 +-
 net/sctp/command.c                | 2 +-
 net/sctp/debug.c                  | 2 +-
 net/sctp/endpointola.c            | 2 +-
 net/sctp/input.c                  | 2 +-
 net/sctp/inqueue.c                | 2 +-
 net/sctp/ipv6.c                   | 2 +-
 net/sctp/objcnt.c                 | 2 +-
 net/sctp/output.c                 | 2 +-
 net/sctp/outqueue.c               | 2 +-
 net/sctp/primitive.c              | 2 +-
 net/sctp/proc.c                   | 2 +-
 net/sctp/protocol.c               | 4 ++--
 net/sctp/sm_make_chunk.c          | 2 +-
 net/sctp/sm_sideeffect.c          | 2 +-
 net/sctp/sm_statefuns.c           | 2 +-
 net/sctp/sm_statetable.c          | 2 +-
 net/sctp/socket.c                 | 2 +-
 net/sctp/ssnmap.c                 | 2 +-
 net/sctp/sysctl.c                 | 2 +-
 net/sctp/transport.c              | 2 +-
 net/sctp/tsnmap.c                 | 2 +-
 net/sctp/ulpevent.c               | 2 +-
 net/sctp/ulpqueue.c               | 2 +-
 38 files changed, 39 insertions(+), 42 deletions(-)

(limited to 'net')

diff --git a/Documentation/networking/sctp.txt b/Documentation/networking/sctp.txt
index 0c790a76910e..97b810ca9082 100644
--- a/Documentation/networking/sctp.txt
+++ b/Documentation/networking/sctp.txt
@@ -19,7 +19,6 @@ of SCTP that is RFC 2960 compliant and provides an programming interface
 referred to as the  UDP-style API of the Sockets Extensions for SCTP, as 
 proposed in IETF Internet-Drafts.    
 
-
 Caveats:  
 
 -lksctp can be built as statically or as a module.  However, be aware that 
@@ -33,6 +32,4 @@ For more information, please visit the lksctp project website:
    http://www.sf.net/projects/lksctp
 
 Or contact the lksctp developers through the mailing list:
-   <lksctp-developers@lists.sourceforge.net> 
-
-
+   <linux-sctp@vger.kernel.org>
diff --git a/include/net/sctp/auth.h b/include/net/sctp/auth.h
index 49bc9577c61e..754c511f4cf9 100644
--- a/include/net/sctp/auth.h
+++ b/include/net/sctp/auth.h
@@ -22,7 +22,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/include/net/sctp/checksum.h b/include/net/sctp/checksum.h
index 0cb08e6fb6df..78b88aa8c810 100644
--- a/include/net/sctp/checksum.h
+++ b/include/net/sctp/checksum.h
@@ -25,7 +25,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index ca50e0751e47..a1e7aa5c8bbf 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -25,7 +25,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index d8e37ecea691..554cf88605fc 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -27,7 +27,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 2a82d1384706..2aa66dda4a5a 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -27,7 +27,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email addresses:
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index e745c92a1532..75c4c16601b6 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -25,7 +25,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email addresses:
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/include/net/sctp/tsnmap.h b/include/net/sctp/tsnmap.h
index 2c5d2b4d5d1e..c361d6f7e0cb 100644
--- a/include/net/sctp/tsnmap.h
+++ b/include/net/sctp/tsnmap.h
@@ -28,7 +28,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h
index ca4693b4e09e..34f0d34d22ce 100644
--- a/include/net/sctp/ulpevent.h
+++ b/include/net/sctp/ulpevent.h
@@ -31,7 +31,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/include/net/sctp/ulpqueue.h b/include/net/sctp/ulpqueue.h
index 00e50ba3f24b..add3237a9fdb 100644
--- a/include/net/sctp/ulpqueue.h
+++ b/include/net/sctp/ulpqueue.h
@@ -30,7 +30,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email addresses:
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 66b466e4ca08..ca451e99b28b 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -28,7 +28,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index bce5b79662a6..e425ba0bd79e 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -28,7 +28,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index ba1dfc3f8def..3aab967081be 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -22,7 +22,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
index 64977ea0f9c5..f34ce8bc1395 100644
--- a/net/sctp/bind_addr.c
+++ b/net/sctp/bind_addr.c
@@ -27,7 +27,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 5780565f5b7d..b50b90c16a86 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -24,7 +24,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/command.c b/net/sctp/command.c
index c0044019db9e..f4bebdadf007 100644
--- a/net/sctp/command.c
+++ b/net/sctp/command.c
@@ -25,7 +25,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/debug.c b/net/sctp/debug.c
index f4998780d6df..44aa4e7543a9 100644
--- a/net/sctp/debug.c
+++ b/net/sctp/debug.c
@@ -28,7 +28,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 9e3d257de0e0..825b7543a436 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -29,7 +29,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 3fa4d858c35a..7993495a4c0f 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -29,7 +29,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
index cb25f040fed0..e70f60ae92ef 100644
--- a/net/sctp/inqueue.c
+++ b/net/sctp/inqueue.c
@@ -30,7 +30,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 09ffcc912d23..85d688f59e6a 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -27,7 +27,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c
index fe012c44f8df..aec346cbed67 100644
--- a/net/sctp/objcnt.c
+++ b/net/sctp/objcnt.c
@@ -26,7 +26,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/output.c b/net/sctp/output.c
index a46d1eb41762..5a55c55d71ad 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -26,7 +26,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index ef9e2bbc0f2f..51313233e635 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -28,7 +28,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/primitive.c b/net/sctp/primitive.c
index 794bb14decde..31fa437da117 100644
--- a/net/sctp/primitive.c
+++ b/net/sctp/primitive.c
@@ -29,7 +29,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 62526c477050..aff0cac45b64 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -22,7 +22,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 4a17494d736c..b52ec2510101 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -29,7 +29,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
@@ -1547,7 +1547,7 @@ module_exit(sctp_exit);
  */
 MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-132");
 MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-132");
-MODULE_AUTHOR("Linux Kernel SCTP developers <lksctp-developers@lists.sourceforge.net>");
+MODULE_AUTHOR("Linux Kernel SCTP developers <linux-sctp@vger.kernel.org>");
 MODULE_DESCRIPTION("Support for the SCTP protocol (RFC2960)");
 module_param_named(no_checksums, sctp_checksum_disable, bool, 0644);
 MODULE_PARM_DESC(no_checksums, "Disable checksums computing and verification");
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 362ae6e2fd93..780a2d4b98a2 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -29,7 +29,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 9da68852ee94..f1f3aac3f3bb 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -28,7 +28,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index f6b7109195a6..93271f0966c6 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -28,7 +28,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c
index 84d98d8a5a74..64ea5fd87eb2 100644
--- a/net/sctp/sm_statetable.c
+++ b/net/sctp/sm_statetable.c
@@ -28,7 +28,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index c6670d2e3f8d..02457123bdee 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -34,7 +34,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/ssnmap.c b/net/sctp/ssnmap.c
index da8603523808..72b5939101bb 100644
--- a/net/sctp/ssnmap.c
+++ b/net/sctp/ssnmap.c
@@ -24,7 +24,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 9a5c4c9eddaf..190674702b20 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -25,7 +25,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index bdbbc3fd7c14..9602c52aa617 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -30,7 +30,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/tsnmap.c b/net/sctp/tsnmap.c
index b46019568a86..0eff8667d4c7 100644
--- a/net/sctp/tsnmap.c
+++ b/net/sctp/tsnmap.c
@@ -27,7 +27,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 44a45dbee4df..c07624fca7e5 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -28,7 +28,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 04e3d470f877..2cdb3010f50f 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -27,7 +27,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
-- 
cgit v1.2.3


From 64dc61306ce7da370833289739e2f52dfc6b37ba Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 22 Jul 2013 20:26:31 -0700
Subject: net: add sk_stream_is_writeable() helper

Several call sites use the hardcoded following condition :

sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)

Lets use a helper because TCP_NOTSENT_LOWAT support will change this
condition for TCP sockets.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h    | 4 ++++
 net/ceph/messenger.c  | 2 +-
 net/core/stream.c     | 2 +-
 net/dccp/proto.c      | 4 ++--
 net/ipv4/tcp.c        | 4 ++--
 net/sunrpc/svcsock.c  | 2 +-
 net/sunrpc/xprtsock.c | 2 +-
 7 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/include/net/sock.h b/include/net/sock.h
index e0473f61693f..d0b5fdee50a2 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1088,6 +1088,10 @@ static inline struct cg_proto *parent_cg_proto(struct proto *proto,
 }
 #endif
 
+static inline bool sk_stream_is_writeable(const struct sock *sk)
+{
+	return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk);
+}
 
 static inline bool sk_has_memory_pressure(const struct sock *sk)
 {
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index eb0a46a49bd4..3be308e14302 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -409,7 +409,7 @@ static void ceph_sock_write_space(struct sock *sk)
 	 * and net/core/stream.c:sk_stream_write_space().
 	 */
 	if (con_flag_test(con, CON_FLAG_WRITE_PENDING)) {
-		if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
+		if (sk_stream_is_writeable(sk)) {
 			dout("%s %p queueing write work\n", __func__, con);
 			clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 			queue_con(con);
diff --git a/net/core/stream.c b/net/core/stream.c
index f5df85dcd20b..512f0a24269b 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -30,7 +30,7 @@ void sk_stream_write_space(struct sock *sk)
 	struct socket *sock = sk->sk_socket;
 	struct socket_wq *wq;
 
-	if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock) {
+	if (sk_stream_is_writeable(sk) && sock) {
 		clear_bit(SOCK_NOSPACE, &sock->flags);
 
 		rcu_read_lock();
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 6c7c78b83940..ba64750f0387 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -336,7 +336,7 @@ unsigned int dccp_poll(struct file *file, struct socket *sock,
 			mask |= POLLIN | POLLRDNORM;
 
 		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
-			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
+			if (sk_stream_is_writeable(sk)) {
 				mask |= POLLOUT | POLLWRNORM;
 			} else {  /* send SIGIO later */
 				set_bit(SOCK_ASYNC_NOSPACE,
@@ -347,7 +347,7 @@ unsigned int dccp_poll(struct file *file, struct socket *sock,
 				 * wspace test but before the flags are set,
 				 * IO signal will be lost.
 				 */
-				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
+				if (sk_stream_is_writeable(sk))
 					mask |= POLLOUT | POLLWRNORM;
 			}
 		}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5423223e93c2..5eca9060bb8e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -499,7 +499,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 			mask |= POLLIN | POLLRDNORM;
 
 		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
-			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
+			if (sk_stream_is_writeable(sk)) {
 				mask |= POLLOUT | POLLWRNORM;
 			} else {  /* send SIGIO later */
 				set_bit(SOCK_ASYNC_NOSPACE,
@@ -510,7 +510,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 				 * wspace test but before the flags are set,
 				 * IO signal will be lost.
 				 */
-				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
+				if (sk_stream_is_writeable(sk))
 					mask |= POLLOUT | POLLWRNORM;
 			}
 		} else
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 305374d4fb98..0da6785ec15a 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -442,7 +442,7 @@ static void svc_tcp_write_space(struct sock *sk)
 {
 	struct socket *sock = sk->sk_socket;
 
-	if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock)
+	if (sk_stream_is_writeable(sk) && sock)
 		clear_bit(SOCK_NOSPACE, &sock->flags);
 	svc_write_space(sk);
 }
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index ddf0602603bd..d6656d7768f4 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1602,7 +1602,7 @@ static void xs_tcp_write_space(struct sock *sk)
 	read_lock_bh(&sk->sk_callback_lock);
 
 	/* from net/core/stream.c:sk_stream_write_space */
-	if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
+	if (sk_stream_is_writeable(sk))
 		xs_write_space(sk);
 
 	read_unlock_bh(&sk->sk_callback_lock);
-- 
cgit v1.2.3


From c9bee3b7fdecb0c1d070c7b54113b3bdfb9a3d36 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 22 Jul 2013 20:27:07 -0700
Subject: tcp: TCP_NOTSENT_LOWAT socket option

Idea of this patch is to add optional limitation of number of
unsent bytes in TCP sockets, to reduce usage of kernel memory.

TCP receiver might announce a big window, and TCP sender autotuning
might allow a large amount of bytes in write queue, but this has little
performance impact if a large part of this buffering is wasted :

Write queue needs to be large only to deal with large BDP, not
necessarily to cope with scheduling delays (incoming ACKS make room
for the application to queue more bytes)

For most workloads, using a value of 128 KB or less is OK to give
applications enough time to react to POLLOUT events in time
(or being awaken in a blocking sendmsg())

This patch adds two ways to set the limit :

1) Per socket option TCP_NOTSENT_LOWAT

2) A sysctl (/proc/sys/net/ipv4/tcp_notsent_lowat) for sockets
not using TCP_NOTSENT_LOWAT socket option (or setting a zero value)
Default value being UINT_MAX (0xFFFFFFFF), meaning this has no effect.

This changes poll()/select()/epoll() to report POLLOUT
only if number of unsent bytes is below tp->nosent_lowat

Note this might increase number of sendmsg()/sendfile() calls
when using non blocking sockets,
and increase number of context switches for blocking sockets.

Note this is not related to SO_SNDLOWAT (as SO_SNDLOWAT is
defined as :
 Specify the minimum number of bytes in the buffer until
 the socket layer will pass the data to the protocol)

Tested:

netperf sessions, and watching /proc/net/protocols "memory" column for TCP

With 200 concurrent netperf -t TCP_STREAM sessions, amount of kernel memory
used by TCP buffers shrinks by ~55 % (20567 pages instead of 45458)

lpq83:~# echo -1 >/proc/sys/net/ipv4/tcp_notsent_lowat
lpq83:~# (super_netperf 200 -t TCP_STREAM -H remote -l 90 &); sleep 60 ; grep TCP /proc/net/protocols
TCPv6     1880      2   45458   no     208   yes  ipv6        y  y  y  y  y  y  y  y  y  y  y  y  y  n  y  y  y  y  y
TCP       1696    508   45458   no     208   yes  kernel      y  y  y  y  y  y  y  y  y  y  y  y  y  n  y  y  y  y  y

lpq83:~# echo 131072 >/proc/sys/net/ipv4/tcp_notsent_lowat
lpq83:~# (super_netperf 200 -t TCP_STREAM -H remote -l 90 &); sleep 60 ; grep TCP /proc/net/protocols
TCPv6     1880      2   20567   no     208   yes  ipv6        y  y  y  y  y  y  y  y  y  y  y  y  y  n  y  y  y  y  y
TCP       1696    508   20567   no     208   yes  kernel      y  y  y  y  y  y  y  y  y  y  y  y  y  n  y  y  y  y  y

Using 128KB has no bad effect on the throughput or cpu usage
of a single flow, although there is an increase of context switches.

A bonus is that we hold socket lock for a shorter amount
of time and should improve latencies of ACK processing.

lpq83:~# echo -1 >/proc/sys/net/ipv4/tcp_notsent_lowat
lpq83:~# perf stat -e context-switches ./netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3
OMNI Send TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : +/-2.500% @ 99% conf.
Local       Remote      Local  Elapsed Throughput Throughput  Local Local  Remote Remote Local   Remote  Service
Send Socket Recv Socket Send   Time               Units       CPU   CPU    CPU    CPU    Service Service Demand
Size        Size        Size   (sec)                          Util  Util   Util   Util   Demand  Demand  Units
Final       Final                                             %     Method %      Method
1651584     6291456     16384  20.00   17447.90   10^6bits/s  3.13  S      -1.00  U      0.353   -1.000  usec/KB

 Performance counter stats for './netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3':

           412,514 context-switches

     200.034645535 seconds time elapsed

lpq83:~# echo 131072 >/proc/sys/net/ipv4/tcp_notsent_lowat
lpq83:~# perf stat -e context-switches ./netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3
OMNI Send TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : +/-2.500% @ 99% conf.
Local       Remote      Local  Elapsed Throughput Throughput  Local Local  Remote Remote Local   Remote  Service
Send Socket Recv Socket Send   Time               Units       CPU   CPU    CPU    CPU    Service Service Demand
Size        Size        Size   (sec)                          Util  Util   Util   Util   Demand  Demand  Units
Final       Final                                             %     Method %      Method
1593240     6291456     16384  20.00   17321.16   10^6bits/s  3.35  S      -1.00  U      0.381   -1.000  usec/KB

 Performance counter stats for './netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3':

         2,675,818 context-switches

     200.029651391 seconds time elapsed

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-By: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 13 +++++++++++++
 include/linux/tcp.h                    |  1 +
 include/net/sock.h                     | 19 +++++++++++++------
 include/net/tcp.h                      | 14 ++++++++++++++
 include/uapi/linux/tcp.h               |  1 +
 net/ipv4/sysctl_net_ipv4.c             |  7 +++++++
 net/ipv4/tcp.c                         |  7 +++++++
 net/ipv4/tcp_ipv4.c                    |  1 +
 net/ipv4/tcp_output.c                  |  3 +++
 net/ipv6/tcp_ipv6.c                    |  1 +
 10 files changed, 61 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 10742902146f..53cea9bcb14c 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -516,6 +516,19 @@ tcp_wmem - vector of 3 INTEGERs: min, default, max
 	this value is ignored.
 	Default: between 64K and 4MB, depending on RAM size.
 
+tcp_notsent_lowat - UNSIGNED INTEGER
+	A TCP socket can control the amount of unsent bytes in its write queue,
+	thanks to TCP_NOTSENT_LOWAT socket option. poll()/select()/epoll()
+	reports POLLOUT events if the amount of unsent bytes is below a per
+	socket value, and if the write queue is not full. sendmsg() will
+	also not add new buffers if the limit is hit.
+
+	This global variable controls the amount of unsent data for
+	sockets not using TCP_NOTSENT_LOWAT. For these sockets, a change
+	to the global variable has immediate effect.
+
+	Default: UINT_MAX (0xFFFFFFFF)
+
 tcp_workaround_signed_windows - BOOLEAN
 	If set, assume no receipt of a window scaling option means the
 	remote TCP is broken and treats the window as a signed quantity.
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 472120b4fac5..9640803a17a7 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -238,6 +238,7 @@ struct tcp_sock {
 
  	u32	rcv_wnd;	/* Current receiver window		*/
 	u32	write_seq;	/* Tail(+1) of data held in tcp send buffer */
+	u32	notsent_lowat;	/* TCP_NOTSENT_LOWAT */
 	u32	pushed_seq;	/* Last pushed seq, required to talk to windows */
 	u32	lost_out;	/* Lost packets			*/
 	u32	sacked_out;	/* SACK'd packets			*/
diff --git a/include/net/sock.h b/include/net/sock.h
index d0b5fdee50a2..b9f2b095b1ab 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -746,11 +746,6 @@ static inline int sk_stream_wspace(const struct sock *sk)
 
 extern void sk_stream_write_space(struct sock *sk);
 
-static inline bool sk_stream_memory_free(const struct sock *sk)
-{
-	return sk->sk_wmem_queued < sk->sk_sndbuf;
-}
-
 /* OOB backlog add */
 static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
 {
@@ -950,6 +945,7 @@ struct proto {
 	unsigned int		inuse_idx;
 #endif
 
+	bool			(*stream_memory_free)(const struct sock *sk);
 	/* Memory pressure */
 	void			(*enter_memory_pressure)(struct sock *sk);
 	atomic_long_t		*memory_allocated;	/* Current allocated memory. */
@@ -1088,11 +1084,22 @@ static inline struct cg_proto *parent_cg_proto(struct proto *proto,
 }
 #endif
 
+static inline bool sk_stream_memory_free(const struct sock *sk)
+{
+	if (sk->sk_wmem_queued >= sk->sk_sndbuf)
+		return false;
+
+	return sk->sk_prot->stream_memory_free ?
+		sk->sk_prot->stream_memory_free(sk) : true;
+}
+
 static inline bool sk_stream_is_writeable(const struct sock *sk)
 {
-	return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk);
+	return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) &&
+	       sk_stream_memory_free(sk);
 }
 
+
 static inline bool sk_has_memory_pressure(const struct sock *sk)
 {
 	return sk->sk_prot->memory_pressure != NULL;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index c5868471abae..18fc999dae3c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -284,6 +284,7 @@ extern int sysctl_tcp_thin_dupack;
 extern int sysctl_tcp_early_retrans;
 extern int sysctl_tcp_limit_output_bytes;
 extern int sysctl_tcp_challenge_ack_limit;
+extern unsigned int sysctl_tcp_notsent_lowat;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
@@ -1539,6 +1540,19 @@ extern int tcp_gro_complete(struct sk_buff *skb);
 extern void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr,
 				__be32 daddr);
 
+static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp)
+{
+	return tp->notsent_lowat ?: sysctl_tcp_notsent_lowat;
+}
+
+static inline bool tcp_stream_memory_free(const struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	u32 notsent_bytes = tp->write_seq - tp->snd_nxt;
+
+	return notsent_bytes < tcp_notsent_lowat(tp);
+}
+
 #ifdef CONFIG_PROC_FS
 extern int tcp4_proc_init(void);
 extern void tcp4_proc_exit(void);
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 8d776ebc4829..377f1e59411d 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -111,6 +111,7 @@ enum {
 #define TCP_REPAIR_OPTIONS	22
 #define TCP_FASTOPEN		23	/* Enable FastOpen on listeners */
 #define TCP_TIMESTAMP		24
+#define TCP_NOTSENT_LOWAT	25	/* limit number of unsent bytes in write queue */
 
 struct tcp_repair_opt {
 	__u32	opt_code;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index b2c123c44d69..69ed203802da 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -554,6 +554,13 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &one,
 	},
+	{
+		.procname	= "tcp_notsent_lowat",
+		.data		= &sysctl_tcp_notsent_lowat,
+		.maxlen		= sizeof(sysctl_tcp_notsent_lowat),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 	{
 		.procname	= "tcp_rmem",
 		.data		= &sysctl_tcp_rmem,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5eca9060bb8e..c27e81392398 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2631,6 +2631,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		else
 			tp->tsoffset = val - tcp_time_stamp;
 		break;
+	case TCP_NOTSENT_LOWAT:
+		tp->notsent_lowat = val;
+		sk->sk_write_space(sk);
+		break;
 	default:
 		err = -ENOPROTOOPT;
 		break;
@@ -2847,6 +2851,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 	case TCP_TIMESTAMP:
 		val = tcp_time_stamp + tp->tsoffset;
 		break;
+	case TCP_NOTSENT_LOWAT:
+		val = tp->notsent_lowat;
+		break;
 	default:
 		return -ENOPROTOOPT;
 	}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2e3f129df0eb..2a5d5c469d17 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2800,6 +2800,7 @@ struct proto tcp_prot = {
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,
 	.enter_memory_pressure	= tcp_enter_memory_pressure,
+	.stream_memory_free	= tcp_stream_memory_free,
 	.sockets_allocated	= &tcp_sockets_allocated,
 	.orphan_count		= &tcp_orphan_count,
 	.memory_allocated	= &tcp_memory_allocated,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 92fde8d1aa82..884efff5b531 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -65,6 +65,9 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
 /* By default, RFC2861 behavior.  */
 int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
 
+unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX;
+EXPORT_SYMBOL(sysctl_tcp_notsent_lowat);
+
 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			   int push_one, gfp_t gfp);
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 80fe69ef2188..b792e870686b 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1924,6 +1924,7 @@ struct proto tcpv6_prot = {
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,
 	.enter_memory_pressure	= tcp_enter_memory_pressure,
+	.stream_memory_free	= tcp_stream_memory_free,
 	.sockets_allocated	= &tcp_sockets_allocated,
 	.memory_allocated	= &tcp_memory_allocated,
 	.memory_pressure	= &tcp_memory_pressure,
-- 
cgit v1.2.3


From 18afa4b028b46f8b45ca64f94aefe717c297b07d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 23 Jul 2013 16:13:17 +0200
Subject: net: Make devnet_rename_seq static

No users outside net/core/dev.c.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 3 ---
 net/core/dev.c            | 2 +-
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2bb2357d83bb..3ca60b070ef0 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1666,9 +1666,6 @@ extern int call_netdevice_notifiers(unsigned long val, struct net_device *dev);
 
 extern rwlock_t				dev_base_lock;		/* Device list lock */
 
-extern seqcount_t	devnet_rename_seq;	/* Device rename seq */
-
-
 #define for_each_netdev(net, d)		\
 		list_for_each_entry(d, &(net)->dev_base_head, dev_list)
 #define for_each_netdev_reverse(net, d)	\
diff --git a/net/core/dev.c b/net/core/dev.c
index 26755dd40daa..dfd9f5d56ae0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -174,7 +174,7 @@ static DEFINE_SPINLOCK(napi_hash_lock);
 static unsigned int napi_gen_id;
 static DEFINE_HASHTABLE(napi_hash, 8);
 
-seqcount_t devnet_rename_seq;
+static seqcount_t devnet_rename_seq;
 
 static inline void dev_base_seq_inc(struct net *net)
 {
-- 
cgit v1.2.3


From 637b4caeedde9b926de6e66d68d0951b0f0c83ef Mon Sep 17 00:00:00 2001
From: Marcel Holtmann <marcel@holtmann.org>
Date: Mon, 1 Jul 2013 14:14:46 -0700
Subject: Bluetooth: Fix simple whitespace vs tab style issue

Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
---
 net/bluetooth/hci_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index e3a349977595..b821b199b333 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -605,7 +605,7 @@ static void hci_init3_req(struct hci_request *req, unsigned long opt)
 	 * as supported send it. If not supported assume that the controller
 	 * does not have actual support for stored link keys which makes this
 	 * command redundant anyway.
-         */
+	 */
 	if (hdev->commands[6] & 0x80) {
 		struct hci_cp_delete_stored_link_key cp;
 
-- 
cgit v1.2.3


From acabae96df2dff253f831e94c33ef9f0f15600d0 Mon Sep 17 00:00:00 2001
From: Mikel Astiz <mikel.astiz@bmw-carit.de>
Date: Fri, 28 Jun 2013 10:56:28 +0200
Subject: Bluetooth: Use defines in in hci_get_auth_req()

Make the code in hci_get_auth_req() more readable by using the
defined macros instead of inlining magic numbers.

Signed-off-by: Mikel Astiz <mikel.astiz@bmw-carit.de>
Signed-off-by: Timo Mueller <timo.mueller@bmw-carit.de>
Signed-off-by: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
---
 net/bluetooth/hci_event.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 0437200d92f4..ce8be0b0e9f0 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -3024,17 +3024,20 @@ unlock:
 static u8 hci_get_auth_req(struct hci_conn *conn)
 {
 	/* If remote requests dedicated bonding follow that lead */
-	if (conn->remote_auth == 0x02 || conn->remote_auth == 0x03) {
+	if (conn->remote_auth == HCI_AT_DEDICATED_BONDING ||
+	    conn->remote_auth == HCI_AT_DEDICATED_BONDING_MITM) {
 		/* If both remote and local IO capabilities allow MITM
 		 * protection then require it, otherwise don't */
-		if (conn->remote_cap == 0x03 || conn->io_capability == 0x03)
-			return 0x02;
+		if (conn->remote_cap == HCI_IO_NO_INPUT_OUTPUT ||
+		    conn->io_capability == HCI_IO_NO_INPUT_OUTPUT)
+			return HCI_AT_DEDICATED_BONDING;
 		else
-			return 0x03;
+			return HCI_AT_DEDICATED_BONDING_MITM;
 	}
 
 	/* If remote requests no-bonding follow that lead */
-	if (conn->remote_auth == 0x00 || conn->remote_auth == 0x01)
+	if (conn->remote_auth == HCI_AT_NO_BONDING ||
+	    conn->remote_auth == HCI_AT_NO_BONDING_MITM)
 		return conn->remote_auth | (conn->auth_type & 0x01);
 
 	return conn->auth_type;
-- 
cgit v1.2.3


From a767631ad1c2e785f4a8fcad26bcf50eb5786373 Mon Sep 17 00:00:00 2001
From: Mikel Astiz <mikel.astiz@bmw-carit.de>
Date: Fri, 28 Jun 2013 10:56:29 +0200
Subject: Bluetooth: Use defines instead of integer literals

Replace the occurrences of integer literals in hci_event.c with the
newly introduced macros in hci.h.

Signed-off-by: Mikel Astiz <mikel.astiz@bmw-carit.de>
Signed-off-by: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
---
 net/bluetooth/hci_event.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index ce8be0b0e9f0..50e39f4ad429 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -3069,7 +3069,7 @@ static void hci_io_capa_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
 		/* Change the IO capability from KeyboardDisplay
 		 * to DisplayYesNo as it is not supported by BT spec. */
 		cp.capability = (conn->io_capability == 0x04) ?
-						0x01 : conn->io_capability;
+				HCI_IO_DISPLAY_YESNO : conn->io_capability;
 		conn->auth_type = hci_get_auth_req(conn);
 		cp.authentication = conn->auth_type;
 
@@ -3143,7 +3143,8 @@ static void hci_user_confirm_request_evt(struct hci_dev *hdev,
 	 * request. The only exception is when we're dedicated bonding
 	 * initiators (connect_cfm_cb set) since then we always have the MITM
 	 * bit set. */
-	if (!conn->connect_cfm_cb && loc_mitm && conn->remote_cap == 0x03) {
+	if (!conn->connect_cfm_cb && loc_mitm &&
+	    conn->remote_cap == HCI_IO_NO_INPUT_OUTPUT) {
 		BT_DBG("Rejecting request: remote device can't provide MITM");
 		hci_send_cmd(hdev, HCI_OP_USER_CONFIRM_NEG_REPLY,
 			     sizeof(ev->bdaddr), &ev->bdaddr);
@@ -3151,8 +3152,8 @@ static void hci_user_confirm_request_evt(struct hci_dev *hdev,
 	}
 
 	/* If no side requires MITM protection; auto-accept */
-	if ((!loc_mitm || conn->remote_cap == 0x03) &&
-	    (!rem_mitm || conn->io_capability == 0x03)) {
+	if ((!loc_mitm || conn->remote_cap == HCI_IO_NO_INPUT_OUTPUT) &&
+	    (!rem_mitm || conn->io_capability == HCI_IO_NO_INPUT_OUTPUT)) {
 
 		/* If we're not the initiators request authorization to
 		 * proceed from user space (mgmt_user_confirm with
-- 
cgit v1.2.3


From 1c244f79c0d6abf8634faedb9b042122481c3572 Mon Sep 17 00:00:00 2001
From: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
Date: Fri, 7 Dec 2012 03:29:10 -0200
Subject: Bluetooth: Add missing braces to an "else if"

Trivial change in the coding style.

Signed-off-by: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
---
 net/bluetooth/l2cap_core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 8c3499bec893..b3bb7bca8e60 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -1415,8 +1415,9 @@ static void l2cap_conn_ready(struct l2cap_conn *conn)
 			sk->sk_state_change(sk);
 			release_sock(sk);
 
-		} else if (chan->state == BT_CONNECT)
+		} else if (chan->state == BT_CONNECT) {
 			l2cap_do_start(chan);
+		}
 
 		l2cap_chan_unlock(chan);
 	}
-- 
cgit v1.2.3


From 2583d706a13d0dc7fa591d5bb036454d0ddbf5b0 Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Date: Thu, 11 Jul 2013 15:41:29 +0200
Subject: Bluetooth: hidp: implement hidinput_input_event callback

We can re-enable hidinput_input_event to allow the leds of bluetooth
keyboards to be set.
Now the callbacks uses hid core to retrieve the right HID report to
send, so this version is safer.

Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Reviewed-by: David Herrmann <dh.herrmann@gmail.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
---
 net/bluetooth/hidp/core.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'net')

diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 0c699cdc3696..2977bf7e4b8e 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -238,6 +238,31 @@ static int hidp_send_report(struct hidp_session *session, struct hid_report *rep
 	return hidp_send_intr_message(session, hdr, buf, rsize);
 }
 
+static int hidp_hidinput_event(struct input_dev *dev, unsigned int type,
+			       unsigned int code, int value)
+{
+	struct hid_device *hid = input_get_drvdata(dev);
+	struct hidp_session *session = hid->driver_data;
+	struct hid_field *field;
+	int offset;
+
+	BT_DBG("session %p type %d code %d value %d",
+	       session, type, code, value);
+
+	if (type != EV_LED)
+		return -1;
+
+	offset = hidinput_find_field(hid, type, code, &field);
+	if (offset == -1) {
+		hid_warn(dev, "event field not found\n");
+		return -1;
+	}
+
+	hid_set_field(field, offset, value);
+
+	return hidp_send_report(session, field->report);
+}
+
 static int hidp_get_raw_report(struct hid_device *hid,
 		unsigned char report_number,
 		unsigned char *data, size_t count,
@@ -711,6 +736,7 @@ static struct hid_ll_driver hidp_hid_driver = {
 	.stop = hidp_stop,
 	.open  = hidp_open,
 	.close = hidp_close,
+	.hidinput_input_event = hidp_hidinput_event,
 };
 
 /* This function sets up the hid device. It does not add it
-- 
cgit v1.2.3


From 159d865f2078ffa4441abb0155f725368371f836 Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Date: Thu, 11 Jul 2013 15:41:30 +0200
Subject: Bluetooth: hidp: remove wrong send_report at init

The USB hid implementation does retrieve the reports during the start.
However, this implementation does not call the HID command GET_REPORT
(which would fetch the current status of each report), but use the
DATA command, which is an Output Report (so transmitting data from the
host to the device).
The Wiimote controller is already guarded against this problem in the
protocol, but it is not conformant to the specification to set all the
reports to 0 on start.

Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Reviewed-by: David Herrmann <dh.herrmann@gmail.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
---
 net/bluetooth/hidp/core.c | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 2977bf7e4b8e..13863de433a4 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -703,20 +703,6 @@ static int hidp_parse(struct hid_device *hid)
 
 static int hidp_start(struct hid_device *hid)
 {
-	struct hidp_session *session = hid->driver_data;
-	struct hid_report *report;
-
-	if (hid->quirks & HID_QUIRK_NO_INIT_REPORTS)
-		return 0;
-
-	list_for_each_entry(report, &hid->report_enum[HID_INPUT_REPORT].
-			report_list, list)
-		hidp_send_report(session, report);
-
-	list_for_each_entry(report, &hid->report_enum[HID_FEATURE_REPORT].
-			report_list, list)
-		hidp_send_report(session, report);
-
 	return 0;
 }
 
-- 
cgit v1.2.3


From 93d8bf9fb8f39d6d3e461db60f883d9f81006159 Mon Sep 17 00:00:00 2001
From: stephen hemminger <stephen@networkplumber.org>
Date: Wed, 24 Jul 2013 11:51:41 -0700
Subject: bridge: cleanup netpoll code

This started out with fixing a sparse warning, then I realized that
the wrapper function br_netpoll_info could just be collapsed away
by rolling it into the enable code.

Also, eliminate unnecessary goto's

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Reviewed-by: Jiri Pirko <jiri@resnulli.us>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_device.c  | 12 ++++++------
 net/bridge/br_if.c      |  3 ++-
 net/bridge/br_private.h | 10 ----------
 3 files changed, 8 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 2ef66781fedb..50d86b3e3941 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -244,22 +244,22 @@ fail:
 int br_netpoll_enable(struct net_bridge_port *p, gfp_t gfp)
 {
 	struct netpoll *np;
-	int err = 0;
+	int err;
+
+	if (!p->br->dev->npinfo)
+		return 0;
 
 	np = kzalloc(sizeof(*p->np), gfp);
-	err = -ENOMEM;
 	if (!np)
-		goto out;
+		return -ENOMEM;
 
 	err = __netpoll_setup(np, p->dev, gfp);
 	if (err) {
 		kfree(np);
-		goto out;
+		return err;
 	}
 
 	p->np = np;
-
-out:
 	return err;
 }
 
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 5623be6b9ecd..aa6c9a8ba32a 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -363,7 +363,8 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 	if (err)
 		goto err2;
 
-	if (br_netpoll_info(br) && ((err = br_netpoll_enable(p, GFP_KERNEL))))
+	err = br_netpoll_enable(p, GFP_KERNEL);
+	if (err)
 		goto err3;
 
 	err = netdev_master_upper_dev_link(dev, br->dev);
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 3be89b3ce17b..43347f129943 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -333,11 +333,6 @@ extern void br_dev_delete(struct net_device *dev, struct list_head *list);
 extern netdev_tx_t br_dev_xmit(struct sk_buff *skb,
 			       struct net_device *dev);
 #ifdef CONFIG_NET_POLL_CONTROLLER
-static inline struct netpoll_info *br_netpoll_info(struct net_bridge *br)
-{
-	return br->dev->npinfo;
-}
-
 static inline void br_netpoll_send_skb(const struct net_bridge_port *p,
 				       struct sk_buff *skb)
 {
@@ -350,11 +345,6 @@ static inline void br_netpoll_send_skb(const struct net_bridge_port *p,
 extern int br_netpoll_enable(struct net_bridge_port *p, gfp_t gfp);
 extern void br_netpoll_disable(struct net_bridge_port *p);
 #else
-static inline struct netpoll_info *br_netpoll_info(struct net_bridge *br)
-{
-	return NULL;
-}
-
 static inline void br_netpoll_send_skb(const struct net_bridge_port *p,
 				       struct sk_buff *skb)
 {
-- 
cgit v1.2.3


From 024ec3deac33ddbd81f3c887506f132b24ea21a7 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@wand.net.nz>
Date: Thu, 25 Jul 2013 10:52:05 +0900
Subject: net/sctp: Refactor SCTP skb checksum computation

This patch consolidates the SCTP checksum calculation code from various
places to a single new function, sctp_compute_cksum(skb, offset).

Signed-off-by: Joe Stringer <joe@wand.net.nz>
Reviewed-by: Julian Anastasov <ja@ssi.bg>
Acked-by: Simon Horman <horms@verge.net.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/checksum.h           | 15 +++++++++++++++
 net/netfilter/ipvs/ip_vs_proto_sctp.c | 23 +++--------------------
 net/netfilter/nf_nat_proto_sctp.c     |  8 +-------
 net/sctp/input.c                      | 10 +---------
 4 files changed, 20 insertions(+), 36 deletions(-)

(limited to 'net')

diff --git a/include/net/sctp/checksum.h b/include/net/sctp/checksum.h
index 78b88aa8c810..483e6303458d 100644
--- a/include/net/sctp/checksum.h
+++ b/include/net/sctp/checksum.h
@@ -85,4 +85,19 @@ static inline __le32 sctp_end_cksum(__u32 crc32)
 	return cpu_to_le32(~crc32);
 }
 
+/* Calculate the CRC32C checksum of an SCTP packet.  */
+static inline __le32 sctp_compute_cksum(const struct sk_buff *skb,
+					unsigned int offset)
+{
+	const struct sk_buff *iter;
+
+	__u32 crc32 = sctp_start_cksum(skb->data + offset,
+				       skb_headlen(skb) - offset);
+	skb_walk_frags(skb, iter)
+		crc32 = sctp_update_cksum((__u8 *) iter->data,
+					  skb_headlen(iter), crc32);
+
+	return sctp_end_cksum(crc32);
+}
+
 #endif /* __sctp_checksum_h__ */
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 3c0da8728036..23e596e438b3 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -66,15 +66,7 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 static void sctp_nat_csum(struct sk_buff *skb, sctp_sctphdr_t *sctph,
 			  unsigned int sctphoff)
 {
-	__u32 crc32;
-	struct sk_buff *iter;
-
-	crc32 = sctp_start_cksum((__u8 *)sctph, skb_headlen(skb) - sctphoff);
-	skb_walk_frags(skb, iter)
-		crc32 = sctp_update_cksum((u8 *) iter->data,
-					  skb_headlen(iter), crc32);
-	sctph->checksum = sctp_end_cksum(crc32);
-
+	sctph->checksum = sctp_compute_cksum(skb, sctphoff);
 	skb->ip_summed = CHECKSUM_UNNECESSARY;
 }
 
@@ -151,10 +143,7 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
 {
 	unsigned int sctphoff;
 	struct sctphdr *sh, _sctph;
-	struct sk_buff *iter;
-	__le32 cmp;
-	__le32 val;
-	__u32 tmp;
+	__le32 cmp, val;
 
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6)
@@ -168,13 +157,7 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
 		return 0;
 
 	cmp = sh->checksum;
-
-	tmp = sctp_start_cksum((__u8 *) sh, skb_headlen(skb));
-	skb_walk_frags(skb, iter)
-		tmp = sctp_update_cksum((__u8 *) iter->data,
-					skb_headlen(iter), tmp);
-
-	val = sctp_end_cksum(tmp);
+	val = sctp_compute_cksum(skb, sctphoff);
 
 	if (val != cmp) {
 		/* CRC failure, dump it. */
diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c
index 396e55d46f90..754536f2c674 100644
--- a/net/netfilter/nf_nat_proto_sctp.c
+++ b/net/netfilter/nf_nat_proto_sctp.c
@@ -34,9 +34,7 @@ sctp_manip_pkt(struct sk_buff *skb,
 	       const struct nf_conntrack_tuple *tuple,
 	       enum nf_nat_manip_type maniptype)
 {
-	struct sk_buff *frag;
 	sctp_sctphdr_t *hdr;
-	__u32 crc32;
 
 	if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
 		return false;
@@ -51,11 +49,7 @@ sctp_manip_pkt(struct sk_buff *skb,
 		hdr->dest = tuple->dst.u.sctp.port;
 	}
 
-	crc32 = sctp_start_cksum((u8 *)hdr, skb_headlen(skb) - hdroff);
-	skb_walk_frags(skb, frag)
-		crc32 = sctp_update_cksum((u8 *)frag->data, skb_headlen(frag),
-					  crc32);
-	hdr->checksum = sctp_end_cksum(crc32);
+	hdr->checksum = sctp_compute_cksum(skb, hdroff);
 
 	return true;
 }
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 7993495a4c0f..fa91aff02388 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -87,15 +87,7 @@ static inline int sctp_rcv_checksum(struct net *net, struct sk_buff *skb)
 {
 	struct sctphdr *sh = sctp_hdr(skb);
 	__le32 cmp = sh->checksum;
-	struct sk_buff *list;
-	__le32 val;
-	__u32 tmp = sctp_start_cksum((__u8 *)sh, skb_headlen(skb));
-
-	skb_walk_frags(skb, list)
-		tmp = sctp_update_cksum((__u8 *)list->data, skb_headlen(list),
-					tmp);
-
-	val = sctp_end_cksum(tmp);
+	__le32 val = sctp_compute_cksum(skb, 0);
 
 	if (val != cmp) {
 		/* CRC failure, dump it. */
-- 
cgit v1.2.3


From 82a54d0ebbee03a8dcf4e1e4016a53fed4d6c933 Mon Sep 17 00:00:00 2001
From: Asias He <asias@redhat.com>
Date: Thu, 25 Jul 2013 17:39:34 +0800
Subject: VSOCK: Move af_vsock.h and vsock_addr.h to include/net

This is useful for other VSOCK transport implemented outside the
net/vmw_vsock/ directory to use these headers.

Signed-off-by: Asias He <asias@redhat.com>
Acked-by: Andy King <acking@vmware.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/af_vsock.h         | 175 +++++++++++++++++++++++++++++++++++++++++
 include/net/vsock_addr.h       |  30 +++++++
 net/vmw_vsock/af_vsock.c       |   3 +-
 net/vmw_vsock/af_vsock.h       | 175 -----------------------------------------
 net/vmw_vsock/vmci_transport.c |   2 +-
 net/vmw_vsock/vmci_transport.h |   4 +-
 net/vmw_vsock/vsock_addr.c     |   3 +-
 net/vmw_vsock/vsock_addr.h     |  30 -------
 8 files changed, 210 insertions(+), 212 deletions(-)
 create mode 100644 include/net/af_vsock.h
 create mode 100644 include/net/vsock_addr.h
 delete mode 100644 net/vmw_vsock/af_vsock.h
 delete mode 100644 net/vmw_vsock/vsock_addr.h

(limited to 'net')

diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
new file mode 100644
index 000000000000..7d64d3609ec9
--- /dev/null
+++ b/include/net/af_vsock.h
@@ -0,0 +1,175 @@
+/*
+ * VMware vSockets Driver
+ *
+ * Copyright (C) 2007-2013 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation version 2 and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef __AF_VSOCK_H__
+#define __AF_VSOCK_H__
+
+#include <linux/kernel.h>
+#include <linux/workqueue.h>
+#include <linux/vm_sockets.h>
+
+#include "vsock_addr.h"
+
+#define LAST_RESERVED_PORT 1023
+
+#define vsock_sk(__sk)    ((struct vsock_sock *)__sk)
+#define sk_vsock(__vsk)   (&(__vsk)->sk)
+
+struct vsock_sock {
+	/* sk must be the first member. */
+	struct sock sk;
+	struct sockaddr_vm local_addr;
+	struct sockaddr_vm remote_addr;
+	/* Links for the global tables of bound and connected sockets. */
+	struct list_head bound_table;
+	struct list_head connected_table;
+	/* Accessed without the socket lock held. This means it can never be
+	 * modified outsided of socket create or destruct.
+	 */
+	bool trusted;
+	bool cached_peer_allow_dgram;	/* Dgram communication allowed to
+					 * cached peer?
+					 */
+	u32 cached_peer;  /* Context ID of last dgram destination check. */
+	const struct cred *owner;
+	/* Rest are SOCK_STREAM only. */
+	long connect_timeout;
+	/* Listening socket that this came from. */
+	struct sock *listener;
+	/* Used for pending list and accept queue during connection handshake.
+	 * The listening socket is the head for both lists.  Sockets created
+	 * for connection requests are placed in the pending list until they
+	 * are connected, at which point they are put in the accept queue list
+	 * so they can be accepted in accept().  If accept() cannot accept the
+	 * connection, it is marked as rejected so the cleanup function knows
+	 * to clean up the socket.
+	 */
+	struct list_head pending_links;
+	struct list_head accept_queue;
+	bool rejected;
+	struct delayed_work dwork;
+	u32 peer_shutdown;
+	bool sent_request;
+	bool ignore_connecting_rst;
+
+	/* Private to transport. */
+	void *trans;
+};
+
+s64 vsock_stream_has_data(struct vsock_sock *vsk);
+s64 vsock_stream_has_space(struct vsock_sock *vsk);
+void vsock_pending_work(struct work_struct *work);
+struct sock *__vsock_create(struct net *net,
+			    struct socket *sock,
+			    struct sock *parent,
+			    gfp_t priority, unsigned short type);
+
+/**** TRANSPORT ****/
+
+struct vsock_transport_recv_notify_data {
+	u64 data1; /* Transport-defined. */
+	u64 data2; /* Transport-defined. */
+	bool notify_on_block;
+};
+
+struct vsock_transport_send_notify_data {
+	u64 data1; /* Transport-defined. */
+	u64 data2; /* Transport-defined. */
+};
+
+struct vsock_transport {
+	/* Initialize/tear-down socket. */
+	int (*init)(struct vsock_sock *, struct vsock_sock *);
+	void (*destruct)(struct vsock_sock *);
+	void (*release)(struct vsock_sock *);
+
+	/* Connections. */
+	int (*connect)(struct vsock_sock *);
+
+	/* DGRAM. */
+	int (*dgram_bind)(struct vsock_sock *, struct sockaddr_vm *);
+	int (*dgram_dequeue)(struct kiocb *kiocb, struct vsock_sock *vsk,
+			     struct msghdr *msg, size_t len, int flags);
+	int (*dgram_enqueue)(struct vsock_sock *, struct sockaddr_vm *,
+			     struct iovec *, size_t len);
+	bool (*dgram_allow)(u32 cid, u32 port);
+
+	/* STREAM. */
+	/* TODO: stream_bind() */
+	ssize_t (*stream_dequeue)(struct vsock_sock *, struct iovec *,
+				  size_t len, int flags);
+	ssize_t (*stream_enqueue)(struct vsock_sock *, struct iovec *,
+				  size_t len);
+	s64 (*stream_has_data)(struct vsock_sock *);
+	s64 (*stream_has_space)(struct vsock_sock *);
+	u64 (*stream_rcvhiwat)(struct vsock_sock *);
+	bool (*stream_is_active)(struct vsock_sock *);
+	bool (*stream_allow)(u32 cid, u32 port);
+
+	/* Notification. */
+	int (*notify_poll_in)(struct vsock_sock *, size_t, bool *);
+	int (*notify_poll_out)(struct vsock_sock *, size_t, bool *);
+	int (*notify_recv_init)(struct vsock_sock *, size_t,
+		struct vsock_transport_recv_notify_data *);
+	int (*notify_recv_pre_block)(struct vsock_sock *, size_t,
+		struct vsock_transport_recv_notify_data *);
+	int (*notify_recv_pre_dequeue)(struct vsock_sock *, size_t,
+		struct vsock_transport_recv_notify_data *);
+	int (*notify_recv_post_dequeue)(struct vsock_sock *, size_t,
+		ssize_t, bool, struct vsock_transport_recv_notify_data *);
+	int (*notify_send_init)(struct vsock_sock *,
+		struct vsock_transport_send_notify_data *);
+	int (*notify_send_pre_block)(struct vsock_sock *,
+		struct vsock_transport_send_notify_data *);
+	int (*notify_send_pre_enqueue)(struct vsock_sock *,
+		struct vsock_transport_send_notify_data *);
+	int (*notify_send_post_enqueue)(struct vsock_sock *, ssize_t,
+		struct vsock_transport_send_notify_data *);
+
+	/* Shutdown. */
+	int (*shutdown)(struct vsock_sock *, int);
+
+	/* Buffer sizes. */
+	void (*set_buffer_size)(struct vsock_sock *, u64);
+	void (*set_min_buffer_size)(struct vsock_sock *, u64);
+	void (*set_max_buffer_size)(struct vsock_sock *, u64);
+	u64 (*get_buffer_size)(struct vsock_sock *);
+	u64 (*get_min_buffer_size)(struct vsock_sock *);
+	u64 (*get_max_buffer_size)(struct vsock_sock *);
+
+	/* Addressing. */
+	u32 (*get_local_cid)(void);
+};
+
+/**** CORE ****/
+
+int vsock_core_init(const struct vsock_transport *t);
+void vsock_core_exit(void);
+
+/**** UTILS ****/
+
+void vsock_release_pending(struct sock *pending);
+void vsock_add_pending(struct sock *listener, struct sock *pending);
+void vsock_remove_pending(struct sock *listener, struct sock *pending);
+void vsock_enqueue_accept(struct sock *listener, struct sock *connected);
+void vsock_insert_connected(struct vsock_sock *vsk);
+void vsock_remove_bound(struct vsock_sock *vsk);
+void vsock_remove_connected(struct vsock_sock *vsk);
+struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr);
+struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
+					 struct sockaddr_vm *dst);
+void vsock_for_each_connected_socket(void (*fn)(struct sock *sk));
+
+#endif /* __AF_VSOCK_H__ */
diff --git a/include/net/vsock_addr.h b/include/net/vsock_addr.h
new file mode 100644
index 000000000000..9ccd5316eac0
--- /dev/null
+++ b/include/net/vsock_addr.h
@@ -0,0 +1,30 @@
+/*
+ * VMware vSockets Driver
+ *
+ * Copyright (C) 2007-2013 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation version 2 and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _VSOCK_ADDR_H_
+#define _VSOCK_ADDR_H_
+
+#include <linux/vm_sockets.h>
+
+void vsock_addr_init(struct sockaddr_vm *addr, u32 cid, u32 port);
+int vsock_addr_validate(const struct sockaddr_vm *addr);
+bool vsock_addr_bound(const struct sockaddr_vm *addr);
+void vsock_addr_unbind(struct sockaddr_vm *addr);
+bool vsock_addr_equals_addr(const struct sockaddr_vm *addr,
+			    const struct sockaddr_vm *other);
+int vsock_addr_cast(const struct sockaddr *addr, size_t len,
+		    struct sockaddr_vm **out_addr);
+
+#endif
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 593071dabd1c..57896eee1c25 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -96,8 +96,7 @@
 #include <linux/wait.h>
 #include <linux/workqueue.h>
 #include <net/sock.h>
-
-#include "af_vsock.h"
+#include <net/af_vsock.h>
 
 static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr);
 static void vsock_sk_destruct(struct sock *sk);
diff --git a/net/vmw_vsock/af_vsock.h b/net/vmw_vsock/af_vsock.h
deleted file mode 100644
index 7d64d3609ec9..000000000000
--- a/net/vmw_vsock/af_vsock.h
+++ /dev/null
@@ -1,175 +0,0 @@
-/*
- * VMware vSockets Driver
- *
- * Copyright (C) 2007-2013 VMware, Inc. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation version 2 and no later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- */
-
-#ifndef __AF_VSOCK_H__
-#define __AF_VSOCK_H__
-
-#include <linux/kernel.h>
-#include <linux/workqueue.h>
-#include <linux/vm_sockets.h>
-
-#include "vsock_addr.h"
-
-#define LAST_RESERVED_PORT 1023
-
-#define vsock_sk(__sk)    ((struct vsock_sock *)__sk)
-#define sk_vsock(__vsk)   (&(__vsk)->sk)
-
-struct vsock_sock {
-	/* sk must be the first member. */
-	struct sock sk;
-	struct sockaddr_vm local_addr;
-	struct sockaddr_vm remote_addr;
-	/* Links for the global tables of bound and connected sockets. */
-	struct list_head bound_table;
-	struct list_head connected_table;
-	/* Accessed without the socket lock held. This means it can never be
-	 * modified outsided of socket create or destruct.
-	 */
-	bool trusted;
-	bool cached_peer_allow_dgram;	/* Dgram communication allowed to
-					 * cached peer?
-					 */
-	u32 cached_peer;  /* Context ID of last dgram destination check. */
-	const struct cred *owner;
-	/* Rest are SOCK_STREAM only. */
-	long connect_timeout;
-	/* Listening socket that this came from. */
-	struct sock *listener;
-	/* Used for pending list and accept queue during connection handshake.
-	 * The listening socket is the head for both lists.  Sockets created
-	 * for connection requests are placed in the pending list until they
-	 * are connected, at which point they are put in the accept queue list
-	 * so they can be accepted in accept().  If accept() cannot accept the
-	 * connection, it is marked as rejected so the cleanup function knows
-	 * to clean up the socket.
-	 */
-	struct list_head pending_links;
-	struct list_head accept_queue;
-	bool rejected;
-	struct delayed_work dwork;
-	u32 peer_shutdown;
-	bool sent_request;
-	bool ignore_connecting_rst;
-
-	/* Private to transport. */
-	void *trans;
-};
-
-s64 vsock_stream_has_data(struct vsock_sock *vsk);
-s64 vsock_stream_has_space(struct vsock_sock *vsk);
-void vsock_pending_work(struct work_struct *work);
-struct sock *__vsock_create(struct net *net,
-			    struct socket *sock,
-			    struct sock *parent,
-			    gfp_t priority, unsigned short type);
-
-/**** TRANSPORT ****/
-
-struct vsock_transport_recv_notify_data {
-	u64 data1; /* Transport-defined. */
-	u64 data2; /* Transport-defined. */
-	bool notify_on_block;
-};
-
-struct vsock_transport_send_notify_data {
-	u64 data1; /* Transport-defined. */
-	u64 data2; /* Transport-defined. */
-};
-
-struct vsock_transport {
-	/* Initialize/tear-down socket. */
-	int (*init)(struct vsock_sock *, struct vsock_sock *);
-	void (*destruct)(struct vsock_sock *);
-	void (*release)(struct vsock_sock *);
-
-	/* Connections. */
-	int (*connect)(struct vsock_sock *);
-
-	/* DGRAM. */
-	int (*dgram_bind)(struct vsock_sock *, struct sockaddr_vm *);
-	int (*dgram_dequeue)(struct kiocb *kiocb, struct vsock_sock *vsk,
-			     struct msghdr *msg, size_t len, int flags);
-	int (*dgram_enqueue)(struct vsock_sock *, struct sockaddr_vm *,
-			     struct iovec *, size_t len);
-	bool (*dgram_allow)(u32 cid, u32 port);
-
-	/* STREAM. */
-	/* TODO: stream_bind() */
-	ssize_t (*stream_dequeue)(struct vsock_sock *, struct iovec *,
-				  size_t len, int flags);
-	ssize_t (*stream_enqueue)(struct vsock_sock *, struct iovec *,
-				  size_t len);
-	s64 (*stream_has_data)(struct vsock_sock *);
-	s64 (*stream_has_space)(struct vsock_sock *);
-	u64 (*stream_rcvhiwat)(struct vsock_sock *);
-	bool (*stream_is_active)(struct vsock_sock *);
-	bool (*stream_allow)(u32 cid, u32 port);
-
-	/* Notification. */
-	int (*notify_poll_in)(struct vsock_sock *, size_t, bool *);
-	int (*notify_poll_out)(struct vsock_sock *, size_t, bool *);
-	int (*notify_recv_init)(struct vsock_sock *, size_t,
-		struct vsock_transport_recv_notify_data *);
-	int (*notify_recv_pre_block)(struct vsock_sock *, size_t,
-		struct vsock_transport_recv_notify_data *);
-	int (*notify_recv_pre_dequeue)(struct vsock_sock *, size_t,
-		struct vsock_transport_recv_notify_data *);
-	int (*notify_recv_post_dequeue)(struct vsock_sock *, size_t,
-		ssize_t, bool, struct vsock_transport_recv_notify_data *);
-	int (*notify_send_init)(struct vsock_sock *,
-		struct vsock_transport_send_notify_data *);
-	int (*notify_send_pre_block)(struct vsock_sock *,
-		struct vsock_transport_send_notify_data *);
-	int (*notify_send_pre_enqueue)(struct vsock_sock *,
-		struct vsock_transport_send_notify_data *);
-	int (*notify_send_post_enqueue)(struct vsock_sock *, ssize_t,
-		struct vsock_transport_send_notify_data *);
-
-	/* Shutdown. */
-	int (*shutdown)(struct vsock_sock *, int);
-
-	/* Buffer sizes. */
-	void (*set_buffer_size)(struct vsock_sock *, u64);
-	void (*set_min_buffer_size)(struct vsock_sock *, u64);
-	void (*set_max_buffer_size)(struct vsock_sock *, u64);
-	u64 (*get_buffer_size)(struct vsock_sock *);
-	u64 (*get_min_buffer_size)(struct vsock_sock *);
-	u64 (*get_max_buffer_size)(struct vsock_sock *);
-
-	/* Addressing. */
-	u32 (*get_local_cid)(void);
-};
-
-/**** CORE ****/
-
-int vsock_core_init(const struct vsock_transport *t);
-void vsock_core_exit(void);
-
-/**** UTILS ****/
-
-void vsock_release_pending(struct sock *pending);
-void vsock_add_pending(struct sock *listener, struct sock *pending);
-void vsock_remove_pending(struct sock *listener, struct sock *pending);
-void vsock_enqueue_accept(struct sock *listener, struct sock *connected);
-void vsock_insert_connected(struct vsock_sock *vsk);
-void vsock_remove_bound(struct vsock_sock *vsk);
-void vsock_remove_connected(struct vsock_sock *vsk);
-struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr);
-struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
-					 struct sockaddr_vm *dst);
-void vsock_for_each_connected_socket(void (*fn)(struct sock *sk));
-
-#endif /* __AF_VSOCK_H__ */
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index ffc11df02af2..9d6986634e0b 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -34,8 +34,8 @@
 #include <linux/wait.h>
 #include <linux/workqueue.h>
 #include <net/sock.h>
+#include <net/af_vsock.h>
 
-#include "af_vsock.h"
 #include "vmci_transport_notify.h"
 
 static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg);
diff --git a/net/vmw_vsock/vmci_transport.h b/net/vmw_vsock/vmci_transport.h
index fd88ea8924e4..ce6c9623d5f0 100644
--- a/net/vmw_vsock/vmci_transport.h
+++ b/net/vmw_vsock/vmci_transport.h
@@ -19,8 +19,8 @@
 #include <linux/vmw_vmci_defs.h>
 #include <linux/vmw_vmci_api.h>
 
-#include "vsock_addr.h"
-#include "af_vsock.h"
+#include <net/vsock_addr.h>
+#include <net/af_vsock.h>
 
 /* If the packet format changes in a release then this should change too. */
 #define VMCI_TRANSPORT_PACKET_VERSION 1
diff --git a/net/vmw_vsock/vsock_addr.c b/net/vmw_vsock/vsock_addr.c
index ec2611b4ea0e..82486ee55eac 100644
--- a/net/vmw_vsock/vsock_addr.c
+++ b/net/vmw_vsock/vsock_addr.c
@@ -17,8 +17,7 @@
 #include <linux/socket.h>
 #include <linux/stddef.h>
 #include <net/sock.h>
-
-#include "vsock_addr.h"
+#include <net/vsock_addr.h>
 
 void vsock_addr_init(struct sockaddr_vm *addr, u32 cid, u32 port)
 {
diff --git a/net/vmw_vsock/vsock_addr.h b/net/vmw_vsock/vsock_addr.h
deleted file mode 100644
index 9ccd5316eac0..000000000000
--- a/net/vmw_vsock/vsock_addr.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * VMware vSockets Driver
- *
- * Copyright (C) 2007-2013 VMware, Inc. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation version 2 and no later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- */
-
-#ifndef _VSOCK_ADDR_H_
-#define _VSOCK_ADDR_H_
-
-#include <linux/vm_sockets.h>
-
-void vsock_addr_init(struct sockaddr_vm *addr, u32 cid, u32 port);
-int vsock_addr_validate(const struct sockaddr_vm *addr);
-bool vsock_addr_bound(const struct sockaddr_vm *addr);
-void vsock_addr_unbind(struct sockaddr_vm *addr);
-bool vsock_addr_equals_addr(const struct sockaddr_vm *addr,
-			    const struct sockaddr_vm *other);
-int vsock_addr_cast(const struct sockaddr *addr, size_t len,
-		    struct sockaddr_vm **out_addr);
-
-#endif
-- 
cgit v1.2.3


From c26bf4a51308c85a6f97628253b99767a84ff90a Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Thu, 25 Jul 2013 18:12:18 +0200
Subject: pktgen: Add UDPCSUM flag to support UDP checksums

UDP checksums are optional, hence pktgen has been omitting them in
favour of performance. The optional flag UDPCSUM enables UDP
checksumming. If the output device supports hardware checksumming
the skb is prepared and marked CHECKSUM_PARTIAL, otherwise the
checksum is generated in software.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Ben Greear <greearb@candelatech.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/udp.h |  1 +
 net/core/pktgen.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 net/ipv4/udp.c    |  3 ++-
 3 files changed, 55 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/include/net/udp.h b/include/net/udp.h
index 74c10ec5e74f..ef2e0b7843a0 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -183,6 +183,7 @@ extern int udp_sendmsg(struct kiocb *iocb, struct sock *sk,
 			    struct msghdr *msg, size_t len);
 extern int udp_push_pending_frames(struct sock *sk);
 extern void udp_flush_pending_frames(struct sock *sk);
+extern void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst);
 extern int udp_rcv(struct sk_buff *skb);
 extern int udp_ioctl(struct sock *sk, int cmd, unsigned long arg);
 extern int udp_disconnect(struct sock *sk, int flags);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 9640972ec50e..48cebf2c3e7a 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -160,6 +160,7 @@
 #include <net/net_namespace.h>
 #include <net/checksum.h>
 #include <net/ipv6.h>
+#include <net/udp.h>
 #include <net/addrconf.h>
 #ifdef CONFIG_XFRM
 #include <net/xfrm.h>
@@ -198,6 +199,7 @@
 #define F_QUEUE_MAP_RND (1<<13)	/* queue map Random */
 #define F_QUEUE_MAP_CPU (1<<14)	/* queue map mirrors smp_processor_id() */
 #define F_NODE          (1<<15)	/* Node memory alloc*/
+#define F_UDPCSUM       (1<<16)	/* Include UDP checksum */
 
 /* Thread control flag bits */
 #define T_STOP        (1<<0)	/* Stop run */
@@ -631,6 +633,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
 	if (pkt_dev->flags & F_UDPDST_RND)
 		seq_printf(seq, "UDPDST_RND  ");
 
+	if (pkt_dev->flags & F_UDPCSUM)
+		seq_printf(seq, "UDPCSUM  ");
+
 	if (pkt_dev->flags & F_MPLS_RND)
 		seq_printf(seq,  "MPLS_RND  ");
 
@@ -1228,6 +1233,12 @@ static ssize_t pktgen_if_write(struct file *file,
 		else if (strcmp(f, "!NODE_ALLOC") == 0)
 			pkt_dev->flags &= ~F_NODE;
 
+		else if (strcmp(f, "UDPCSUM") == 0)
+			pkt_dev->flags |= F_UDPCSUM;
+
+		else if (strcmp(f, "!UDPCSUM") == 0)
+			pkt_dev->flags &= ~F_UDPCSUM;
+
 		else {
 			sprintf(pg_result,
 				"Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s",
@@ -2733,7 +2744,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
 	udph->source = htons(pkt_dev->cur_udp_src);
 	udph->dest = htons(pkt_dev->cur_udp_dst);
 	udph->len = htons(datalen + 8);	/* DATA + udphdr */
-	udph->check = 0;	/* No checksum */
+	udph->check = 0;
 
 	iph->ihl = 5;
 	iph->version = 4;
@@ -2752,6 +2763,24 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
 	skb->protocol = protocol;
 	skb->dev = odev;
 	skb->pkt_type = PACKET_HOST;
+
+	if (!(pkt_dev->flags & F_UDPCSUM)) {
+		skb->ip_summed = CHECKSUM_NONE;
+	} else if (odev->features & NETIF_F_V4_CSUM) {
+		skb->ip_summed = CHECKSUM_PARTIAL;
+		skb->csum = 0;
+		udp4_hwcsum(skb, udph->source, udph->dest);
+	} else {
+		__wsum csum = udp_csum(skb);
+
+		/* add protocol-dependent pseudo-header */
+		udph->check = csum_tcpudp_magic(udph->source, udph->dest,
+						datalen + 8, IPPROTO_UDP, csum);
+
+		if (udph->check == 0)
+			udph->check = CSUM_MANGLED_0;
+	}
+
 	pktgen_finalize_skb(pkt_dev, skb, datalen);
 
 #ifdef CONFIG_XFRM
@@ -2768,7 +2797,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
 	struct sk_buff *skb = NULL;
 	__u8 *eth;
 	struct udphdr *udph;
-	int datalen;
+	int datalen, udplen;
 	struct ipv6hdr *iph;
 	__be16 protocol = htons(ETH_P_IPV6);
 	__be32 *mpls;
@@ -2844,10 +2873,11 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
 		net_info_ratelimited("increased datalen to %d\n", datalen);
 	}
 
+	udplen = datalen + sizeof(struct udphdr);
 	udph->source = htons(pkt_dev->cur_udp_src);
 	udph->dest = htons(pkt_dev->cur_udp_dst);
-	udph->len = htons(datalen + sizeof(struct udphdr));
-	udph->check = 0;	/* No checksum */
+	udph->len = htons(udplen);
+	udph->check = 0;
 
 	*(__be32 *) iph = htonl(0x60000000);	/* Version + flow */
 
@@ -2858,7 +2888,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
 
 	iph->hop_limit = 32;
 
-	iph->payload_len = htons(sizeof(struct udphdr) + datalen);
+	iph->payload_len = htons(udplen);
 	iph->nexthdr = IPPROTO_UDP;
 
 	iph->daddr = pkt_dev->cur_in6_daddr;
@@ -2868,6 +2898,23 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
 	skb->dev = odev;
 	skb->pkt_type = PACKET_HOST;
 
+	if (!(pkt_dev->flags & F_UDPCSUM)) {
+		skb->ip_summed = CHECKSUM_NONE;
+	} else if (odev->features & NETIF_F_V6_CSUM) {
+		skb->ip_summed = CHECKSUM_PARTIAL;
+		skb->csum_start = skb_transport_header(skb) - skb->head;
+		skb->csum_offset = offsetof(struct udphdr, check);
+		udph->check = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, udplen, IPPROTO_UDP, 0);
+	} else {
+		__wsum csum = udp_csum(skb);
+
+		/* add protocol-dependent pseudo-header */
+		udph->check = csum_ipv6_magic(&iph->saddr, &iph->daddr, udplen, IPPROTO_UDP, csum);
+
+		if (udph->check == 0)
+			udph->check = CSUM_MANGLED_0;
+	}
+
 	pktgen_finalize_skb(pkt_dev, skb, datalen);
 
 	return skb;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 766e6bab9113..9e88af0e8ab0 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -704,7 +704,7 @@ EXPORT_SYMBOL(udp_flush_pending_frames);
  *	@src:	source IP address
  *	@dst:	destination IP address
  */
-static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
+void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
 {
 	struct udphdr *uh = udp_hdr(skb);
 	struct sk_buff *frags = skb_shinfo(skb)->frag_list;
@@ -740,6 +740,7 @@ static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
 			uh->check = CSUM_MANGLED_0;
 	}
 }
+EXPORT_SYMBOL_GPL(udp4_hwcsum);
 
 static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
 {
-- 
cgit v1.2.3


From 03c633e73353d3c116cab6164933d69ee2180470 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Thu, 25 Jul 2013 14:08:04 +0200
Subject: pktgen: Use ip_send_check() to compute checksum

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 48cebf2c3e7a..2dec6f1b0cd7 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2758,8 +2758,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
 	iph->frag_off = 0;
 	iplen = 20 + 8 + datalen;
 	iph->tot_len = htons(iplen);
-	iph->check = 0;
-	iph->check = ip_fast_csum((void *)iph, iph->ihl);
+	ip_send_check(iph);
 	skb->protocol = protocol;
 	skb->dev = odev;
 	skb->pkt_type = PACKET_HOST;
-- 
cgit v1.2.3


From 9d4a0314642918cbda9ed4012df51e8df608fce6 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Fri, 26 Jul 2013 17:05:16 +0200
Subject: ipv4, ipv6: send igmpv3/mld packets with TC_PRIO_CONTROL

v2:
a) Also send ipv4 igmp messages with TC_PRIO_CONTROL

Cc: William Manley <william.manley@youview.com>
Cc: Lukas Tribus <luky-37@hotmail.com>
Acked-by: Benjamin LaHaise <bcrl@kvack.org>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/igmp.c  | 3 +++
 net/ipv6/mcast.c | 4 +++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 375aca372250..ef7618630f36 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -88,6 +88,7 @@
 #include <linux/if_arp.h>
 #include <linux/rtnetlink.h>
 #include <linux/times.h>
+#include <linux/pkt_sched.h>
 
 #include <net/net_namespace.h>
 #include <net/arp.h>
@@ -315,6 +316,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
 		if (size < 256)
 			return NULL;
 	}
+	skb->priority = TC_PRIO_CONTROL;
 	igmp_skb_size(skb) = size;
 
 	rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0,
@@ -670,6 +672,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
 		ip_rt_put(rt);
 		return -1;
 	}
+	skb->priority = TC_PRIO_CONTROL;
 
 	skb_dst_set(skb, &rt->dst);
 
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 99cd65c715cd..db25b8eb62bd 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -44,6 +44,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
+#include <linux/pkt_sched.h>
 #include <net/mld.h>
 
 #include <linux/netfilter.h>
@@ -1376,6 +1377,7 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, int size)
 	if (!skb)
 		return NULL;
 
+	skb->priority = TC_PRIO_CONTROL;
 	skb_reserve(skb, hlen);
 
 	if (__ipv6_get_lladdr(idev, &addr_buf, IFA_F_TENTATIVE)) {
@@ -1769,7 +1771,7 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
 		rcu_read_unlock();
 		return;
 	}
-
+	skb->priority = TC_PRIO_CONTROL;
 	skb_reserve(skb, hlen);
 
 	if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) {
-- 
cgit v1.2.3


From 73d94e9481a20817abe2f1b41ee441bb4f6461f7 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Mon, 29 Jul 2013 16:21:26 +1000
Subject: pktgen: add needed include file

Fixes this on PowerPC (at least):

net/core/pktgen.c: In function 'fill_packet_ipv6':
net/core/pktgen.c:2906:3: error: implicit declaration of function 'csum_ipv6_magic' [-Werror=implicit-function-declaration]
   udph->check = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, udplen, IPPROTO_UDP, 0);
   ^

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 2dec6f1b0cd7..261357a66300 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -161,6 +161,7 @@
 #include <net/checksum.h>
 #include <net/ipv6.h>
 #include <net/udp.h>
+#include <net/ip6_checksum.h>
 #include <net/addrconf.h>
 #ifdef CONFIG_XFRM
 #include <net/xfrm.h>
-- 
cgit v1.2.3


From 60ff779c4abba37a31bd8624ef45026f7fb1b70c Mon Sep 17 00:00:00 2001
From: Andi Shyti <andi@etezian.org>
Date: Thu, 25 Jul 2013 10:54:24 +0200
Subject: 9p: client: remove unused code and any reference to "cancelled"
 function

This patch reverts commit

80b45261a0b263536b043c5ccfc4ba4fc27c2acc

which was implementing a 'cancelled' functionality to notify that
a cancelled request will not be replied.

This implementation was not used anywhere and therefore removed.

Signed-off-by: Andi Shyti <andi@etezian.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/9p/transport.h | 3 ---
 net/9p/client.c            | 9 ++-------
 2 files changed, 2 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/include/net/9p/transport.h b/include/net/9p/transport.h
index d9fa68f26c41..9a36d9297114 100644
--- a/include/net/9p/transport.h
+++ b/include/net/9p/transport.h
@@ -40,8 +40,6 @@
  * @close: member function to discard a connection on this transport
  * @request: member function to issue a request to the transport
  * @cancel: member function to cancel a request (if it hasn't been sent)
- * @cancelled: member function to notify that a cancelled request will not
- *             not receive a reply
  *
  * This is the basic API for a transport module which is registered by the
  * transport module with the 9P core network module and used by the client
@@ -60,7 +58,6 @@ struct p9_trans_module {
 	void (*close) (struct p9_client *);
 	int (*request) (struct p9_client *, struct p9_req_t *req);
 	int (*cancel) (struct p9_client *, struct p9_req_t *req);
-	int (*cancelled)(struct p9_client *, struct p9_req_t *req);
 	int (*zc_request)(struct p9_client *, struct p9_req_t *,
 			  char *, char *, int , int, int, int);
 };
diff --git a/net/9p/client.c b/net/9p/client.c
index 8b93cae2d11d..ba93bdab2701 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -658,17 +658,12 @@ static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq)
 
 	/*
 	 * if we haven't received a response for oldreq,
-	 * remove it from the list, and notify the transport
-	 * layer that the reply will never arrive.
+	 * remove it from the list
 	 */
-	spin_lock(&c->lock);
 	if (oldreq->status == REQ_STATUS_FLSH) {
+		spin_lock(&c->lock);
 		list_del(&oldreq->req_list);
 		spin_unlock(&c->lock);
-		if (c->trans_mod->cancelled)
-			c->trans_mod->cancelled(c, req);
-	} else {
-		spin_unlock(&c->lock);
 	}
 
 	p9_free_req(c, req);
-- 
cgit v1.2.3


From 5ad37d5deee1ff7150a2d0602370101de158ad86 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Fri, 26 Jul 2013 17:43:23 +0200
Subject: tcp: add tcp_syncookies mode to allow unconditionally generation of
 syncookies

| If you want to test which effects syncookies have to your
| network connections you can set this knob to 2 to enable
| unconditionally generation of syncookies.

Original idea and first implementation by Eric Dumazet.

Cc: Florian Westphal <fw@strlen.de>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 4 ++++
 net/ipv4/tcp_ipv4.c                    | 5 +++--
 net/ipv6/tcp_ipv6.c                    | 3 ++-
 3 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 53cea9bcb14c..36be26b2ef7a 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -440,6 +440,10 @@ tcp_syncookies - BOOLEAN
 	SYN flood warnings in logs not being really flooded, your server
 	is seriously misconfigured.
 
+	If you want to test which effects syncookies have to your
+	network connections you can set this knob to 2 to enable
+	unconditionally generation of syncookies.
+
 tcp_fastopen - INTEGER
 	Enable TCP Fast Open feature (draft-ietf-tcpm-fastopen) to send data
 	in the opening SYN packet. To use this feature, the client application
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2a5d5c469d17..280efe5f19c1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -890,7 +890,7 @@ bool tcp_syn_flood_action(struct sock *sk,
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
 
 	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
-	if (!lopt->synflood_warned) {
+	if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
 		lopt->synflood_warned = 1;
 		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
 			proto, ntohs(tcp_hdr(skb)->dest), msg);
@@ -1462,7 +1462,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	 * limitations, they conserve resources and peer is
 	 * evidently real one.
 	 */
-	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
+	if ((sysctl_tcp_syncookies == 2 ||
+	     inet_csk_reqsk_queue_is_full(sk)) && !isn) {
 		want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
 		if (!want_cookie)
 			goto drop;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index b792e870686b..38c196ca6011 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -963,7 +963,8 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	if (!ipv6_unicast_destination(skb))
 		goto drop;
 
-	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
+	if ((sysctl_tcp_syncookies == 2 ||
+	     inet_csk_reqsk_queue_is_full(sk)) && !isn) {
 		want_cookie = tcp_syn_flood_action(sk, skb, "TCPv6");
 		if (!want_cookie)
 			goto drop;
-- 
cgit v1.2.3


From ffd756b3174e496cf6f3c5458c434e31d2cd48b0 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Mon, 29 Jul 2013 13:44:15 +0200
Subject: pktgen: Require CONFIG_INET due to use of IPv4 checksum function

Unlike for IPv6, the IPv4 checksum functions are only available
if CONFIG_INET is set.

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/Kconfig b/net/Kconfig
index 37702491abe9..37162eb88f07 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -281,7 +281,7 @@ menu "Network testing"
 
 config NET_PKTGEN
 	tristate "Packet Generator (USE WITH CAUTION)"
-	depends on PROC_FS
+	depends on INET && PROC_FS
 	---help---
 	  This module will inject preconfigured packets, at a configurable
 	  rate, out of a given interface.  It is used for network interface
-- 
cgit v1.2.3


From 66b52b0dc82c5c88d769dc1c7d44cf45d0deb07c Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Mon, 29 Jul 2013 18:16:49 +0200
Subject: net: add ndo to get id of physical port of the device

This patch adds a ndo for getting physical port of the device. Driver
which is aware of being virtual function of some physical port should
implement this ndo. This is applicable not only for IOV, but for other
solutions (NPAR, multichannel) as well. Basically if there is possible
to have multiple netdevs on the single hw port.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Acked-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 20 ++++++++++++++++++++
 net/core/dev.c            | 18 ++++++++++++++++++
 2 files changed, 38 insertions(+)

(limited to 'net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3ca60b070ef0..875f869dc38a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -728,6 +728,16 @@ struct netdev_fcoe_hbainfo {
 };
 #endif
 
+#define MAX_PHYS_PORT_ID_LEN 32
+
+/* This structure holds a unique identifier to identify the
+ * physical port used by a netdevice.
+ */
+struct netdev_phys_port_id {
+	unsigned char id[MAX_PHYS_PORT_ID_LEN];
+	unsigned char id_len;
+};
+
 /*
  * This structure defines the management hooks for network devices.
  * The following hooks can be defined; unless noted otherwise, they are
@@ -932,6 +942,12 @@ struct netdev_fcoe_hbainfo {
  *	that determine carrier state from physical hardware properties (eg
  *	network cables) or protocol-dependent mechanisms (eg
  *	USB_CDC_NOTIFY_NETWORK_CONNECTION) should NOT implement this function.
+ *
+ * int (*ndo_get_phys_port_id)(struct net_device *dev,
+ *			       struct netdev_phys_port_id *ppid);
+ *	Called to get ID of physical port of this device. If driver does
+ *	not implement this, it is assumed that the hw is not able to have
+ *	multiple net devices on single physical port.
  */
 struct net_device_ops {
 	int			(*ndo_init)(struct net_device *dev);
@@ -1060,6 +1076,8 @@ struct net_device_ops {
 						      struct nlmsghdr *nlh);
 	int			(*ndo_change_carrier)(struct net_device *dev,
 						      bool new_carrier);
+	int			(*ndo_get_phys_port_id)(struct net_device *dev,
+							struct netdev_phys_port_id *ppid);
 };
 
 /*
@@ -2315,6 +2333,8 @@ extern int		dev_set_mac_address(struct net_device *,
 					    struct sockaddr *);
 extern int		dev_change_carrier(struct net_device *,
 					   bool new_carrier);
+extern int		dev_get_phys_port_id(struct net_device *dev,
+					     struct netdev_phys_port_id *ppid);
 extern int		dev_hard_start_xmit(struct sk_buff *skb,
 					    struct net_device *dev,
 					    struct netdev_queue *txq);
diff --git a/net/core/dev.c b/net/core/dev.c
index dfd9f5d56ae0..58eb802584b9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4988,6 +4988,24 @@ int dev_change_carrier(struct net_device *dev, bool new_carrier)
 }
 EXPORT_SYMBOL(dev_change_carrier);
 
+/**
+ *	dev_get_phys_port_id - Get device physical port ID
+ *	@dev: device
+ *	@ppid: port ID
+ *
+ *	Get device physical port ID
+ */
+int dev_get_phys_port_id(struct net_device *dev,
+			 struct netdev_phys_port_id *ppid)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	if (!ops->ndo_get_phys_port_id)
+		return -EOPNOTSUPP;
+	return ops->ndo_get_phys_port_id(dev, ppid);
+}
+EXPORT_SYMBOL(dev_get_phys_port_id);
+
 /**
  *	dev_new_index	-	allocate an ifindex
  *	@net: the applicable net namespace
-- 
cgit v1.2.3


From 66cae9ed6bc46b8cc57a9693f99f69926f3cc7ef Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Mon, 29 Jul 2013 18:16:50 +0200
Subject: rtnl: export physical port id via RT netlink

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Acked-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: Narendra K <narendra_k@dell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h |  1 +
 net/core/rtnetlink.c         | 25 ++++++++++++++++++++++++-
 2 files changed, 25 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 03f6170ab337..04c0e7a5d484 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -143,6 +143,7 @@ enum {
 	IFLA_NUM_TX_QUEUES,
 	IFLA_NUM_RX_QUEUES,
 	IFLA_CARRIER,
+	IFLA_PHYS_PORT_ID,
 	__IFLA_MAX
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 3de740834d1f..0b2972c59b97 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -767,7 +767,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
 	       + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */
 	       + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */
 	       + rtnl_link_get_size(dev) /* IFLA_LINKINFO */
-	       + rtnl_link_get_af_size(dev); /* IFLA_AF_SPEC */
+	       + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */
+	       + nla_total_size(MAX_PHYS_PORT_ID_LEN); /* IFLA_PHYS_PORT_ID */
 }
 
 static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev)
@@ -846,6 +847,24 @@ static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev)
 	return 0;
 }
 
+static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev)
+{
+	int err;
+	struct netdev_phys_port_id ppid;
+
+	err = dev_get_phys_port_id(dev, &ppid);
+	if (err) {
+		if (err == -EOPNOTSUPP)
+			return 0;
+		return err;
+	}
+
+	if (nla_put(skb, IFLA_PHYS_PORT_ID, ppid.id_len, ppid.id))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
 static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 			    int type, u32 pid, u32 seq, u32 change,
 			    unsigned int flags, u32 ext_filter_mask)
@@ -913,6 +932,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 			goto nla_put_failure;
 	}
 
+	if (rtnl_phys_port_id_fill(skb, dev))
+		goto nla_put_failure;
+
 	attr = nla_reserve(skb, IFLA_STATS,
 			sizeof(struct rtnl_link_stats));
 	if (attr == NULL)
@@ -1113,6 +1135,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_PROMISCUITY]	= { .type = NLA_U32 },
 	[IFLA_NUM_TX_QUEUES]	= { .type = NLA_U32 },
 	[IFLA_NUM_RX_QUEUES]	= { .type = NLA_U32 },
+	[IFLA_PHYS_PORT_ID]	= { .type = NLA_BINARY, .len = MAX_PHYS_PORT_ID_LEN },
 };
 EXPORT_SYMBOL(ifla_policy);
 
-- 
cgit v1.2.3


From ff80e519ab1b3a6abb2c6bbf684b98be07111879 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Mon, 29 Jul 2013 18:16:51 +0200
Subject: net: export physical port id via sysfs

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Acked-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: Narendra K <narendra_k@dell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net-sysfs.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'net')

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 981fed397d1d..8826b0d1e0cc 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -334,6 +334,27 @@ static ssize_t store_group(struct device *dev, struct device_attribute *attr,
 	return netdev_store(dev, attr, buf, len, change_group);
 }
 
+static ssize_t show_phys_port_id(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	struct net_device *netdev = to_net_dev(dev);
+	ssize_t ret = -EINVAL;
+
+	if (!rtnl_trylock())
+		return restart_syscall();
+
+	if (dev_isalive(netdev)) {
+		struct netdev_phys_port_id ppid;
+
+		ret = dev_get_phys_port_id(netdev, &ppid);
+		if (!ret)
+			ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id);
+	}
+	rtnl_unlock();
+
+	return ret;
+}
+
 static struct device_attribute net_class_attributes[] = {
 	__ATTR(addr_assign_type, S_IRUGO, show_addr_assign_type, NULL),
 	__ATTR(addr_len, S_IRUGO, show_addr_len, NULL),
@@ -355,6 +376,7 @@ static struct device_attribute net_class_attributes[] = {
 	__ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len,
 	       store_tx_queue_len),
 	__ATTR(netdev_group, S_IRUGO | S_IWUSR, show_group, store_group),
+	__ATTR(phys_port_id, S_IRUGO, show_phys_port_id, NULL),
 	{}
 };
 
-- 
cgit v1.2.3


From fca418955148e4f4555d7ce911e9eee3e7970a7f Mon Sep 17 00:00:00 2001
From: Tom Herbert <therbert@google.com>
Date: Mon, 29 Jul 2013 11:07:36 -0700
Subject: flow_dissector: clean up IPIP case

Explicitly set proto to ETH_P_IP and jump directly to ip processing.

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/flow_dissector.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 00ee068efc1c..3259446f3026 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -139,7 +139,8 @@ ipv6:
 		break;
 	}
 	case IPPROTO_IPIP:
-		goto again;
+		proto = htons(ETH_P_IP);
+		goto ip;
 	default:
 		break;
 	}
-- 
cgit v1.2.3


From b438f940d3541f478c6b37106ed095f1be7959ef Mon Sep 17 00:00:00 2001
From: Tom Herbert <therbert@google.com>
Date: Mon, 29 Jul 2013 11:07:42 -0700
Subject: flow_dissector: add support for IPPROTO_IPV6

Support IPPROTO_IPV6 similar to IPPROTO_IPIP

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/flow_dissector.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 3259446f3026..ade9ff1c1980 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -141,6 +141,9 @@ ipv6:
 	case IPPROTO_IPIP:
 		proto = htons(ETH_P_IP);
 		goto ip;
+	case IPPROTO_IPV6:
+		proto = htons(ETH_P_IPV6);
+		goto ipv6;
 	default:
 		break;
 	}
-- 
cgit v1.2.3


From 4299c8a94fa9d26b54222d7781244137623d4b39 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Mon, 29 Jul 2013 22:15:19 +0300
Subject: net: remove an unneeded check

"ifa->ifa_label" is an array inside the in_ifaddr struct.  It can never
be NULL so we can remove this check.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/devinet.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 8d48c392adcc..1b7f7ae8514a 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1124,10 +1124,7 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len)
 		if (len < (int) sizeof(ifr))
 			break;
 		memset(&ifr, 0, sizeof(struct ifreq));
-		if (ifa->ifa_label)
-			strcpy(ifr.ifr_name, ifa->ifa_label);
-		else
-			strcpy(ifr.ifr_name, dev->name);
+		strcpy(ifr.ifr_name, ifa->ifa_label);
 
 		(*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET;
 		(*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr =
-- 
cgit v1.2.3


From 5774c94aceade9eadc311957fe31322cc3ad2016 Mon Sep 17 00:00:00 2001
From: Phil Oester <kernel@linuxace.com>
Date: Thu, 11 Jul 2013 12:06:58 -0700
Subject: netfilter: xt_addrtype: fix trivial typo

Fix typo in error message.

Signed-off-by: Phil Oester <kernel@linuxace.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_addrtype.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c
index 68ff29f60867..fab6eea1bf38 100644
--- a/net/netfilter/xt_addrtype.c
+++ b/net/netfilter/xt_addrtype.c
@@ -202,7 +202,7 @@ static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par)
 			return -EINVAL;
 		}
 		if ((info->source | info->dest) >= XT_ADDRTYPE_PROHIBIT) {
-			pr_err("ipv6 PROHIBT (THROW, NAT ..) matching not supported\n");
+			pr_err("ipv6 PROHIBIT (THROW, NAT ..) matching not supported\n");
 			return -EINVAL;
 		}
 		if ((info->source | info->dest) & XT_ADDRTYPE_BROADCAST) {
-- 
cgit v1.2.3


From 312a0c16c1fa9dd7cb5af413cf73b2fe2806c962 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Sun, 28 Jul 2013 22:54:08 +0200
Subject: netfilter: nf_conntrack: constify sk_buff argument to nf_ct_attach()

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h         | 4 ++--
 net/netfilter/core.c              | 7 ++++---
 net/netfilter/nf_conntrack_core.c | 2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index de70f7b45b68..f4bbf2cd22d8 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -314,8 +314,8 @@ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family)
 #endif /*CONFIG_NETFILTER*/
 
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
-extern void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *) __rcu;
-extern void nf_ct_attach(struct sk_buff *, struct sk_buff *);
+extern void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *) __rcu;
+extern void nf_ct_attach(struct sk_buff *, const struct sk_buff *);
 extern void (*nf_ct_destroy)(struct nf_conntrack *) __rcu;
 
 struct nf_conn;
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 2217363ab422..593b16ea45e0 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -234,12 +234,13 @@ EXPORT_SYMBOL(skb_make_writable);
 /* This does not belong here, but locally generated errors need it if connection
    tracking in use: without this, connection may not be in hash table, and hence
    manufactured ICMP or RST packets will not be associated with it. */
-void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *) __rcu __read_mostly;
+void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *)
+		__rcu __read_mostly;
 EXPORT_SYMBOL(ip_ct_attach);
 
-void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb)
+void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb)
 {
-	void (*attach)(struct sk_buff *, struct sk_buff *);
+	void (*attach)(struct sk_buff *, const struct sk_buff *);
 
 	if (skb->nfct) {
 		rcu_read_lock();
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 0283baedcdfb..d32afaff72f8 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1192,7 +1192,7 @@ EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
 #endif
 
 /* Used by ipt_REJECT and ip6t_REJECT. */
-static void nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
+static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
 {
 	struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
-- 
cgit v1.2.3


From 5813a8eb47915e051059562f22ffa521404f6e19 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 29 Jul 2013 15:41:50 +0200
Subject: netfilter: connlabels: remove unneeded includes

leftovers from the (never merged) v1 patch.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_labels.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c
index 355d2ef08094..bb53f120e79c 100644
--- a/net/netfilter/nf_conntrack_labels.c
+++ b/net/netfilter/nf_conntrack_labels.c
@@ -8,12 +8,8 @@
  * published by the Free Software Foundation.
  */
 
-#include <linux/ctype.h>
 #include <linux/export.h>
-#include <linux/jhash.h>
-#include <linux/spinlock.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 
 #include <net/netfilter/nf_conntrack_ecache.h>
 #include <net/netfilter/nf_conntrack_labels.h>
-- 
cgit v1.2.3


From 957bec36855f97cc5797fbaaf68b11ac7454df2d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 29 Jul 2013 15:41:51 +0200
Subject: netfilter: nf_queue: relax NFQA_CT attribute check

Allow modifying attributes of the conntrack associated with a packet
without first requesting ct data via CFG_F_CONNTRACK or extra
nfnetlink_conntrack socket.

Also remove unneded rcu_read_lock; the entire function is already
protected by rcu.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink_queue_core.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c
index 971ea145ab3e..ec9de12aa488 100644
--- a/net/netfilter/nfnetlink_queue_core.c
+++ b/net/netfilter/nfnetlink_queue_core.c
@@ -987,8 +987,7 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
 	if (entry == NULL)
 		return -ENOENT;
 
-	rcu_read_lock();
-	if (nfqa[NFQA_CT] && (queue->flags & NFQA_CFG_F_CONNTRACK))
+	if (nfqa[NFQA_CT])
 		ct = nfqnl_ct_parse(entry->skb, nfqa[NFQA_CT], &ctinfo);
 
 	if (nfqa[NFQA_PAYLOAD]) {
@@ -1002,7 +1001,6 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
 		if (ct)
 			nfqnl_ct_seq_adjust(skb, ct, ctinfo, diff);
 	}
-	rcu_read_unlock();
 
 	if (nfqa[NFQA_MARK])
 		entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
-- 
cgit v1.2.3


From fd158d79d33d3c8b693e3e2d8c0e3068d529c2dc Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 29 Jul 2013 15:41:52 +0200
Subject: netfilter: tproxy: remove nf_tproxy_core, keep tw sk assigned to skb

The module was "permanent", due to the special tproxy skb->destructor.
Nowadays we have tcp early demux and its sock_edemux destructor in
networking core which can be used instead.

Thanks to early demux changes the input path now also handles
"skb->sk is tw socket" correctly, so this no longer needs the special
handling introduced with commit d503b30bd648b3cb4e5f50b65d27e389960cc6d9
(netfilter: tproxy: do not assign timewait sockets to skb->sk).

Thus:
- move assign_sock function to where its needed
- don't prevent timewait sockets from being assigned to the skb
- remove nf_tproxy_core.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 Documentation/networking/tproxy.txt    |  5 ++-
 include/net/netfilter/nf_tproxy_core.h |  4 ---
 net/netfilter/Kconfig                  | 22 +++---------
 net/netfilter/Makefile                 |  3 --
 net/netfilter/nf_tproxy_core.c         | 62 ----------------------------------
 net/netfilter/xt_TPROXY.c              |  9 +++++
 6 files changed, 16 insertions(+), 89 deletions(-)
 delete mode 100644 net/netfilter/nf_tproxy_core.c

(limited to 'net')

diff --git a/Documentation/networking/tproxy.txt b/Documentation/networking/tproxy.txt
index 7b5996d9357e..ec11429e1d42 100644
--- a/Documentation/networking/tproxy.txt
+++ b/Documentation/networking/tproxy.txt
@@ -2,9 +2,8 @@ Transparent proxy support
 =========================
 
 This feature adds Linux 2.2-like transparent proxy support to current kernels.
-To use it, enable NETFILTER_TPROXY, the socket match and the TPROXY target in
-your kernel config. You will need policy routing too, so be sure to enable that
-as well.
+To use it, enable the socket match and the TPROXY target in your kernel config.
+You will need policy routing too, so be sure to enable that as well.
 
 
 1. Making non-local sockets work
diff --git a/include/net/netfilter/nf_tproxy_core.h b/include/net/netfilter/nf_tproxy_core.h
index 36d9379d4c4b..975ffa4545a9 100644
--- a/include/net/netfilter/nf_tproxy_core.h
+++ b/include/net/netfilter/nf_tproxy_core.h
@@ -203,8 +203,4 @@ nf_tproxy_get_sock_v6(struct net *net, const u8 protocol,
 }
 #endif
 
-/* assign a socket to the skb -- consumes sk */
-void
-nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk);
-
 #endif
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 56d22cae5906..c45fc1a60e0d 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -410,20 +410,6 @@ config NF_NAT_TFTP
 
 endif # NF_CONNTRACK
 
-# transparent proxy support
-config NETFILTER_TPROXY
-	tristate "Transparent proxying support"
-	depends on IP_NF_MANGLE
-	depends on NETFILTER_ADVANCED
-	help
-	  This option enables transparent proxying support, that is,
-	  support for handling non-locally bound IPv4 TCP and UDP sockets.
-	  For it to work you will have to configure certain iptables rules
-	  and use policy routing. For more information on how to set it up
-	  see Documentation/networking/tproxy.txt.
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
 config NETFILTER_XTABLES
 	tristate "Netfilter Xtables support (required for ip_tables)"
 	default m if NETFILTER_ADVANCED=n
@@ -720,10 +706,10 @@ config NETFILTER_XT_TARGET_TEE
 	this clone be rerouted to another nexthop.
 
 config NETFILTER_XT_TARGET_TPROXY
-	tristate '"TPROXY" target support'
-	depends on NETFILTER_TPROXY
+	tristate '"TPROXY" target transparent proxying support'
 	depends on NETFILTER_XTABLES
 	depends on NETFILTER_ADVANCED
+	depends on IP_NF_MANGLE
 	select NF_DEFRAG_IPV4
 	select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES
 	help
@@ -731,6 +717,9 @@ config NETFILTER_XT_TARGET_TPROXY
 	  REDIRECT.  It can only be used in the mangle table and is useful
 	  to redirect traffic to a transparent proxy.  It does _not_ depend
 	  on Netfilter connection tracking and NAT, unlike REDIRECT.
+	  For it to work you will have to configure certain iptables rules
+	  and use policy routing. For more information on how to set it up
+	  see Documentation/networking/tproxy.txt.
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
@@ -1180,7 +1169,6 @@ config NETFILTER_XT_MATCH_SCTP
 
 config NETFILTER_XT_MATCH_SOCKET
 	tristate '"socket" match support'
-	depends on NETFILTER_TPROXY
 	depends on NETFILTER_XTABLES
 	depends on NETFILTER_ADVANCED
 	depends on !NF_CONNTRACK || NF_CONNTRACK
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index a1abf87d43bf..ebfa7dc747cd 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -61,9 +61,6 @@ obj-$(CONFIG_NF_NAT_IRC) += nf_nat_irc.o
 obj-$(CONFIG_NF_NAT_SIP) += nf_nat_sip.o
 obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o
 
-# transparent proxy support
-obj-$(CONFIG_NETFILTER_TPROXY) += nf_tproxy_core.o
-
 # generic X tables 
 obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
 
diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c
deleted file mode 100644
index 474d621cbc2e..000000000000
--- a/net/netfilter/nf_tproxy_core.c
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Transparent proxy support for Linux/iptables
- *
- * Copyright (c) 2006-2007 BalaBit IT Ltd.
- * Author: Balazs Scheidler, Krisztian Kovacs
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-
-#include <linux/module.h>
-
-#include <linux/net.h>
-#include <linux/if.h>
-#include <linux/netdevice.h>
-#include <net/udp.h>
-#include <net/netfilter/nf_tproxy_core.h>
-
-
-static void
-nf_tproxy_destructor(struct sk_buff *skb)
-{
-	struct sock *sk = skb->sk;
-
-	skb->sk = NULL;
-	skb->destructor = NULL;
-
-	if (sk)
-		sock_put(sk);
-}
-
-/* consumes sk */
-void
-nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
-{
-	/* assigning tw sockets complicates things; most
-	 * skb->sk->X checks would have to test sk->sk_state first */
-	if (sk->sk_state == TCP_TIME_WAIT) {
-		inet_twsk_put(inet_twsk(sk));
-		return;
-	}
-
-	skb_orphan(skb);
-	skb->sk = sk;
-	skb->destructor = nf_tproxy_destructor;
-}
-EXPORT_SYMBOL_GPL(nf_tproxy_assign_sock);
-
-static int __init nf_tproxy_init(void)
-{
-	pr_info("NF_TPROXY: Transparent proxy support initialized, version 4.1.0\n");
-	pr_info("NF_TPROXY: Copyright (c) 2006-2007 BalaBit IT Ltd.\n");
-	return 0;
-}
-
-module_init(nf_tproxy_init);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Krisztian Kovacs");
-MODULE_DESCRIPTION("Transparent proxy support core routines");
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index d7f195388f66..17c40deafa4f 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -117,6 +117,15 @@ tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport,
 	return sk;
 }
 
+/* assign a socket to the skb -- consumes sk */
+static void
+nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
+{
+	skb_orphan(skb);
+	skb->sk = sk;
+	skb->destructor = sock_edemux;
+}
+
 static unsigned int
 tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
 	   u_int32_t mark_mask, u_int32_t mark_value)
-- 
cgit v1.2.3


From 93742cf8af9dd3b053242b273040aa35fcbf93b3 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 29 Jul 2013 15:41:53 +0200
Subject: netfilter: tproxy: remove nf_tproxy_core.h

We've removed nf_tproxy_core.ko, so also remove its header.
The lookup helpers are split and then moved to tproxy target/socket match.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tproxy_core.h | 206 ---------------------------------
 net/netfilter/xt_TPROXY.c              | 160 ++++++++++++++++++++++++-
 net/netfilter/xt_socket.c              |  66 ++++++++++-
 3 files changed, 220 insertions(+), 212 deletions(-)
 delete mode 100644 include/net/netfilter/nf_tproxy_core.h

(limited to 'net')

diff --git a/include/net/netfilter/nf_tproxy_core.h b/include/net/netfilter/nf_tproxy_core.h
deleted file mode 100644
index 975ffa4545a9..000000000000
--- a/include/net/netfilter/nf_tproxy_core.h
+++ /dev/null
@@ -1,206 +0,0 @@
-#ifndef _NF_TPROXY_CORE_H
-#define _NF_TPROXY_CORE_H
-
-#include <linux/types.h>
-#include <linux/in.h>
-#include <linux/skbuff.h>
-#include <net/sock.h>
-#include <net/inet_hashtables.h>
-#include <net/inet6_hashtables.h>
-#include <net/tcp.h>
-
-#define NFT_LOOKUP_ANY         0
-#define NFT_LOOKUP_LISTENER    1
-#define NFT_LOOKUP_ESTABLISHED 2
-
-/* look up and get a reference to a matching socket */
-
-
-/* This function is used by the 'TPROXY' target and the 'socket'
- * match. The following lookups are supported:
- *
- * Explicit TProxy target rule
- * ===========================
- *
- * This is used when the user wants to intercept a connection matching
- * an explicit iptables rule. In this case the sockets are assumed
- * matching in preference order:
- *
- *   - match: if there's a fully established connection matching the
- *     _packet_ tuple, it is returned, assuming the redirection
- *     already took place and we process a packet belonging to an
- *     established connection
- *
- *   - match: if there's a listening socket matching the redirection
- *     (e.g. on-port & on-ip of the connection), it is returned,
- *     regardless if it was bound to 0.0.0.0 or an explicit
- *     address. The reasoning is that if there's an explicit rule, it
- *     does not really matter if the listener is bound to an interface
- *     or to 0. The user already stated that he wants redirection
- *     (since he added the rule).
- *
- * "socket" match based redirection (no specific rule)
- * ===================================================
- *
- * There are connections with dynamic endpoints (e.g. FTP data
- * connection) that the user is unable to add explicit rules
- * for. These are taken care of by a generic "socket" rule. It is
- * assumed that the proxy application is trusted to open such
- * connections without explicit iptables rule (except of course the
- * generic 'socket' rule). In this case the following sockets are
- * matched in preference order:
- *
- *   - match: if there's a fully established connection matching the
- *     _packet_ tuple
- *
- *   - match: if there's a non-zero bound listener (possibly with a
- *     non-local address) We don't accept zero-bound listeners, since
- *     then local services could intercept traffic going through the
- *     box.
- *
- * Please note that there's an overlap between what a TPROXY target
- * and a socket match will match. Normally if you have both rules the
- * "socket" match will be the first one, effectively all packets
- * belonging to established connections going through that one.
- */
-static inline struct sock *
-nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
-		      const __be32 saddr, const __be32 daddr,
-		      const __be16 sport, const __be16 dport,
-		      const struct net_device *in, int lookup_type)
-{
-	struct sock *sk;
-
-	/* look up socket */
-	switch (protocol) {
-	case IPPROTO_TCP:
-		switch (lookup_type) {
-		case NFT_LOOKUP_ANY:
-			sk = __inet_lookup(net, &tcp_hashinfo,
-					   saddr, sport, daddr, dport,
-					   in->ifindex);
-			break;
-		case NFT_LOOKUP_LISTENER:
-			sk = inet_lookup_listener(net, &tcp_hashinfo,
-						    saddr, sport,
-						    daddr, dport,
-						    in->ifindex);
-
-			/* NOTE: we return listeners even if bound to
-			 * 0.0.0.0, those are filtered out in
-			 * xt_socket, since xt_TPROXY needs 0 bound
-			 * listeners too */
-
-			break;
-		case NFT_LOOKUP_ESTABLISHED:
-			sk = inet_lookup_established(net, &tcp_hashinfo,
-						    saddr, sport, daddr, dport,
-						    in->ifindex);
-			break;
-		default:
-			WARN_ON(1);
-			sk = NULL;
-			break;
-		}
-		break;
-	case IPPROTO_UDP:
-		sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
-				     in->ifindex);
-		if (sk && lookup_type != NFT_LOOKUP_ANY) {
-			int connected = (sk->sk_state == TCP_ESTABLISHED);
-			int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0);
-
-			/* NOTE: we return listeners even if bound to
-			 * 0.0.0.0, those are filtered out in
-			 * xt_socket, since xt_TPROXY needs 0 bound
-			 * listeners too */
-			if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) ||
-			    (lookup_type == NFT_LOOKUP_LISTENER && connected)) {
-				sock_put(sk);
-				sk = NULL;
-			}
-		}
-		break;
-	default:
-		WARN_ON(1);
-		sk = NULL;
-	}
-
-	pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n",
-		 protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk);
-
-	return sk;
-}
-
-#if IS_ENABLED(CONFIG_IPV6)
-static inline struct sock *
-nf_tproxy_get_sock_v6(struct net *net, const u8 protocol,
-		      const struct in6_addr *saddr, const struct in6_addr *daddr,
-		      const __be16 sport, const __be16 dport,
-		      const struct net_device *in, int lookup_type)
-{
-	struct sock *sk;
-
-	/* look up socket */
-	switch (protocol) {
-	case IPPROTO_TCP:
-		switch (lookup_type) {
-		case NFT_LOOKUP_ANY:
-			sk = inet6_lookup(net, &tcp_hashinfo,
-					  saddr, sport, daddr, dport,
-					  in->ifindex);
-			break;
-		case NFT_LOOKUP_LISTENER:
-			sk = inet6_lookup_listener(net, &tcp_hashinfo,
-						   saddr, sport,
-						   daddr, ntohs(dport),
-						   in->ifindex);
-
-			/* NOTE: we return listeners even if bound to
-			 * 0.0.0.0, those are filtered out in
-			 * xt_socket, since xt_TPROXY needs 0 bound
-			 * listeners too */
-
-			break;
-		case NFT_LOOKUP_ESTABLISHED:
-			sk = __inet6_lookup_established(net, &tcp_hashinfo,
-							saddr, sport, daddr, ntohs(dport),
-							in->ifindex);
-			break;
-		default:
-			WARN_ON(1);
-			sk = NULL;
-			break;
-		}
-		break;
-	case IPPROTO_UDP:
-		sk = udp6_lib_lookup(net, saddr, sport, daddr, dport,
-				     in->ifindex);
-		if (sk && lookup_type != NFT_LOOKUP_ANY) {
-			int connected = (sk->sk_state == TCP_ESTABLISHED);
-			int wildcard = ipv6_addr_any(&inet6_sk(sk)->rcv_saddr);
-
-			/* NOTE: we return listeners even if bound to
-			 * 0.0.0.0, those are filtered out in
-			 * xt_socket, since xt_TPROXY needs 0 bound
-			 * listeners too */
-			if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) ||
-			    (lookup_type == NFT_LOOKUP_LISTENER && connected)) {
-				sock_put(sk);
-				sk = NULL;
-			}
-		}
-		break;
-	default:
-		WARN_ON(1);
-		sk = NULL;
-	}
-
-	pr_debug("tproxy socket lookup: proto %u %pI6:%u -> %pI6:%u, lookup type: %d, sock %p\n",
-		 protocol, saddr, ntohs(sport), daddr, ntohs(dport), lookup_type, sk);
-
-	return sk;
-}
-#endif
-
-#endif
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 17c40deafa4f..851383a7f461 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -15,7 +15,9 @@
 #include <linux/ip.h>
 #include <net/checksum.h>
 #include <net/udp.h>
+#include <net/tcp.h>
 #include <net/inet_sock.h>
+#include <net/inet_hashtables.h>
 #include <linux/inetdevice.h>
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
@@ -26,13 +28,18 @@
 #define XT_TPROXY_HAVE_IPV6 1
 #include <net/if_inet6.h>
 #include <net/addrconf.h>
+#include <net/inet6_hashtables.h>
 #include <linux/netfilter_ipv6/ip6_tables.h>
 #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
 #endif
 
-#include <net/netfilter/nf_tproxy_core.h>
 #include <linux/netfilter/xt_TPROXY.h>
 
+enum nf_tproxy_lookup_t {
+	 NFT_LOOKUP_LISTENER,
+	 NFT_LOOKUP_ESTABLISHED,
+};
+
 static bool tproxy_sk_is_transparent(struct sock *sk)
 {
 	if (sk->sk_state != TCP_TIME_WAIT) {
@@ -68,6 +75,157 @@ tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
 	return laddr ? laddr : daddr;
 }
 
+/*
+ * This is used when the user wants to intercept a connection matching
+ * an explicit iptables rule. In this case the sockets are assumed
+ * matching in preference order:
+ *
+ *   - match: if there's a fully established connection matching the
+ *     _packet_ tuple, it is returned, assuming the redirection
+ *     already took place and we process a packet belonging to an
+ *     established connection
+ *
+ *   - match: if there's a listening socket matching the redirection
+ *     (e.g. on-port & on-ip of the connection), it is returned,
+ *     regardless if it was bound to 0.0.0.0 or an explicit
+ *     address. The reasoning is that if there's an explicit rule, it
+ *     does not really matter if the listener is bound to an interface
+ *     or to 0. The user already stated that he wants redirection
+ *     (since he added the rule).
+ *
+ * Please note that there's an overlap between what a TPROXY target
+ * and a socket match will match. Normally if you have both rules the
+ * "socket" match will be the first one, effectively all packets
+ * belonging to established connections going through that one.
+ */
+static inline struct sock *
+nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
+		      const __be32 saddr, const __be32 daddr,
+		      const __be16 sport, const __be16 dport,
+		      const struct net_device *in,
+		      const enum nf_tproxy_lookup_t lookup_type)
+{
+	struct sock *sk;
+
+	switch (protocol) {
+	case IPPROTO_TCP:
+		switch (lookup_type) {
+		case NFT_LOOKUP_LISTENER:
+			sk = inet_lookup_listener(net, &tcp_hashinfo,
+						    saddr, sport,
+						    daddr, dport,
+						    in->ifindex);
+
+			/* NOTE: we return listeners even if bound to
+			 * 0.0.0.0, those are filtered out in
+			 * xt_socket, since xt_TPROXY needs 0 bound
+			 * listeners too
+			 */
+			break;
+		case NFT_LOOKUP_ESTABLISHED:
+			sk = inet_lookup_established(net, &tcp_hashinfo,
+						    saddr, sport, daddr, dport,
+						    in->ifindex);
+			break;
+		default:
+			BUG();
+		}
+		break;
+	case IPPROTO_UDP:
+		sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
+				     in->ifindex);
+		if (sk) {
+			int connected = (sk->sk_state == TCP_ESTABLISHED);
+			int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0);
+
+			/* NOTE: we return listeners even if bound to
+			 * 0.0.0.0, those are filtered out in
+			 * xt_socket, since xt_TPROXY needs 0 bound
+			 * listeners too
+			 */
+			if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) ||
+			    (lookup_type == NFT_LOOKUP_LISTENER && connected)) {
+				sock_put(sk);
+				sk = NULL;
+			}
+		}
+		break;
+	default:
+		WARN_ON(1);
+		sk = NULL;
+	}
+
+	pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n",
+		 protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk);
+
+	return sk;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static inline struct sock *
+nf_tproxy_get_sock_v6(struct net *net, const u8 protocol,
+		      const struct in6_addr *saddr, const struct in6_addr *daddr,
+		      const __be16 sport, const __be16 dport,
+		      const struct net_device *in,
+		      const enum nf_tproxy_lookup_t lookup_type)
+{
+	struct sock *sk;
+
+	switch (protocol) {
+	case IPPROTO_TCP:
+		switch (lookup_type) {
+		case NFT_LOOKUP_LISTENER:
+			sk = inet6_lookup_listener(net, &tcp_hashinfo,
+						   saddr, sport,
+						   daddr, ntohs(dport),
+						   in->ifindex);
+
+			/* NOTE: we return listeners even if bound to
+			 * 0.0.0.0, those are filtered out in
+			 * xt_socket, since xt_TPROXY needs 0 bound
+			 * listeners too
+			 */
+			break;
+		case NFT_LOOKUP_ESTABLISHED:
+			sk = __inet6_lookup_established(net, &tcp_hashinfo,
+							saddr, sport, daddr, ntohs(dport),
+							in->ifindex);
+			break;
+		default:
+			BUG();
+		}
+		break;
+	case IPPROTO_UDP:
+		sk = udp6_lib_lookup(net, saddr, sport, daddr, dport,
+				     in->ifindex);
+		if (sk) {
+			int connected = (sk->sk_state == TCP_ESTABLISHED);
+			int wildcard = ipv6_addr_any(&inet6_sk(sk)->rcv_saddr);
+
+			/* NOTE: we return listeners even if bound to
+			 * 0.0.0.0, those are filtered out in
+			 * xt_socket, since xt_TPROXY needs 0 bound
+			 * listeners too
+			 */
+			if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) ||
+			    (lookup_type == NFT_LOOKUP_LISTENER && connected)) {
+				sock_put(sk);
+				sk = NULL;
+			}
+		}
+		break;
+	default:
+		WARN_ON(1);
+		sk = NULL;
+	}
+
+	pr_debug("tproxy socket lookup: proto %u %pI6:%u -> %pI6:%u, lookup type: %d, sock %p\n",
+		 protocol, saddr, ntohs(sport), daddr, ntohs(dport), lookup_type, sk);
+
+	return sk;
+}
+#endif
+
 /**
  * tproxy_handle_time_wait4 - handle IPv4 TCP TIME_WAIT reopen redirections
  * @skb:	The skb being processed.
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index f8b71911037a..a7dd108d4063 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -19,12 +19,12 @@
 #include <net/icmp.h>
 #include <net/sock.h>
 #include <net/inet_sock.h>
-#include <net/netfilter/nf_tproxy_core.h>
 #include <net/netfilter/ipv4/nf_defrag_ipv4.h>
 
 #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
 #define XT_SOCKET_HAVE_IPV6 1
 #include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/inet6_hashtables.h>
 #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
 #endif
 
@@ -101,6 +101,43 @@ extract_icmp4_fields(const struct sk_buff *skb,
 	return 0;
 }
 
+/* "socket" match based redirection (no specific rule)
+ * ===================================================
+ *
+ * There are connections with dynamic endpoints (e.g. FTP data
+ * connection) that the user is unable to add explicit rules
+ * for. These are taken care of by a generic "socket" rule. It is
+ * assumed that the proxy application is trusted to open such
+ * connections without explicit iptables rule (except of course the
+ * generic 'socket' rule). In this case the following sockets are
+ * matched in preference order:
+ *
+ *   - match: if there's a fully established connection matching the
+ *     _packet_ tuple
+ *
+ *   - match: if there's a non-zero bound listener (possibly with a
+ *     non-local address) We don't accept zero-bound listeners, since
+ *     then local services could intercept traffic going through the
+ *     box.
+ */
+static struct sock *
+xt_socket_get_sock_v4(struct net *net, const u8 protocol,
+		      const __be32 saddr, const __be32 daddr,
+		      const __be16 sport, const __be16 dport,
+		      const struct net_device *in)
+{
+	switch (protocol) {
+	case IPPROTO_TCP:
+		return __inet_lookup(net, &tcp_hashinfo,
+				     saddr, sport, daddr, dport,
+				     in->ifindex);
+	case IPPROTO_UDP:
+		return udp4_lib_lookup(net, saddr, sport, daddr, dport,
+				       in->ifindex);
+	}
+	return NULL;
+}
+
 static bool
 socket_match(const struct sk_buff *skb, struct xt_action_param *par,
 	     const struct xt_socket_mtinfo1 *info)
@@ -156,9 +193,9 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
 #endif
 
 	if (!sk)
-		sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), protocol,
+		sk = xt_socket_get_sock_v4(dev_net(skb->dev), protocol,
 					   saddr, daddr, sport, dport,
-					   par->in, NFT_LOOKUP_ANY);
+					   par->in);
 	if (sk) {
 		bool wildcard;
 		bool transparent = true;
@@ -261,6 +298,25 @@ extract_icmp6_fields(const struct sk_buff *skb,
 	return 0;
 }
 
+static struct sock *
+xt_socket_get_sock_v6(struct net *net, const u8 protocol,
+		      const struct in6_addr *saddr, const struct in6_addr *daddr,
+		      const __be16 sport, const __be16 dport,
+		      const struct net_device *in)
+{
+	switch (protocol) {
+	case IPPROTO_TCP:
+		return inet6_lookup(net, &tcp_hashinfo,
+				    saddr, sport, daddr, dport,
+				    in->ifindex);
+	case IPPROTO_UDP:
+		return udp6_lib_lookup(net, saddr, sport, daddr, dport,
+				       in->ifindex);
+	}
+
+	return NULL;
+}
+
 static bool
 socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)
 {
@@ -298,9 +354,9 @@ socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)
 	}
 
 	if (!sk)
-		sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+		sk = xt_socket_get_sock_v6(dev_net(skb->dev), tproto,
 					   saddr, daddr, sport, dport,
-					   par->in, NFT_LOOKUP_ANY);
+					   par->in);
 	if (sk) {
 		bool wildcard;
 		bool transparent = true;
-- 
cgit v1.2.3


From 02982c27ba1e1bd9f9d4747214e19ca83aa88d0e Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 29 Jul 2013 15:41:54 +0200
Subject: netfilter: nf_conntrack: remove duplicate code in ctnetlink

ctnetlink contains copy-paste code from death_by_timeout.  In order to
avoid changing both places in upcoming event delivery patch,
export death_by_timeout functionality and use it in the ctnetlink code.

Based on earlier patch from Pablo Neira.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack.h |  3 +--
 net/netfilter/nf_conntrack_core.c    | 29 ++++++++++++++++-------------
 net/netfilter/nf_conntrack_netlink.c | 18 +++---------------
 3 files changed, 20 insertions(+), 30 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 644d9c223d24..939aced35a02 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -181,8 +181,7 @@ __nf_conntrack_find(struct net *net, u16 zone,
 		    const struct nf_conntrack_tuple *tuple);
 
 extern int nf_conntrack_hash_check_insert(struct nf_conn *ct);
-extern void nf_ct_delete_from_lists(struct nf_conn *ct);
-extern void nf_ct_dying_timeout(struct nf_conn *ct);
+bool nf_ct_delete(struct nf_conn *ct, u32 pid, int report);
 
 extern void nf_conntrack_flush_report(struct net *net, u32 portid, int report);
 
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index d32afaff72f8..089e408676fa 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -238,7 +238,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	nf_conntrack_free(ct);
 }
 
-void nf_ct_delete_from_lists(struct nf_conn *ct)
+static void nf_ct_delete_from_lists(struct nf_conn *ct)
 {
 	struct net *net = nf_ct_net(ct);
 
@@ -253,7 +253,6 @@ void nf_ct_delete_from_lists(struct nf_conn *ct)
 			     &net->ct.dying);
 	spin_unlock_bh(&nf_conntrack_lock);
 }
-EXPORT_SYMBOL_GPL(nf_ct_delete_from_lists);
 
 static void death_by_event(unsigned long ul_conntrack)
 {
@@ -275,7 +274,7 @@ static void death_by_event(unsigned long ul_conntrack)
 	nf_ct_put(ct);
 }
 
-void nf_ct_dying_timeout(struct nf_conn *ct)
+static void nf_ct_dying_timeout(struct nf_conn *ct)
 {
 	struct net *net = nf_ct_net(ct);
 	struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct);
@@ -288,27 +287,33 @@ void nf_ct_dying_timeout(struct nf_conn *ct)
 		(prandom_u32() % net->ct.sysctl_events_retry_timeout);
 	add_timer(&ecache->timeout);
 }
-EXPORT_SYMBOL_GPL(nf_ct_dying_timeout);
 
-static void death_by_timeout(unsigned long ul_conntrack)
+bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
 {
-	struct nf_conn *ct = (void *)ul_conntrack;
 	struct nf_conn_tstamp *tstamp;
 
 	tstamp = nf_conn_tstamp_find(ct);
 	if (tstamp && tstamp->stop == 0)
 		tstamp->stop = ktime_to_ns(ktime_get_real());
 
-	if (!test_bit(IPS_DYING_BIT, &ct->status) &&
-	    unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) {
+	if (!nf_ct_is_dying(ct) &&
+	    unlikely(nf_conntrack_event_report(IPCT_DESTROY, ct,
+	    portid, report) < 0)) {
 		/* destroy event was not delivered */
 		nf_ct_delete_from_lists(ct);
 		nf_ct_dying_timeout(ct);
-		return;
+		return false;
 	}
 	set_bit(IPS_DYING_BIT, &ct->status);
 	nf_ct_delete_from_lists(ct);
 	nf_ct_put(ct);
+	return true;
+}
+EXPORT_SYMBOL_GPL(nf_ct_delete);
+
+static void death_by_timeout(unsigned long ul_conntrack)
+{
+	nf_ct_delete((struct nf_conn *)ul_conntrack, 0, 0);
 }
 
 /*
@@ -643,10 +648,7 @@ static noinline int early_drop(struct net *net, unsigned int hash)
 		return dropped;
 
 	if (del_timer(&ct->timeout)) {
-		death_by_timeout((unsigned long)ct);
-		/* Check if we indeed killed this entry. Reliable event
-		   delivery may have inserted it into the dying list. */
-		if (test_bit(IPS_DYING_BIT, &ct->status)) {
+		if (nf_ct_delete(ct, 0, 0)) {
 			dropped = 1;
 			NF_CT_STAT_INC_ATOMIC(net, early_drop);
 		}
@@ -1253,6 +1255,7 @@ void nf_ct_iterate_cleanup(struct net *net,
 		/* Time to push up daises... */
 		if (del_timer(&ct->timeout))
 			death_by_timeout((unsigned long)ct);
+
 		/* ... else the timer will get him soon. */
 
 		nf_ct_put(ct);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index edc410e778f7..e842c0ded79d 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1038,21 +1038,9 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
 		}
 	}
 
-	if (del_timer(&ct->timeout)) {
-		if (nf_conntrack_event_report(IPCT_DESTROY, ct,
-					      NETLINK_CB(skb).portid,
-					      nlmsg_report(nlh)) < 0) {
-			nf_ct_delete_from_lists(ct);
-			/* we failed to report the event, try later */
-			nf_ct_dying_timeout(ct);
-			nf_ct_put(ct);
-			return 0;
-		}
-		/* death_by_timeout would report the event again */
-		set_bit(IPS_DYING_BIT, &ct->status);
-		nf_ct_delete_from_lists(ct);
-		nf_ct_put(ct);
-	}
+	if (del_timer(&ct->timeout))
+		nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh));
+
 	nf_ct_put(ct);
 
 	return 0;
-- 
cgit v1.2.3


From 0658cdc8f3babb4a441f5a803a0b644fafcbf9ef Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Sun, 28 Jul 2013 22:54:09 +0200
Subject: netfilter: nf_nat: fix locking in nf_nat_seq_adjust()

nf_nat_seq_adjust() needs to grab nf_nat_seqofs_lock to protect against
concurrent changes to the sequence adjustment data.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_nat_helper.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
index 85e20a919081..a7262ed055c6 100644
--- a/net/netfilter/nf_nat_helper.c
+++ b/net/netfilter/nf_nat_helper.c
@@ -373,6 +373,7 @@ nf_nat_seq_adjust(struct sk_buff *skb,
 	s16 seqoff, ackoff;
 	struct nf_conn_nat *nat = nfct_nat(ct);
 	struct nf_nat_seq *this_way, *other_way;
+	int res;
 
 	dir = CTINFO2DIR(ctinfo);
 
@@ -383,6 +384,7 @@ nf_nat_seq_adjust(struct sk_buff *skb,
 		return 0;
 
 	tcph = (void *)skb->data + protoff;
+	spin_lock_bh(&nf_nat_seqofs_lock);
 	if (after(ntohl(tcph->seq), this_way->correction_pos))
 		seqoff = this_way->offset_after;
 	else
@@ -407,7 +409,10 @@ nf_nat_seq_adjust(struct sk_buff *skb,
 	tcph->seq = newseq;
 	tcph->ack_seq = newack;
 
-	return nf_nat_sack_adjust(skb, protoff, tcph, ct, ctinfo);
+	res = nf_nat_sack_adjust(skb, protoff, tcph, ct, ctinfo);
+	spin_unlock_bh(&nf_nat_seqofs_lock);
+
+	return res;
 }
 
 /* Setup NAT on this expected conntrack so it follows master. */
-- 
cgit v1.2.3


From 2d89c68ac78ae432038ef23371d2fa949d725d43 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Sun, 28 Jul 2013 22:54:10 +0200
Subject: netfilter: nf_nat: change sequence number adjustments to 32 bits

Using 16 bits is too small, when many adjustments happen the offsets might
overflow and break the connection.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h              | 2 +-
 include/net/netfilter/nf_conntrack.h   | 2 +-
 include/net/netfilter/nf_nat.h         | 2 +-
 include/net/netfilter/nf_nat_helper.h  | 6 +++---
 net/netfilter/nf_conntrack_core.c      | 2 +-
 net/netfilter/nf_conntrack_proto_tcp.c | 4 ++--
 net/netfilter/nf_nat_helper.c          | 8 ++++----
 7 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index f4bbf2cd22d8..655d5d198d49 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -330,7 +330,7 @@ extern struct nfq_ct_hook __rcu *nfq_ct_hook;
 
 struct nfq_ct_nat_hook {
 	void (*seq_adjust)(struct sk_buff *skb, struct nf_conn *ct,
-			   u32 ctinfo, int off);
+			   u32 ctinfo, s32 off);
 };
 extern struct nfq_ct_nat_hook __rcu *nfq_ct_nat_hook;
 #else
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 939aced35a02..e5eb8b62538c 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -234,7 +234,7 @@ static inline bool nf_ct_kill(struct nf_conn *ct)
 }
 
 /* These are for NAT.  Icky. */
-extern s16 (*nf_ct_nat_offset)(const struct nf_conn *ct,
+extern s32 (*nf_ct_nat_offset)(const struct nf_conn *ct,
 			       enum ip_conntrack_dir dir,
 			       u32 seq);
 
diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h
index ad14a799fd2e..e2441413675c 100644
--- a/include/net/netfilter/nf_nat.h
+++ b/include/net/netfilter/nf_nat.h
@@ -19,7 +19,7 @@ struct nf_nat_seq {
 	u_int32_t correction_pos;
 
 	/* sequence number offset before and after last modification */
-	int16_t offset_before, offset_after;
+	int32_t offset_before, offset_after;
 };
 
 #include <linux/list.h>
diff --git a/include/net/netfilter/nf_nat_helper.h b/include/net/netfilter/nf_nat_helper.h
index b4d6bfc2af03..194c34794923 100644
--- a/include/net/netfilter/nf_nat_helper.h
+++ b/include/net/netfilter/nf_nat_helper.h
@@ -41,7 +41,7 @@ extern int nf_nat_mangle_udp_packet(struct sk_buff *skb,
 
 extern void nf_nat_set_seq_adjust(struct nf_conn *ct,
 				  enum ip_conntrack_info ctinfo,
-				  __be32 seq, s16 off);
+				  __be32 seq, s32 off);
 extern int nf_nat_seq_adjust(struct sk_buff *skb,
 			     struct nf_conn *ct,
 			     enum ip_conntrack_info ctinfo,
@@ -56,11 +56,11 @@ extern int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb,
 extern void nf_nat_follow_master(struct nf_conn *ct,
 				 struct nf_conntrack_expect *this);
 
-extern s16 nf_nat_get_offset(const struct nf_conn *ct,
+extern s32 nf_nat_get_offset(const struct nf_conn *ct,
 			     enum ip_conntrack_dir dir,
 			     u32 seq);
 
 extern void nf_nat_tcp_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
-				  u32 dir, int off);
+				  u32 dir, s32 off);
 
 #endif
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 089e408676fa..0934611ff9f3 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1695,7 +1695,7 @@ err_stat:
 	return ret;
 }
 
-s16 (*nf_ct_nat_offset)(const struct nf_conn *ct,
+s32 (*nf_ct_nat_offset)(const struct nf_conn *ct,
 			enum ip_conntrack_dir dir,
 			u32 seq);
 EXPORT_SYMBOL_GPL(nf_ct_nat_offset);
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 7dcc376eea5f..8f308d896324 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -496,7 +496,7 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
 }
 
 #ifdef CONFIG_NF_NAT_NEEDED
-static inline s16 nat_offset(const struct nf_conn *ct,
+static inline s32 nat_offset(const struct nf_conn *ct,
 			     enum ip_conntrack_dir dir,
 			     u32 seq)
 {
@@ -525,7 +525,7 @@ static bool tcp_in_window(const struct nf_conn *ct,
 	struct ip_ct_tcp_state *receiver = &state->seen[!dir];
 	const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
 	__u32 seq, ack, sack, end, win, swin;
-	s16 receiver_offset;
+	s32 receiver_offset;
 	bool res;
 
 	/*
diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
index a7262ed055c6..ff4a589e3e39 100644
--- a/net/netfilter/nf_nat_helper.c
+++ b/net/netfilter/nf_nat_helper.c
@@ -68,13 +68,13 @@ adjust_tcp_sequence(u32 seq,
 }
 
 /* Get the offset value, for conntrack */
-s16 nf_nat_get_offset(const struct nf_conn *ct,
+s32 nf_nat_get_offset(const struct nf_conn *ct,
 		      enum ip_conntrack_dir dir,
 		      u32 seq)
 {
 	struct nf_conn_nat *nat = nfct_nat(ct);
 	struct nf_nat_seq *this_way;
-	s16 offset;
+	s32 offset;
 
 	if (!nat)
 		return 0;
@@ -143,7 +143,7 @@ static int enlarge_skb(struct sk_buff *skb, unsigned int extra)
 }
 
 void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
-			   __be32 seq, s16 off)
+			   __be32 seq, s32 off)
 {
 	if (!off)
 		return;
@@ -370,7 +370,7 @@ nf_nat_seq_adjust(struct sk_buff *skb,
 	struct tcphdr *tcph;
 	int dir;
 	__be32 newseq, newack;
-	s16 seqoff, ackoff;
+	s32 seqoff, ackoff;
 	struct nf_conn_nat *nat = nfct_nat(ct);
 	struct nf_nat_seq *this_way, *other_way;
 	int res;
-- 
cgit v1.2.3


From 12e7ada385eada77854174ecaf469a0791277ddd Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Sun, 28 Jul 2013 22:54:11 +0200
Subject: netfilter: nf_nat: use per-conntrack locking for sequence number
 adjustments

Get rid of the global lock and use per-conntrack locks for protecting the
sequencen number adjustment data. Additionally saves one lock/unlock
operation for every TCP packet.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_nat_helper.c | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
index ff4a589e3e39..46b9baa845a6 100644
--- a/net/netfilter/nf_nat_helper.c
+++ b/net/netfilter/nf_nat_helper.c
@@ -30,8 +30,6 @@
 	pr_debug("offset_before=%d, offset_after=%d, correction_pos=%u\n", \
 		 x->offset_before, x->offset_after, x->correction_pos);
 
-static DEFINE_SPINLOCK(nf_nat_seqofs_lock);
-
 /* Setup TCP sequence correction given this change at this sequence */
 static inline void
 adjust_tcp_sequence(u32 seq,
@@ -49,7 +47,7 @@ adjust_tcp_sequence(u32 seq,
 	pr_debug("adjust_tcp_sequence: Seq_offset before: ");
 	DUMP_OFFSET(this_way);
 
-	spin_lock_bh(&nf_nat_seqofs_lock);
+	spin_lock_bh(&ct->lock);
 
 	/* SYN adjust. If it's uninitialized, or this is after last
 	 * correction, record it: we don't handle more than one
@@ -61,31 +59,26 @@ adjust_tcp_sequence(u32 seq,
 		this_way->offset_before = this_way->offset_after;
 		this_way->offset_after += sizediff;
 	}
-	spin_unlock_bh(&nf_nat_seqofs_lock);
+	spin_unlock_bh(&ct->lock);
 
 	pr_debug("adjust_tcp_sequence: Seq_offset after: ");
 	DUMP_OFFSET(this_way);
 }
 
-/* Get the offset value, for conntrack */
+/* Get the offset value, for conntrack. Caller must have the conntrack locked */
 s32 nf_nat_get_offset(const struct nf_conn *ct,
 		      enum ip_conntrack_dir dir,
 		      u32 seq)
 {
 	struct nf_conn_nat *nat = nfct_nat(ct);
 	struct nf_nat_seq *this_way;
-	s32 offset;
 
 	if (!nat)
 		return 0;
 
 	this_way = &nat->seq[dir];
-	spin_lock_bh(&nf_nat_seqofs_lock);
-	offset = after(seq, this_way->correction_pos)
+	return after(seq, this_way->correction_pos)
 		 ? this_way->offset_after : this_way->offset_before;
-	spin_unlock_bh(&nf_nat_seqofs_lock);
-
-	return offset;
 }
 
 /* Frobs data inside this packet, which is linear. */
@@ -384,7 +377,7 @@ nf_nat_seq_adjust(struct sk_buff *skb,
 		return 0;
 
 	tcph = (void *)skb->data + protoff;
-	spin_lock_bh(&nf_nat_seqofs_lock);
+	spin_lock_bh(&ct->lock);
 	if (after(ntohl(tcph->seq), this_way->correction_pos))
 		seqoff = this_way->offset_after;
 	else
@@ -410,7 +403,7 @@ nf_nat_seq_adjust(struct sk_buff *skb,
 	tcph->ack_seq = newack;
 
 	res = nf_nat_sack_adjust(skb, protoff, tcph, ct, ctinfo);
-	spin_unlock_bh(&nf_nat_seqofs_lock);
+	spin_unlock_bh(&ct->lock);
 
 	return res;
 }
-- 
cgit v1.2.3


From c0155b2da4cd6583b1b729451249ca346e1c05a2 Mon Sep 17 00:00:00 2001
From: Dmitry Popov <dp@highloadlab.com>
Date: Wed, 31 Jul 2013 13:39:45 +0400
Subject: tcp: Remove unused tcpct declarations and comments

Remove declaration, 4 defines and confusing comment that are no longer used
since 1a2c6181c4 ("tcp: Remove TCPCT").

Signed-off-by: Dmitry Popov <dp@highloadlab.com>
Acked-by: Christoph Paasch <christoph.paasch@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h | 1 -
 include/net/tcp.h   | 4 ----
 net/ipv4/tcp.c      | 4 ----
 3 files changed, 9 deletions(-)

(limited to 'net')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 9640803a17a7..d68633452d9b 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -107,7 +107,6 @@ static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
  * only four options will fit in a standard TCP header */
 #define TCP_NUM_SACKS 4
 
-struct tcp_cookie_values;
 struct tcp_request_sock_ops;
 
 struct tcp_request_sock {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 18fc999dae3c..27b652f379ef 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -192,10 +192,6 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define TCPOLEN_TIMESTAMP      10
 #define TCPOLEN_MD5SIG         18
 #define TCPOLEN_EXP_FASTOPEN_BASE  4
-#define TCPOLEN_COOKIE_BASE    2	/* Cookie-less header extension */
-#define TCPOLEN_COOKIE_PAIR    3	/* Cookie pair header extension */
-#define TCPOLEN_COOKIE_MIN     (TCPOLEN_COOKIE_BASE+TCP_COOKIE_MIN)
-#define TCPOLEN_COOKIE_MAX     (TCPOLEN_COOKIE_BASE+TCP_COOKIE_MAX)
 
 /* But this is what stacks really send out. */
 #define TCPOLEN_TSTAMP_ALIGNED		12
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c27e81392398..ab64eea042fa 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -410,10 +410,6 @@ void tcp_init_sock(struct sock *sk)
 
 	icsk->icsk_sync_mss = tcp_sync_mss;
 
-	/* Presumed zeroed, in order of appearance:
-	 *	cookie_in_always, cookie_out_never,
-	 *	s_data_constant, s_data_in, s_data_out
-	 */
 	sk->sk_sndbuf = sysctl_tcp_wmem[1];
 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
 
-- 
cgit v1.2.3


From ca4c3fc24e293719fe7410c4e63da9b6bc633b83 Mon Sep 17 00:00:00 2001
From: "fan.du" <fan.du@windriver.com>
Date: Tue, 30 Jul 2013 08:33:53 +0800
Subject: net: split rt_genid for ipv4 and ipv6

Current net name space has only one genid for both IPv4 and IPv6, it has below
drawbacks:

- Add/delete an IPv4 address will invalidate all IPv6 routing table entries.
- Insert/remove XFRM policy will also invalidate both IPv4/IPv6 routing table
  entries even when the policy is only applied for one address family.

Thus, this patch attempt to split one genid for two to cater for IPv4 and IPv6
separately in a fine granularity.

Signed-off-by: Fan Du <fan.du@windriver.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/net_namespace.h     | 37 ++++++++++++++++++++++++++++++++-----
 include/net/netns/ipv4.h        |  1 +
 include/net/netns/ipv6.h        |  1 +
 net/ipv4/route.c                | 16 ++++++++--------
 net/ipv6/af_inet6.c             |  1 +
 net/ipv6/route.c                |  4 ++--
 net/xfrm/xfrm_policy.c          |  8 +++++++-
 security/selinux/include/xfrm.h |  7 ++++++-
 8 files changed, 58 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 84e37b1ca9e1..1313456a0994 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -119,7 +119,6 @@ struct net {
 	struct netns_ipvs	*ipvs;
 #endif
 	struct sock		*diag_nlsk;
-	atomic_t		rt_genid;
 	atomic_t		fnhe_genid;
 };
 
@@ -333,14 +332,42 @@ static inline void unregister_net_sysctl_table(struct ctl_table_header *header)
 }
 #endif
 
-static inline int rt_genid(struct net *net)
+static inline int rt_genid_ipv4(struct net *net)
 {
-	return atomic_read(&net->rt_genid);
+	return atomic_read(&net->ipv4.rt_genid);
 }
 
-static inline void rt_genid_bump(struct net *net)
+static inline void rt_genid_bump_ipv4(struct net *net)
 {
-	atomic_inc(&net->rt_genid);
+	atomic_inc(&net->ipv4.rt_genid);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static inline int rt_genid_ipv6(struct net *net)
+{
+	return atomic_read(&net->ipv6.rt_genid);
+}
+
+static inline void rt_genid_bump_ipv6(struct net *net)
+{
+	atomic_inc(&net->ipv6.rt_genid);
+}
+#else
+static inline int rt_genid_ipv6(struct net *net)
+{
+	return 0;
+}
+
+static inline void rt_genid_bump_ipv6(struct net *net)
+{
+}
+#endif
+
+/* For callers who don't really care about whether it's IPv4 or IPv6 */
+static inline void rt_genid_bump_all(struct net *net)
+{
+	rt_genid_bump_ipv4(net);
+	rt_genid_bump_ipv6(net);
 }
 
 static inline int fnhe_genid(struct net *net)
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 2ba9de89e8ec..bf2ec2202c56 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -77,5 +77,6 @@ struct netns_ipv4 {
 	struct fib_rules_ops	*mr_rules_ops;
 #endif
 #endif
+	atomic_t	rt_genid;
 };
 #endif
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 005e2c2e39a9..0fb2401197c5 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -72,6 +72,7 @@ struct netns_ipv6 {
 #endif
 #endif
 	atomic_t		dev_addr_genid;
+	atomic_t		rt_genid;
 };
 
 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a9a54a236832..e805481eff72 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -435,12 +435,12 @@ static inline int ip_rt_proc_init(void)
 
 static inline bool rt_is_expired(const struct rtable *rth)
 {
-	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
+	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 }
 
 void rt_cache_flush(struct net *net)
 {
-	rt_genid_bump(net);
+	rt_genid_bump_ipv4(net);
 }
 
 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
@@ -1458,7 +1458,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 #endif
 	rth->dst.output = ip_rt_bug;
 
-	rth->rt_genid	= rt_genid(dev_net(dev));
+	rth->rt_genid	= rt_genid_ipv4(dev_net(dev));
 	rth->rt_flags	= RTCF_MULTICAST;
 	rth->rt_type	= RTN_MULTICAST;
 	rth->rt_is_input= 1;
@@ -1589,7 +1589,7 @@ static int __mkroute_input(struct sk_buff *skb,
 		goto cleanup;
 	}
 
-	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
+	rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
 	rth->rt_flags = flags;
 	rth->rt_type = res->type;
 	rth->rt_is_input = 1;
@@ -1760,7 +1760,7 @@ local_input:
 	rth->dst.tclassid = itag;
 #endif
 
-	rth->rt_genid = rt_genid(net);
+	rth->rt_genid = rt_genid_ipv4(net);
 	rth->rt_flags 	= flags|RTCF_LOCAL;
 	rth->rt_type	= res.type;
 	rth->rt_is_input = 1;
@@ -1945,7 +1945,7 @@ add:
 
 	rth->dst.output = ip_output;
 
-	rth->rt_genid = rt_genid(dev_net(dev_out));
+	rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
 	rth->rt_flags	= flags;
 	rth->rt_type	= type;
 	rth->rt_is_input = 0;
@@ -2227,7 +2227,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		rt->rt_iif = ort->rt_iif;
 		rt->rt_pmtu = ort->rt_pmtu;
 
-		rt->rt_genid = rt_genid(net);
+		rt->rt_genid = rt_genid_ipv4(net);
 		rt->rt_flags = ort->rt_flags;
 		rt->rt_type = ort->rt_type;
 		rt->rt_gateway = ort->rt_gateway;
@@ -2665,7 +2665,7 @@ static __net_initdata struct pernet_operations sysctl_route_ops = {
 
 static __net_init int rt_genid_init(struct net *net)
 {
-	atomic_set(&net->rt_genid, 0);
+	atomic_set(&net->ipv4.rt_genid, 0);
 	atomic_set(&net->fnhe_genid, 0);
 	get_random_bytes(&net->ipv4.dev_addr_genid,
 			 sizeof(net->ipv4.dev_addr_genid));
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index a5ac969aeefe..0d1a9b153fbb 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -766,6 +766,7 @@ static int __net_init inet6_net_init(struct net *net)
 
 	net->ipv6.sysctl.bindv6only = 0;
 	net->ipv6.sysctl.icmpv6_time = 1*HZ;
+	atomic_set(&net->ipv6.rt_genid, 0);
 
 	err = ipv6_init_mibs(net);
 	if (err)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 74ab1f74abcd..ce9616304521 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -283,7 +283,7 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net,
 
 		memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
 		rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
-		rt->rt6i_genid = rt_genid(net);
+		rt->rt6i_genid = rt_genid_ipv6(net);
 		INIT_LIST_HEAD(&rt->rt6i_siblings);
 	}
 	return rt;
@@ -1061,7 +1061,7 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
 	 * into this function always.
 	 */
-	if (rt->rt6i_genid != rt_genid(dev_net(rt->dst.dev)))
+	if (rt->rt6i_genid != rt_genid_ipv6(dev_net(rt->dst.dev)))
 		return NULL;
 
 	if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index e52cab3591dd..d8da6b8c6ba8 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -660,7 +660,13 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 	xfrm_pol_hold(policy);
 	net->xfrm.policy_count[dir]++;
 	atomic_inc(&flow_cache_genid);
-	rt_genid_bump(net);
+
+	/* After previous checking, family can either be AF_INET or AF_INET6 */
+	if (policy->family == AF_INET)
+		rt_genid_bump_ipv4(net);
+	else
+		rt_genid_bump_ipv6(net);
+
 	if (delpol) {
 		xfrm_policy_requeue(delpol, policy);
 		__xfrm_policy_unlink(delpol, dir);
diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h
index 65f67cb0aefb..6713f04e30ba 100644
--- a/security/selinux/include/xfrm.h
+++ b/security/selinux/include/xfrm.h
@@ -50,8 +50,13 @@ int selinux_xfrm_decode_session(struct sk_buff *skb, u32 *sid, int ckall);
 
 static inline void selinux_xfrm_notify_policyload(void)
 {
+	struct net *net;
+
 	atomic_inc(&flow_cache_genid);
-	rt_genid_bump(&init_net);
+	rtnl_lock();
+	for_each_net(net)
+		rt_genid_bump_all(net);
+	rtnl_unlock();
 }
 #else
 static inline int selinux_xfrm_enabled(void)
-- 
cgit v1.2.3


From f2f872f9272a79a1048877ea14c15576f46c225e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 30 Jul 2013 17:55:08 -0700
Subject: netem: Introduce skb_orphan_partial() helper

Commit 547669d483e578 ("tcp: xps: fix reordering issues") added
unexpected reorders in case netem is used in a MQ setup for high
performance test bed.

ETH=eth0
tc qd del dev $ETH root 2>/dev/null
tc qd add dev $ETH root handle 1: mq
for i in `seq 1 32`
do
 tc qd add dev $ETH parent 1:$i netem delay 100ms
done

As all tcp packets are orphaned by netem, TCP stack believes it can
set skb->ooo_okay on all packets.

In order to allow producers to send more packets, we want to
keep sk_wmem_alloc from reaching sk_sndbuf limit.

We can do that by accounting one byte per skb in netem queues,
so that TCP stack is not fooled too much.

Tested:

With above MQ/netem setup, scaling number of concurrent flows gives
linear results and no reorders/retransmits

lpq83:~# for n in 1 10 20 30 40 50 60 70 80 90 100
 do echo -n "n:$n " ; ./super_netperf $n -H 10.7.7.84; done
n:1 198.46
n:10 2002.69
n:20 4000.98
n:30 6006.35
n:40 8020.93
n:50 10032.3
n:60 12081.9
n:70 13971.3
n:80 16009.7
n:90 17117.3
n:100 17425.5

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h    |  1 +
 net/core/sock.c       | 19 +++++++++++++++++++
 net/sched/sch_netem.c |  5 +----
 3 files changed, 21 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/include/net/sock.h b/include/net/sock.h
index b9f2b095b1ab..53d4714709a1 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1520,6 +1520,7 @@ extern struct sk_buff		*sock_rmalloc(struct sock *sk,
 					      unsigned long size, int force,
 					      gfp_t priority);
 extern void			sock_wfree(struct sk_buff *skb);
+extern void			skb_orphan_partial(struct sk_buff *skb);
 extern void			sock_rfree(struct sk_buff *skb);
 extern void			sock_edemux(struct sk_buff *skb);
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 85e8de1bc7fd..a753d97434dc 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1576,6 +1576,25 @@ void sock_wfree(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(sock_wfree);
 
+void skb_orphan_partial(struct sk_buff *skb)
+{
+	/* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
+	 * so we do not completely orphan skb, but transfert all
+	 * accounted bytes but one, to avoid unexpected reorders.
+	 */
+	if (skb->destructor == sock_wfree
+#ifdef CONFIG_INET
+	    || skb->destructor == tcp_wfree
+#endif
+		) {
+		atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
+		skb->truesize = 1;
+	} else {
+		skb_orphan(skb);
+	}
+}
+EXPORT_SYMBOL(skb_orphan_partial);
+
 /*
  * Read buffer destructor automatically called from kfree_skb.
  */
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 82f6016d89ab..a6d788d45216 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -412,12 +412,9 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 
 	/* If a delay is expected, orphan the skb. (orphaning usually takes
 	 * place at TX completion time, so _before_ the link transit delay)
-	 * Ideally, this orphaning should be done after the rate limiting
-	 * module, because this breaks TCP Small Queue, and other mechanisms
-	 * based on socket sk_wmem_alloc.
 	 */
 	if (q->latency || q->jitter)
-		skb_orphan(skb);
+		skb_orphan_partial(skb);
 
 	/*
 	 * If we need to duplicate packet, then re-insert at top of the
-- 
cgit v1.2.3


From 7764a45a8f1fe74d4f7d301eaca2e558e7e2831a Mon Sep 17 00:00:00 2001
From: Stefan Tomanek <stefan.tomanek@wertarbyte.de>
Date: Thu, 1 Aug 2013 02:17:15 +0200
Subject: fib_rules: add .suppress operation

This change adds a new operation to the fib_rules_ops struct; it allows the
suppression of routing decisions if certain criteria are not met by its
results.

The first implemented constraint is a minimum prefix length added to the
structures of routing rules. If a rule is added with a minimum prefix length
>0, only routes meeting this threshold will be considered. Any other (more
general) routing table entries will be ignored.

When configuring a system with multiple network uplinks and default routes, it
is often convinient to reference the main routing table multiple times - but
omitting the default route. Using this patch and a modified "ip" utility, this
can be achieved by using the following command sequence:

  $ ip route add table secuplink default via 10.42.23.1

  $ ip rule add pref 100            table main prefixlength 1
  $ ip rule add pref 150 fwmark 0xA table secuplink

With this setup, packets marked 0xA will be processed by the additional routing
table "secuplink", but only if no suitable route in the main routing table can
be found. By using a minimal prefixlength of 1, the default route (/0) of the
table "main" is hidden to packets processed by rule 100; packets traveling to
destinations with more specific routing entries are processed as usual.

Signed-off-by: Stefan Tomanek <stefan.tomanek@wertarbyte.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/fib_rules.h        |  4 ++++
 include/uapi/linux/fib_rules.h |  2 +-
 net/core/fib_rules.c           |  8 ++++++++
 net/ipv4/fib_rules.c           | 14 ++++++++++++++
 net/ipv6/fib6_rules.c          | 13 +++++++++++++
 5 files changed, 40 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index e361f4882426..2f286dce9259 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -18,6 +18,7 @@ struct fib_rule {
 	u32			pref;
 	u32			flags;
 	u32			table;
+	u8			table_prefixlen_min;
 	u8			action;
 	u32			target;
 	struct fib_rule __rcu	*ctarget;
@@ -46,6 +47,8 @@ struct fib_rules_ops {
 	int			(*action)(struct fib_rule *,
 					  struct flowi *, int,
 					  struct fib_lookup_arg *);
+	bool			(*suppress)(struct fib_rule *,
+					    struct fib_lookup_arg *);
 	int			(*match)(struct fib_rule *,
 					 struct flowi *, int);
 	int			(*configure)(struct fib_rule *,
@@ -80,6 +83,7 @@ struct fib_rules_ops {
 	[FRA_FWMARK]	= { .type = NLA_U32 }, \
 	[FRA_FWMASK]	= { .type = NLA_U32 }, \
 	[FRA_TABLE]     = { .type = NLA_U32 }, \
+	[FRA_TABLE_PREFIXLEN_MIN] = { .type = NLA_U8 }, \
 	[FRA_GOTO]	= { .type = NLA_U32 }
 
 static inline void fib_rule_get(struct fib_rule *rule)
diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h
index 51da65b68b85..59cd31b3455e 100644
--- a/include/uapi/linux/fib_rules.h
+++ b/include/uapi/linux/fib_rules.h
@@ -45,7 +45,7 @@ enum {
 	FRA_FLOW,	/* flow/class id */
 	FRA_UNUSED6,
 	FRA_UNUSED7,
-	FRA_UNUSED8,
+	FRA_TABLE_PREFIXLEN_MIN,
 	FRA_TABLE,	/* Extended table id */
 	FRA_FWMASK,	/* mask for netfilter mark */
 	FRA_OIFNAME,
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 21735440c44a..2ef5040c99c8 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -226,6 +226,9 @@ jumped:
 		else
 			err = ops->action(rule, fl, flags, arg);
 
+		if (!err && ops->suppress && ops->suppress(rule, arg))
+			continue;
+
 		if (err != -EAGAIN) {
 			if ((arg->flags & FIB_LOOKUP_NOREF) ||
 			    likely(atomic_inc_not_zero(&rule->refcnt))) {
@@ -337,6 +340,8 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
 	rule->action = frh->action;
 	rule->flags = frh->flags;
 	rule->table = frh_get_table(frh, tb);
+	if (tb[FRA_TABLE_PREFIXLEN_MIN])
+		rule->table_prefixlen_min = nla_get_u8(tb[FRA_TABLE_PREFIXLEN_MIN]);
 
 	if (!tb[FRA_PRIORITY] && ops->default_pref)
 		rule->pref = ops->default_pref(ops);
@@ -523,6 +528,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
 			 + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */
 			 + nla_total_size(4) /* FRA_PRIORITY */
 			 + nla_total_size(4) /* FRA_TABLE */
+			 + nla_total_size(1) /* FRA_TABLE_PREFIXLEN_MIN */
 			 + nla_total_size(4) /* FRA_FWMARK */
 			 + nla_total_size(4); /* FRA_FWMASK */
 
@@ -548,6 +554,8 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
 	frh->table = rule->table;
 	if (nla_put_u32(skb, FRA_TABLE, rule->table))
 		goto nla_put_failure;
+	if (nla_put_u8(skb, FRA_TABLE_PREFIXLEN_MIN, rule->table_prefixlen_min))
+		goto nla_put_failure;
 	frh->res1 = 0;
 	frh->res2 = 0;
 	frh->action = rule->action;
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 26aa65d1fce4..9f2906679d1f 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -101,6 +101,19 @@ errout:
 	return err;
 }
 
+static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
+{
+	/* do not accept result if the route does
+	 * not meet the required prefix length
+	 */
+	struct fib_result *result = (struct fib_result *) arg->result;
+	if (result->prefixlen < rule->table_prefixlen_min) {
+		if (!(arg->flags & FIB_LOOKUP_NOREF))
+			fib_info_put(result->fi);
+		return true;
+	}
+	return false;
+}
 
 static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
 {
@@ -267,6 +280,7 @@ static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = {
 	.rule_size	= sizeof(struct fib4_rule),
 	.addr_size	= sizeof(u32),
 	.action		= fib4_rule_action,
+	.suppress	= fib4_rule_suppress,
 	.match		= fib4_rule_match,
 	.configure	= fib4_rule_configure,
 	.delete		= fib4_rule_delete,
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 2e1a432867c0..e64e6a55fc4a 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -111,6 +111,18 @@ out:
 	return rt == NULL ? -EAGAIN : 0;
 }
 
+static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
+{
+	struct rt6_info *rt = (struct rt6_info *) arg->result;
+	/* do not accept result if the route does
+	 * not meet the required prefix length
+	 */
+	if (rt->rt6i_dst.plen < rule->table_prefixlen_min) {
+		ip6_rt_put(rt);
+		return true;
+	}
+	return false;
+}
 
 static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
 {
@@ -244,6 +256,7 @@ static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = {
 	.addr_size		= sizeof(struct in6_addr),
 	.action			= fib6_rule_action,
 	.match			= fib6_rule_match,
+	.suppress		= fib6_rule_suppress,
 	.configure		= fib6_rule_configure,
 	.compare		= fib6_rule_compare,
 	.fill			= fib6_rule_fill,
-- 
cgit v1.2.3


From 46b3a421903aa2a4bc9731ca4663cee3ea869dab Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Thu, 1 Aug 2013 08:54:47 +0200
Subject: ipv6: fib6_rules should return exact return value

With the addition of the suppress operation
(7764a45a8f1fe74d4f7d301eaca2e558e7e2831a ("fib_rules: add .suppress
operation") we rely on accurate error reporting of the fib_rules.actions.

fib6_rule_action always returned -EAGAIN in case we could not find a
matching route and 0 if a rule was matched. This also included a match
for blackhole or prohibited rule actions which could get suppressed by
the new logic.

So adapt fib6_rule_action to always return the correct error code as
its counterpart fib4_rule_action does. This also fixes a possiblity of
nullptr-deref where we don't find a table, thus rt == NULL. Because
the condition rt != ip6_null_entry still holdes it seems we could later
get a nullptr bug on dereference rt->dst.

v2:
a) Fixed a brain fart in the commit msg (the rule => a table, etc). No
   changes to the patch.

Cc: Stefan Tomanek <stefan.tomanek@wertarbyte.de>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/fib6_rules.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index e64e6a55fc4a..554a4fbabfb3 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -55,26 +55,33 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
 	struct fib6_table *table;
 	struct net *net = rule->fr_net;
 	pol_lookup_t lookup = arg->lookup_ptr;
+	int err = 0;
 
 	switch (rule->action) {
 	case FR_ACT_TO_TBL:
 		break;
 	case FR_ACT_UNREACHABLE:
+		err = -ENETUNREACH;
 		rt = net->ipv6.ip6_null_entry;
 		goto discard_pkt;
 	default:
 	case FR_ACT_BLACKHOLE:
+		err = -EINVAL;
 		rt = net->ipv6.ip6_blk_hole_entry;
 		goto discard_pkt;
 	case FR_ACT_PROHIBIT:
+		err = -EACCES;
 		rt = net->ipv6.ip6_prohibit_entry;
 		goto discard_pkt;
 	}
 
 	table = fib6_get_table(net, rule->table);
-	if (table)
-		rt = lookup(net, table, flp6, flags);
+	if (!table) {
+		err = -EAGAIN;
+		goto out;
+	}
 
+	rt = lookup(net, table, flp6, flags);
 	if (rt != net->ipv6.ip6_null_entry) {
 		struct fib6_rule *r = (struct fib6_rule *)rule;
 
@@ -101,6 +108,7 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
 	}
 again:
 	ip6_rt_put(rt);
+	err = -EAGAIN;
 	rt = NULL;
 	goto out;
 
@@ -108,7 +116,7 @@ discard_pkt:
 	dst_hold(&rt->dst);
 out:
 	arg->result = rt;
-	return rt == NULL ? -EAGAIN : 0;
+	return err;
 }
 
 static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
-- 
cgit v1.2.3


From e47f2509e5f182f4df144406de6f2bc78179d57e Mon Sep 17 00:00:00 2001
From: Fabio Baltieri <fabio.baltieri@gmail.com>
Date: Thu, 25 Jul 2013 12:00:26 +0200
Subject: mac80211: use oneshot blink API for LED triggers

Change mac80211 LED trigger code to use the generic
led_trigger_blink_oneshot() API for transmit and receive activity
indication.

This gives a better feedback to the user, as with the new API each
activity event results in a visible blink, while a constant traffic
results in a continuous blink at constant rate.

Signed-off-by: Fabio Baltieri <fabio.baltieri@gmail.com>
[fix LED disabled build error]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/ieee80211_i.h |  1 -
 net/mac80211/led.c         | 19 +++++++------------
 net/mac80211/led.h         |  4 ++--
 net/mac80211/status.c      |  2 +-
 net/mac80211/tx.c          |  1 -
 5 files changed, 10 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 3d32df1fbc6d..d779383c52d2 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1094,7 +1094,6 @@ struct ieee80211_local {
 	u32 dot11TransmittedFrameCount;
 
 #ifdef CONFIG_MAC80211_LEDS
-	int tx_led_counter, rx_led_counter;
 	struct led_trigger *tx_led, *rx_led, *assoc_led, *radio_led;
 	struct tpt_led_trigger *tpt_led_trigger;
 	char tx_led_name[32], rx_led_name[32],
diff --git a/net/mac80211/led.c b/net/mac80211/led.c
index bcffa6903129..e2b836446af3 100644
--- a/net/mac80211/led.c
+++ b/net/mac80211/led.c
@@ -12,27 +12,22 @@
 #include <linux/export.h>
 #include "led.h"
 
+#define MAC80211_BLINK_DELAY 50 /* ms */
+
 void ieee80211_led_rx(struct ieee80211_local *local)
 {
+	unsigned long led_delay = MAC80211_BLINK_DELAY;
 	if (unlikely(!local->rx_led))
 		return;
-	if (local->rx_led_counter++ % 2 == 0)
-		led_trigger_event(local->rx_led, LED_OFF);
-	else
-		led_trigger_event(local->rx_led, LED_FULL);
+	led_trigger_blink_oneshot(local->rx_led, &led_delay, &led_delay, 0);
 }
 
-/* q is 1 if a packet was enqueued, 0 if it has been transmitted */
-void ieee80211_led_tx(struct ieee80211_local *local, int q)
+void ieee80211_led_tx(struct ieee80211_local *local)
 {
+	unsigned long led_delay = MAC80211_BLINK_DELAY;
 	if (unlikely(!local->tx_led))
 		return;
-	/* not sure how this is supposed to work ... */
-	local->tx_led_counter += 2*q-1;
-	if (local->tx_led_counter % 2 == 0)
-		led_trigger_event(local->tx_led, LED_OFF);
-	else
-		led_trigger_event(local->tx_led, LED_FULL);
+	led_trigger_blink_oneshot(local->tx_led, &led_delay, &led_delay, 0);
 }
 
 void ieee80211_led_assoc(struct ieee80211_local *local, bool associated)
diff --git a/net/mac80211/led.h b/net/mac80211/led.h
index e0275d9befa8..89f4344f13b9 100644
--- a/net/mac80211/led.h
+++ b/net/mac80211/led.h
@@ -13,7 +13,7 @@
 
 #ifdef CONFIG_MAC80211_LEDS
 void ieee80211_led_rx(struct ieee80211_local *local);
-void ieee80211_led_tx(struct ieee80211_local *local, int q);
+void ieee80211_led_tx(struct ieee80211_local *local);
 void ieee80211_led_assoc(struct ieee80211_local *local,
 			 bool associated);
 void ieee80211_led_radio(struct ieee80211_local *local,
@@ -27,7 +27,7 @@ void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local,
 static inline void ieee80211_led_rx(struct ieee80211_local *local)
 {
 }
-static inline void ieee80211_led_tx(struct ieee80211_local *local, int q)
+static inline void ieee80211_led_tx(struct ieee80211_local *local)
 {
 }
 static inline void ieee80211_led_assoc(struct ieee80211_local *local,
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 6ad4c14385ef..8eabfd96ca36 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -564,7 +564,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
 
 	rcu_read_unlock();
 
-	ieee80211_led_tx(local, 0);
+	ieee80211_led_tx(local);
 
 	/* SNMP counters
 	 * Fragments are passed to low-level drivers as separate skbs, so these
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index be4d3caf4879..f65873f0c89f 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1300,7 +1300,6 @@ static bool __ieee80211_tx(struct ieee80211_local *local,
 				    txpending);
 
 	ieee80211_tpt_led_trig_tx(local, fc, led_len);
-	ieee80211_led_tx(local, 1);
 
 	WARN_ON_ONCE(!skb_queue_empty(skbs));
 
-- 
cgit v1.2.3


From dad9defd28912dc408763d461b9a6b1a762c07ea Mon Sep 17 00:00:00 2001
From: Antonio Quartulli <antonio@open-mesh.com>
Date: Fri, 26 Jul 2013 21:15:02 +0200
Subject: mac80211: ibss - remove not authorized station earlier

A station which is not authorized has to be purged earlier
to give it a chance to re-try to establish an IBSS/RSN
session soon. Set the timeout to 10 seconds.

Some refactoring has also been done to allow the IBSS
submodule to have its own expiring function.

Reported-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: Antonio Quartulli <antonio@open-mesh.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/ibss.c | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index 5e6836c3aa4c..e08387cdc8fd 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -30,6 +30,7 @@
 
 #define IEEE80211_IBSS_MERGE_INTERVAL (30 * HZ)
 #define IEEE80211_IBSS_INACTIVITY_LIMIT (60 * HZ)
+#define IEEE80211_IBSS_RSN_INACTIVITY_LIMIT (10 * HZ)
 
 #define IEEE80211_IBSS_MAX_STA_ENTRIES 128
 
@@ -740,6 +741,33 @@ static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata)
 	return active;
 }
 
+static void ieee80211_ibss_sta_expire(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct sta_info *sta, *tmp;
+	unsigned long exp_time = IEEE80211_IBSS_INACTIVITY_LIMIT;
+	unsigned long exp_rsn_time = IEEE80211_IBSS_RSN_INACTIVITY_LIMIT;
+
+	mutex_lock(&local->sta_mtx);
+
+	list_for_each_entry_safe(sta, tmp, &local->sta_list, list) {
+		if (sdata != sta->sdata)
+			continue;
+
+		if (time_after(jiffies, sta->last_rx + exp_time) ||
+		    (time_after(jiffies, sta->last_rx + exp_rsn_time) &&
+		     sta->sta_state != IEEE80211_STA_AUTHORIZED)) {
+			sta_dbg(sta->sdata, "expiring inactive %sSTA %pM\n",
+				sta->sta_state != IEEE80211_STA_AUTHORIZED ?
+				"not authorized " : "", sta->sta.addr);
+
+			WARN_ON(__sta_info_destroy(sta));
+		}
+	}
+
+	mutex_unlock(&local->sta_mtx);
+}
+
 /*
  * This function is called with state == IEEE80211_IBSS_MLME_JOINED
  */
@@ -754,7 +782,7 @@ static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata)
 	mod_timer(&ifibss->timer,
 		  round_jiffies(jiffies + IEEE80211_IBSS_MERGE_INTERVAL));
 
-	ieee80211_sta_expire(sdata, IEEE80211_IBSS_INACTIVITY_LIMIT);
+	ieee80211_ibss_sta_expire(sdata);
 
 	if (time_before(jiffies, ifibss->last_scan_completed +
 		       IEEE80211_IBSS_MERGE_INTERVAL))
-- 
cgit v1.2.3


From a824131017a2c3c8f275c6eb46740cfb8a43f7c5 Mon Sep 17 00:00:00 2001
From: Karl Beldan <karl.beldan@rivierawaves.com>
Date: Sat, 27 Jul 2013 11:47:04 +0200
Subject: mac80211: report some VHT radiotap infos for tx status

The radiotap VHT info is 12 bytes (required to be aligned on 2) :
u16 known - IEEE80211_RADIOTAP_VHT_KNOWN_*
u8 flags - IEEE80211_RADIOTAP_VHT_FLAG_*
u8 bandwidth
u8 mcs_nss[4]
u8 coding
u8 group_id
u16 partial_aid

ATM mac80211 can handle IEEE80211_RADIOTAP_VHT_KNOWN_{GI,BANDWIDTH} and
mcs_nss[0] (i.e single user) in simple cases.
This is more a placeholder to let sniffers give more clues for VHT,
since we don't have yet the proper infrastructure/conventions
in mac80211 for complete feedback (e.g consider dynamic BW).

Signed-off-by: Karl Beldan <karl.beldan@rivierawaves.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/status.c | 76 ++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 63 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 8eabfd96ca36..368837fe3b80 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -235,7 +235,8 @@ static int ieee80211_tx_radiotap_len(struct ieee80211_tx_info *info)
 
 	/* IEEE80211_RADIOTAP_RATE rate */
 	if (info->status.rates[0].idx >= 0 &&
-	    !(info->status.rates[0].flags & IEEE80211_TX_RC_MCS))
+	    !(info->status.rates[0].flags & (IEEE80211_TX_RC_MCS |
+					     IEEE80211_TX_RC_VHT_MCS)))
 		len += 2;
 
 	/* IEEE80211_RADIOTAP_TX_FLAGS */
@@ -244,16 +245,21 @@ static int ieee80211_tx_radiotap_len(struct ieee80211_tx_info *info)
 	/* IEEE80211_RADIOTAP_DATA_RETRIES */
 	len += 1;
 
-	/* IEEE80211_TX_RC_MCS */
-	if (info->status.rates[0].idx >= 0 &&
-	    info->status.rates[0].flags & IEEE80211_TX_RC_MCS)
-		len += 3;
+	/* IEEE80211_RADIOTAP_MCS
+	 * IEEE80211_RADIOTAP_VHT */
+	if (info->status.rates[0].idx >= 0) {
+		if (info->status.rates[0].flags & IEEE80211_TX_RC_MCS)
+			len += 3;
+		else if (info->status.rates[0].flags & IEEE80211_TX_RC_VHT_MCS)
+			len = ALIGN(len, 2) + 12;
+	}
 
 	return len;
 }
 
 static void
-ieee80211_add_tx_radiotap_header(struct ieee80211_supported_band *sband,
+ieee80211_add_tx_radiotap_header(struct ieee80211_local *local,
+				 struct ieee80211_supported_band *sband,
 				 struct sk_buff *skb, int retry_count,
 				 int rtap_len, int shift)
 {
@@ -280,7 +286,8 @@ ieee80211_add_tx_radiotap_header(struct ieee80211_supported_band *sband,
 
 	/* IEEE80211_RADIOTAP_RATE */
 	if (info->status.rates[0].idx >= 0 &&
-	    !(info->status.rates[0].flags & IEEE80211_TX_RC_MCS)) {
+	    !(info->status.rates[0].flags & (IEEE80211_TX_RC_MCS |
+					     IEEE80211_TX_RC_VHT_MCS))) {
 		u16 rate;
 
 		rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_RATE);
@@ -310,9 +317,12 @@ ieee80211_add_tx_radiotap_header(struct ieee80211_supported_band *sband,
 	*pos = retry_count;
 	pos++;
 
-	/* IEEE80211_TX_RC_MCS */
-	if (info->status.rates[0].idx >= 0 &&
-	    info->status.rates[0].flags & IEEE80211_TX_RC_MCS) {
+	if (info->status.rates[0].idx < 0)
+		return;
+
+	/* IEEE80211_RADIOTAP_MCS
+	 * IEEE80211_RADIOTAP_VHT */
+	if (info->status.rates[0].flags & IEEE80211_TX_RC_MCS) {
 		rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_MCS);
 		pos[0] = IEEE80211_RADIOTAP_MCS_HAVE_MCS |
 			 IEEE80211_RADIOTAP_MCS_HAVE_GI |
@@ -325,8 +335,48 @@ ieee80211_add_tx_radiotap_header(struct ieee80211_supported_band *sband,
 			pos[1] |= IEEE80211_RADIOTAP_MCS_FMT_GF;
 		pos[2] = info->status.rates[0].idx;
 		pos += 3;
-	}
+	} else if (info->status.rates[0].flags & IEEE80211_TX_RC_VHT_MCS) {
+		u16 known = local->hw.radiotap_vht_details &
+			(IEEE80211_RADIOTAP_VHT_KNOWN_GI |
+			 IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH);
+
+		rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_VHT);
+
+		/* required alignment from rthdr */
+		pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2);
 
+		/* u16 known - IEEE80211_RADIOTAP_VHT_KNOWN_* */
+		put_unaligned_le16(known, pos);
+		pos += 2;
+
+		/* u8 flags - IEEE80211_RADIOTAP_VHT_FLAG_* */
+		if (info->status.rates[0].flags & IEEE80211_TX_RC_SHORT_GI)
+			*pos |= IEEE80211_RADIOTAP_VHT_FLAG_SGI;
+		pos++;
+
+		/* u8 bandwidth */
+		if (info->status.rates[0].flags & IEEE80211_TX_RC_40_MHZ_WIDTH)
+			*pos = 1;
+		else if (info->status.rates[0].flags & IEEE80211_TX_RC_80_MHZ_WIDTH)
+			*pos = 4;
+		else if (info->status.rates[0].flags & IEEE80211_TX_RC_160_MHZ_WIDTH)
+			*pos = 11;
+		else /* IEEE80211_TX_RC_{20_MHZ_WIDTH,FIXME:DUP_DATA} */
+			*pos = 0;
+		pos++;
+
+		/* u8 mcs_nss[4] */
+		*pos = (ieee80211_rate_get_vht_mcs(&info->status.rates[0]) << 4) |
+			ieee80211_rate_get_vht_nss(&info->status.rates[0]);
+		pos += 4;
+
+		/* u8 coding */
+		pos++;
+		/* u8 group_id */
+		pos++;
+		/* u16 partial_aid */
+		pos += 2;
+	}
 }
 
 static void ieee80211_report_used_skb(struct ieee80211_local *local,
@@ -631,8 +681,8 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
 		dev_kfree_skb(skb);
 		return;
 	}
-	ieee80211_add_tx_radiotap_header(sband, skb, retry_count, rtap_len,
-					 shift);
+	ieee80211_add_tx_radiotap_header(local, sband, skb, retry_count,
+					 rtap_len, shift);
 
 	/* XXX: is this sufficient for BPF? */
 	skb_set_mac_header(skb, 0);
-- 
cgit v1.2.3


From 7cf1f14ecf1f5025abb0e30e22e8f7ad219fa32e Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 12 Jul 2013 10:40:31 +0200
Subject: mac80211: add debugfs for driver-buffered TID bitmap

Add a per-station debugfs file indicating the TIDs (as
a bitmap) that the driver has data buffered on.

Reviewed-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/debugfs_sta.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index 44e201d60a13..19c54a44ed47 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -455,6 +455,15 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta)
 	DEBUGFS_ADD_COUNTER(tx_retry_count, tx_retry_count);
 	DEBUGFS_ADD_COUNTER(wep_weak_iv_count, wep_weak_iv_count);
 
+	if (sizeof(sta->driver_buffered_tids) == sizeof(u32))
+		debugfs_create_x32("driver_buffered_tids", 0400,
+				   sta->debugfs.dir,
+				   (u32 *)&sta->driver_buffered_tids);
+	else
+		debugfs_create_x64("driver_buffered_tids", 0400,
+				   sta->debugfs.dir,
+				   (u64 *)&sta->driver_buffered_tids);
+
 	drv_sta_add_debugfs(local, sdata, &sta->sta, sta->debugfs.dir);
 }
 
-- 
cgit v1.2.3


From 16ef1fe272332b2f7fd99236017b891db48d9cd6 Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <simon.wunderlich@s2003.tu-chemnitz.de>
Date: Thu, 11 Jul 2013 16:09:05 +0200
Subject: nl80211/cfg80211: add channel switch command

To allow channel switch announcements within beacons, add
the channel switch command to nl80211/cfg80211. This is
implementation is intended for AP and (later) IBSS mode.

Signed-off-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: Mathias Kretschmer <mathias.kretschmer@fokus.fraunhofer.de>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  33 ++++++++++++
 include/uapi/linux/nl80211.h |  30 +++++++++++
 net/wireless/nl80211.c       | 122 ++++++++++++++++++++++++++++++++++++++++++-
 net/wireless/rdev-ops.h      |  12 +++++
 net/wireless/trace.h         |  33 ++++++++++++
 5 files changed, 229 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index aeaf6dff6e05..b7495c72061c 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -665,6 +665,30 @@ struct cfg80211_ap_settings {
 	bool radar_required;
 };
 
+/**
+ * struct cfg80211_csa_settings - channel switch settings
+ *
+ * Used for channel switch
+ *
+ * @chandef: defines the channel to use after the switch
+ * @beacon_csa: beacon data while performing the switch
+ * @counter_offset_beacon: offset for the counter within the beacon (tail)
+ * @counter_offset_presp: offset for the counter within the probe response
+ * @beacon_after: beacon data to be used on the new channel
+ * @radar_required: whether radar detection is required on the new channel
+ * @block_tx: whether transmissions should be blocked while changing
+ * @count: number of beacons until switch
+ */
+struct cfg80211_csa_settings {
+	struct cfg80211_chan_def chandef;
+	struct cfg80211_beacon_data beacon_csa;
+	u16 counter_offset_beacon, counter_offset_presp;
+	struct cfg80211_beacon_data beacon_after;
+	bool radar_required;
+	bool block_tx;
+	u8 count;
+};
+
 /**
  * enum station_parameters_apply_mask - station parameter values to apply
  * @STATION_PARAM_APPLY_UAPSD: apply new uAPSD parameters (uapsd_queues, max_sp)
@@ -2139,6 +2163,8 @@ struct cfg80211_update_ft_ies_params {
  * @crit_proto_stop: Indicates critical protocol no longer needs increased link
  *	reliability. This operation can not fail.
  * @set_coalesce: Set coalesce parameters.
+ *
+ * @channel_switch: initiate channel-switch procedure (with CSA)
  */
 struct cfg80211_ops {
 	int	(*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow);
@@ -2376,6 +2402,10 @@ struct cfg80211_ops {
 				   struct wireless_dev *wdev);
 	int	(*set_coalesce)(struct wiphy *wiphy,
 				struct cfg80211_coalesce *coalesce);
+
+	int	(*channel_switch)(struct wiphy *wiphy,
+				  struct net_device *dev,
+				  struct cfg80211_csa_settings *params);
 };
 
 /*
@@ -2441,6 +2471,8 @@ struct cfg80211_ops {
  * @WIPHY_FLAG_OFFCHAN_TX: Device supports direct off-channel TX.
  * @WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL: Device supports remain-on-channel call.
  * @WIPHY_FLAG_SUPPORTS_5_10_MHZ: Device supports 5 MHz and 10 MHz channels.
+ * @WIPHY_FLAG_HAS_CHANNEL_SWITCH: Device supports channel switch in
+ *	beaconing mode (AP, IBSS, Mesh, ...).
  */
 enum wiphy_flags {
 	WIPHY_FLAG_CUSTOM_REGULATORY		= BIT(0),
@@ -2465,6 +2497,7 @@ enum wiphy_flags {
 	WIPHY_FLAG_OFFCHAN_TX			= BIT(20),
 	WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL	= BIT(21),
 	WIPHY_FLAG_SUPPORTS_5_10_MHZ		= BIT(22),
+	WIPHY_FLAG_HAS_CHANNEL_SWITCH		= BIT(23),
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index eb68735b3318..1f42bc3dcb9c 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -676,6 +676,16 @@
  * @NL80211_CMD_GET_COALESCE: Get currently supported coalesce rules.
  * @NL80211_CMD_SET_COALESCE: Configure coalesce rules or clear existing rules.
  *
+ * @NL80211_CMD_CHANNEL_SWITCH: Perform a channel switch by announcing the
+ *	the new channel information (Channel Switch Announcement - CSA)
+ *	in the beacon for some time (as defined in the
+ *	%NL80211_ATTR_CH_SWITCH_COUNT parameter) and then change to the
+ *	new channel. Userspace provides the new channel information (using
+ *	%NL80211_ATTR_WIPHY_FREQ and the attributes determining channel
+ *	width). %NL80211_ATTR_CH_SWITCH_BLOCK_TX may be supplied to inform
+ *	other station that transmission must be blocked until the channel
+ *	switch is complete.
+ *
  * @NL80211_CMD_MAX: highest used command number
  * @__NL80211_CMD_AFTER_LAST: internal use
  */
@@ -841,6 +851,8 @@ enum nl80211_commands {
 	NL80211_CMD_GET_COALESCE,
 	NL80211_CMD_SET_COALESCE,
 
+	NL80211_CMD_CHANNEL_SWITCH,
+
 	/* add new commands above here */
 
 	/* used to define NL80211_CMD_MAX below */
@@ -1469,6 +1481,18 @@ enum nl80211_commands {
  *
  * @NL80211_ATTR_COALESCE_RULE: Coalesce rule information.
  *
+ * @NL80211_ATTR_CH_SWITCH_COUNT: u32 attribute specifying the number of TBTT's
+ *	until the channel switch event.
+ * @NL80211_ATTR_CH_SWITCH_BLOCK_TX: flag attribute specifying that transmission
+ *	must be blocked on the current channel (before the channel switch
+ *	operation).
+ * @NL80211_ATTR_CSA_IES: Nested set of attributes containing the IE information
+ *	for the time while performing a channel switch.
+ * @NL80211_ATTR_CSA_C_OFF_BEACON: Offset of the channel switch counter
+ *	field in the beacons tail (%NL80211_ATTR_BEACON_TAIL).
+ * @NL80211_ATTR_CSA_C_OFF_PRESP: Offset of the channel switch counter
+ *	field in the probe response (%NL80211_ATTR_PROBE_RESP).
+ *
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
  */
@@ -1771,6 +1795,12 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_COALESCE_RULE,
 
+	NL80211_ATTR_CH_SWITCH_COUNT,
+	NL80211_ATTR_CH_SWITCH_BLOCK_TX,
+	NL80211_ATTR_CSA_IES,
+	NL80211_ATTR_CSA_C_OFF_BEACON,
+	NL80211_ATTR_CSA_C_OFF_PRESP,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 03d4ef95292e..f7cb12178bd2 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -349,6 +349,11 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = {
 	[NL80211_ATTR_IE_RIC] = { .type = NLA_BINARY,
 				  .len = IEEE80211_MAX_DATA_LEN },
 	[NL80211_ATTR_PEER_AID] = { .type = NLA_U16 },
+	[NL80211_ATTR_CH_SWITCH_COUNT] = { .type = NLA_U32 },
+	[NL80211_ATTR_CH_SWITCH_BLOCK_TX] = { .type = NLA_FLAG },
+	[NL80211_ATTR_CSA_IES] = { .type = NLA_NESTED },
+	[NL80211_ATTR_CSA_C_OFF_BEACON] = { .type = NLA_U16 },
+	[NL80211_ATTR_CSA_C_OFF_PRESP] = { .type = NLA_U16 },
 };
 
 /* policy for the key attributes */
@@ -1422,6 +1427,8 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *dev,
 		if (state->split) {
 			CMD(crit_proto_start, CRIT_PROTOCOL_START);
 			CMD(crit_proto_stop, CRIT_PROTOCOL_STOP);
+			if (dev->wiphy.flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH)
+				CMD(channel_switch, CHANNEL_SWITCH);
 		}
 
 #ifdef CONFIG_NL80211_TESTMODE
@@ -5613,6 +5620,111 @@ static int nl80211_start_radar_detection(struct sk_buff *skb,
 	return err;
 }
 
+static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_csa_settings params;
+	/* csa_attrs is defined static to avoid waste of stack size - this
+	 * function is called under RTNL lock, so this should not be a problem.
+	 */
+	static struct nlattr *csa_attrs[NL80211_ATTR_MAX+1];
+	u8 radar_detect_width = 0;
+	int err;
+
+	if (!rdev->ops->channel_switch ||
+	    !(rdev->wiphy.flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH))
+		return -EOPNOTSUPP;
+
+	/* may add IBSS support later */
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+		return -EOPNOTSUPP;
+
+	memset(&params, 0, sizeof(params));
+
+	if (!info->attrs[NL80211_ATTR_WIPHY_FREQ] ||
+	    !info->attrs[NL80211_ATTR_CH_SWITCH_COUNT])
+		return -EINVAL;
+
+	/* only important for AP, IBSS and mesh create IEs internally */
+	if (!info->attrs[NL80211_ATTR_CSA_IES])
+		return -EINVAL;
+
+	/* useless if AP is not running */
+	if (!wdev->beacon_interval)
+		return -EINVAL;
+
+	params.count = nla_get_u32(info->attrs[NL80211_ATTR_CH_SWITCH_COUNT]);
+
+	err = nl80211_parse_beacon(info->attrs, &params.beacon_after);
+	if (err)
+		return err;
+
+	err = nla_parse_nested(csa_attrs, NL80211_ATTR_MAX,
+			       info->attrs[NL80211_ATTR_CSA_IES],
+			       nl80211_policy);
+	if (err)
+		return err;
+
+	err = nl80211_parse_beacon(csa_attrs, &params.beacon_csa);
+	if (err)
+		return err;
+
+	if (!csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON])
+		return -EINVAL;
+
+	params.counter_offset_beacon =
+		nla_get_u16(csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON]);
+	if (params.counter_offset_beacon >= params.beacon_csa.tail_len)
+		return -EINVAL;
+
+	/* sanity check - counters should be the same */
+	if (params.beacon_csa.tail[params.counter_offset_beacon] !=
+	    params.count)
+		return -EINVAL;
+
+	if (csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]) {
+		params.counter_offset_presp =
+			nla_get_u16(csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]);
+		if (params.counter_offset_presp >=
+		    params.beacon_csa.probe_resp_len)
+			return -EINVAL;
+
+		if (params.beacon_csa.probe_resp[params.counter_offset_presp] !=
+		    params.count)
+			return -EINVAL;
+	}
+
+	err = nl80211_parse_chandef(rdev, info, &params.chandef);
+	if (err)
+		return err;
+
+	if (!cfg80211_reg_can_beacon(&rdev->wiphy, &params.chandef))
+		return -EINVAL;
+
+	err = cfg80211_chandef_dfs_required(wdev->wiphy, &params.chandef);
+	if (err < 0) {
+		return err;
+	} else if (err) {
+		radar_detect_width = BIT(params.chandef.width);
+		params.radar_required = true;
+	}
+
+	err = cfg80211_can_use_iftype_chan(rdev, wdev, wdev->iftype,
+					   params.chandef.chan,
+					   CHAN_MODE_SHARED,
+					   radar_detect_width);
+	if (err)
+		return err;
+
+	if (info->attrs[NL80211_ATTR_CH_SWITCH_BLOCK_TX])
+		params.block_tx = true;
+
+	return rdev_channel_switch(rdev, dev, &params);
+}
+
 static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb,
 			    u32 seq, int flags,
 			    struct cfg80211_registered_device *rdev,
@@ -9361,7 +9473,15 @@ static struct genl_ops nl80211_ops[] = {
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = NL80211_FLAG_NEED_WIPHY |
 				  NL80211_FLAG_NEED_RTNL,
-	}
+	},
+	{
+		.cmd = NL80211_CMD_CHANNEL_SWITCH,
+		.doit = nl80211_channel_switch,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
 };
 
 static struct genl_multicast_group nl80211_mlme_mcgrp = {
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index 9f15f0ac824d..de870d4d0bcc 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -923,4 +923,16 @@ static inline void rdev_crit_proto_stop(struct cfg80211_registered_device *rdev,
 	trace_rdev_return_void(&rdev->wiphy);
 }
 
+static inline int rdev_channel_switch(struct cfg80211_registered_device *rdev,
+				      struct net_device *dev,
+				      struct cfg80211_csa_settings *params)
+{
+	int ret;
+
+	trace_rdev_channel_switch(&rdev->wiphy, dev, params);
+	ret = rdev->ops->channel_switch(&rdev->wiphy, dev, params);
+	trace_rdev_return_int(&rdev->wiphy, ret);
+	return ret;
+}
+
 #endif /* __CFG80211_RDEV_OPS */
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 09af6eb426a8..f0ebdcd394ef 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -1841,6 +1841,39 @@ TRACE_EVENT(rdev_crit_proto_stop,
 		  WIPHY_PR_ARG, WDEV_PR_ARG)
 );
 
+TRACE_EVENT(rdev_channel_switch,
+	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+		 struct cfg80211_csa_settings *params),
+	TP_ARGS(wiphy, netdev, params),
+	TP_STRUCT__entry(
+		WIPHY_ENTRY
+		NETDEV_ENTRY
+		CHAN_DEF_ENTRY
+		__field(u16, counter_offset_beacon)
+		__field(u16, counter_offset_presp)
+		__field(bool, radar_required)
+		__field(bool, block_tx)
+		__field(u8, count)
+	),
+	TP_fast_assign(
+		WIPHY_ASSIGN;
+		NETDEV_ASSIGN;
+		CHAN_DEF_ASSIGN(&params->chandef);
+		__entry->counter_offset_beacon = params->counter_offset_beacon;
+		__entry->counter_offset_presp = params->counter_offset_presp;
+		__entry->radar_required = params->radar_required;
+		__entry->block_tx = params->block_tx;
+		__entry->count = params->count;
+	),
+	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT
+		  ", block_tx: %d, count: %u, radar_required: %d"
+		  ", counter offsets (beacon/presp): %u/%u",
+		  WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG,
+		  __entry->block_tx, __entry->count, __entry->radar_required,
+		  __entry->counter_offset_beacon,
+		  __entry->counter_offset_presp)
+);
+
 /*************************************************************
  *	     cfg80211 exported functions traces		     *
  *************************************************************/
-- 
cgit v1.2.3


From 73da7d5bab79ad7e16ff44d67c3fe8b9c0b33e5b Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <simon.wunderlich@s2003.tu-chemnitz.de>
Date: Thu, 11 Jul 2013 16:09:06 +0200
Subject: mac80211: add channel switch command and beacon callbacks

The count field in CSA must be decremented with each beacon
transmitted. This patch implements the functionality for drivers
using ieee80211_beacon_get(). Other drivers must call back manually
after reaching count == 0.

This patch also contains the handling and finish worker for the channel
switch command, and mac80211/chanctx code to allow to change a channel
definition of an active channel context.

Signed-off-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: Mathias Kretschmer <mathias.kretschmer@fokus.fraunhofer.de>
[small cleanups, catch identical chandef]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |  37 +++++++++
 net/mac80211/cfg.c         | 187 ++++++++++++++++++++++++++++++++++++++++++++-
 net/mac80211/chan.c        |  58 ++++++++++++++
 net/mac80211/driver-ops.h  |  13 ++++
 net/mac80211/ieee80211_i.h |  17 +++++
 net/mac80211/iface.c       |   9 +++
 net/mac80211/trace.h       |  26 +++++++
 net/mac80211/tx.c          |  78 +++++++++++++++++++
 8 files changed, 423 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 3124036285eb..9cda3728c2cb 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -152,11 +152,14 @@ struct ieee80211_low_level_stats {
  * @IEEE80211_CHANCTX_CHANGE_WIDTH: The channel width changed
  * @IEEE80211_CHANCTX_CHANGE_RX_CHAINS: The number of RX chains changed
  * @IEEE80211_CHANCTX_CHANGE_RADAR: radar detection flag changed
+ * @IEEE80211_CHANCTX_CHANGE_CHANNEL: switched to another operating channel,
+ *	this is used only with channel switching with CSA
  */
 enum ieee80211_chanctx_change {
 	IEEE80211_CHANCTX_CHANGE_WIDTH		= BIT(0),
 	IEEE80211_CHANCTX_CHANGE_RX_CHAINS	= BIT(1),
 	IEEE80211_CHANCTX_CHANGE_RADAR		= BIT(2),
+	IEEE80211_CHANCTX_CHANGE_CHANNEL	= BIT(3),
 };
 
 /**
@@ -1084,6 +1087,7 @@ enum ieee80211_vif_flags {
  * @addr: address of this interface
  * @p2p: indicates whether this AP or STA interface is a p2p
  *	interface, i.e. a GO or p2p-sta respectively
+ * @csa_active: marks whether a channel switch is going on
  * @driver_flags: flags/capabilities the driver has for this interface,
  *	these need to be set (or cleared) when the interface is added
  *	or, if supported by the driver, the interface type is changed
@@ -1106,6 +1110,7 @@ struct ieee80211_vif {
 	struct ieee80211_bss_conf bss_conf;
 	u8 addr[ETH_ALEN];
 	bool p2p;
+	bool csa_active;
 
 	u8 cab_queue;
 	u8 hw_queue[IEEE80211_NUM_ACS];
@@ -2637,6 +2642,16 @@ enum ieee80211_roc_type {
  * @ipv6_addr_change: IPv6 address assignment on the given interface changed.
  *	Currently, this is only called for managed or P2P client interfaces.
  *	This callback is optional; it must not sleep.
+ *
+ * @channel_switch_beacon: Starts a channel switch to a new channel.
+ *	Beacons are modified to include CSA or ECSA IEs before calling this
+ *	function. The corresponding count fields in these IEs must be
+ *	decremented, and when they reach zero the driver must call
+ *	ieee80211_csa_finish(). Drivers which use ieee80211_beacon_get()
+ *	get the csa counter decremented by mac80211, but must check if it is
+ *	zero using ieee80211_csa_is_complete() after the beacon has been
+ *	transmitted and then call ieee80211_csa_finish().
+ *
  */
 struct ieee80211_ops {
 	void (*tx)(struct ieee80211_hw *hw,
@@ -2824,6 +2839,9 @@ struct ieee80211_ops {
 				 struct ieee80211_vif *vif,
 				 struct inet6_dev *idev);
 #endif
+	void (*channel_switch_beacon)(struct ieee80211_hw *hw,
+				      struct ieee80211_vif *vif,
+				      struct cfg80211_chan_def *chandef);
 };
 
 /**
@@ -3318,6 +3336,25 @@ static inline struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw,
 	return ieee80211_beacon_get_tim(hw, vif, NULL, NULL);
 }
 
+/**
+ * ieee80211_csa_finish - notify mac80211 about channel switch
+ * @vif: &struct ieee80211_vif pointer from the add_interface callback.
+ *
+ * After a channel switch announcement was scheduled and the counter in this
+ * announcement hit zero, this function must be called by the driver to
+ * notify mac80211 that the channel can be changed.
+ */
+void ieee80211_csa_finish(struct ieee80211_vif *vif);
+
+/**
+ * ieee80211_csa_is_complete - find out if counters reached zero
+ * @vif: &struct ieee80211_vif pointer from the add_interface callback.
+ *
+ * This function returns whether the channel switch counters reached zero.
+ */
+bool ieee80211_csa_is_complete(struct ieee80211_vif *vif);
+
+
 /**
  * ieee80211_proberesp_get - retrieve a Probe Response template
  * @hw: pointer obtained from ieee80211_alloc_hw().
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index b82fff6c0b30..44449ceb7966 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -860,8 +860,8 @@ static int ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata,
 	return 0;
 }
 
-static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata,
-				   struct cfg80211_beacon_data *params)
+int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata,
+			    struct cfg80211_beacon_data *params)
 {
 	struct beacon_data *new, *old;
 	int new_head_len, new_tail_len;
@@ -1024,6 +1024,12 @@ static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev,
 
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
+	/* don't allow changing the beacon while CSA is in place - offset
+	 * of channel switch counter may change
+	 */
+	if (sdata->vif.csa_active)
+		return -EBUSY;
+
 	old = rtnl_dereference(sdata->u.ap.beacon);
 	if (!old)
 		return -ENOENT;
@@ -1048,6 +1054,10 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev)
 		return -ENOENT;
 	old_probe_resp = rtnl_dereference(sdata->u.ap.probe_resp);
 
+	/* abort any running channel switch */
+	sdata->vif.csa_active = false;
+	cancel_work_sync(&sdata->csa_finalize_work);
+
 	/* turn off carrier for this interface and dependent VLANs */
 	list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list)
 		netif_carrier_off(vlan->dev);
@@ -2775,6 +2785,178 @@ static int ieee80211_start_radar_detection(struct wiphy *wiphy,
 	return 0;
 }
 
+static struct cfg80211_beacon_data *
+cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon)
+{
+	struct cfg80211_beacon_data *new_beacon;
+	u8 *pos;
+	int len;
+
+	len = beacon->head_len + beacon->tail_len + beacon->beacon_ies_len +
+	      beacon->proberesp_ies_len + beacon->assocresp_ies_len +
+	      beacon->probe_resp_len;
+
+	new_beacon = kzalloc(sizeof(*new_beacon) + len, GFP_KERNEL);
+	if (!new_beacon)
+		return NULL;
+
+	pos = (u8 *)(new_beacon + 1);
+	if (beacon->head_len) {
+		new_beacon->head_len = beacon->head_len;
+		new_beacon->head = pos;
+		memcpy(pos, beacon->head, beacon->head_len);
+		pos += beacon->head_len;
+	}
+	if (beacon->tail_len) {
+		new_beacon->tail_len = beacon->tail_len;
+		new_beacon->tail = pos;
+		memcpy(pos, beacon->tail, beacon->tail_len);
+		pos += beacon->tail_len;
+	}
+	if (beacon->beacon_ies_len) {
+		new_beacon->beacon_ies_len = beacon->beacon_ies_len;
+		new_beacon->beacon_ies = pos;
+		memcpy(pos, beacon->beacon_ies, beacon->beacon_ies_len);
+		pos += beacon->beacon_ies_len;
+	}
+	if (beacon->proberesp_ies_len) {
+		new_beacon->proberesp_ies_len = beacon->proberesp_ies_len;
+		new_beacon->proberesp_ies = pos;
+		memcpy(pos, beacon->proberesp_ies, beacon->proberesp_ies_len);
+		pos += beacon->proberesp_ies_len;
+	}
+	if (beacon->assocresp_ies_len) {
+		new_beacon->assocresp_ies_len = beacon->assocresp_ies_len;
+		new_beacon->assocresp_ies = pos;
+		memcpy(pos, beacon->assocresp_ies, beacon->assocresp_ies_len);
+		pos += beacon->assocresp_ies_len;
+	}
+	if (beacon->probe_resp_len) {
+		new_beacon->probe_resp_len = beacon->probe_resp_len;
+		beacon->probe_resp = pos;
+		memcpy(pos, beacon->probe_resp, beacon->probe_resp_len);
+		pos += beacon->probe_resp_len;
+	}
+
+	return new_beacon;
+}
+
+void ieee80211_csa_finalize_work(struct work_struct *work)
+{
+	struct ieee80211_sub_if_data *sdata =
+		container_of(work, struct ieee80211_sub_if_data,
+			     csa_finalize_work);
+	struct ieee80211_local *local = sdata->local;
+	int err, changed;
+
+	if (!ieee80211_sdata_running(sdata))
+		return;
+
+	if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_AP))
+		return;
+
+	sdata->radar_required = sdata->csa_radar_required;
+	err = ieee80211_vif_change_channel(sdata, &local->csa_chandef,
+					   &changed);
+	if (WARN_ON(err < 0))
+		return;
+
+	err = ieee80211_assign_beacon(sdata, sdata->u.ap.next_beacon);
+	if (err < 0)
+		return;
+
+	changed |= err;
+	kfree(sdata->u.ap.next_beacon);
+	sdata->u.ap.next_beacon = NULL;
+	sdata->vif.csa_active = false;
+
+	ieee80211_wake_queues_by_reason(&sdata->local->hw,
+					IEEE80211_MAX_QUEUE_MAP,
+					IEEE80211_QUEUE_STOP_REASON_CSA);
+
+	ieee80211_bss_info_change_notify(sdata, changed);
+
+	cfg80211_ch_switch_notify(sdata->dev, &local->csa_chandef);
+}
+
+static int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
+				    struct cfg80211_csa_settings *params)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_chanctx_conf *chanctx_conf;
+	struct ieee80211_chanctx *chanctx;
+	int err, num_chanctx;
+
+	if (!list_empty(&local->roc_list) || local->scanning)
+		return -EBUSY;
+
+	if (sdata->wdev.cac_started)
+		return -EBUSY;
+
+	if (cfg80211_chandef_identical(&params->chandef,
+				       &sdata->vif.bss_conf.chandef))
+		return -EINVAL;
+
+	rcu_read_lock();
+	chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+	if (!chanctx_conf) {
+		rcu_read_unlock();
+		return -EBUSY;
+	}
+
+	/* don't handle for multi-VIF cases */
+	chanctx = container_of(chanctx_conf, struct ieee80211_chanctx, conf);
+	if (chanctx->refcount > 1) {
+		rcu_read_unlock();
+		return -EBUSY;
+	}
+	num_chanctx = 0;
+	list_for_each_entry_rcu(chanctx, &local->chanctx_list, list)
+		num_chanctx++;
+	rcu_read_unlock();
+
+	if (num_chanctx > 1)
+		return -EBUSY;
+
+	/* don't allow another channel switch if one is already active. */
+	if (sdata->vif.csa_active)
+		return -EBUSY;
+
+	/* only handle AP for now. */
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_AP:
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	sdata->u.ap.next_beacon = cfg80211_beacon_dup(&params->beacon_after);
+	if (!sdata->u.ap.next_beacon)
+		return -ENOMEM;
+
+	sdata->csa_counter_offset_beacon = params->counter_offset_beacon;
+	sdata->csa_counter_offset_presp = params->counter_offset_presp;
+	sdata->csa_radar_required = params->radar_required;
+
+	if (params->block_tx)
+		ieee80211_stop_queues_by_reason(&local->hw,
+				IEEE80211_MAX_QUEUE_MAP,
+				IEEE80211_QUEUE_STOP_REASON_CSA);
+
+	err = ieee80211_assign_beacon(sdata, &params->beacon_csa);
+	if (err < 0)
+		return err;
+
+	local->csa_chandef = params->chandef;
+	sdata->vif.csa_active = true;
+
+	ieee80211_bss_info_change_notify(sdata, err);
+	drv_channel_switch_beacon(sdata, &params->chandef);
+
+	return 0;
+}
+
 static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
 			     struct ieee80211_channel *chan, bool offchan,
 			     unsigned int wait, const u8 *buf, size_t len,
@@ -3492,4 +3674,5 @@ struct cfg80211_ops mac80211_config_ops = {
 	.get_et_strings = ieee80211_get_et_strings,
 	.get_channel = ieee80211_cfg_get_channel,
 	.start_radar_detection = ieee80211_start_radar_detection,
+	.channel_switch = ieee80211_channel_switch,
 };
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index 03e8d2e3270e..3a4764b2869e 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -410,6 +410,64 @@ int ieee80211_vif_use_channel(struct ieee80211_sub_if_data *sdata,
 	return ret;
 }
 
+int ieee80211_vif_change_channel(struct ieee80211_sub_if_data *sdata,
+				 const struct cfg80211_chan_def *chandef,
+				 u32 *changed)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_chanctx_conf *conf;
+	struct ieee80211_chanctx *ctx;
+	int ret;
+	u32 chanctx_changed = 0;
+
+	/* should never be called if not performing a channel switch. */
+	if (WARN_ON(!sdata->vif.csa_active))
+		return -EINVAL;
+
+	if (!cfg80211_chandef_usable(sdata->local->hw.wiphy, chandef,
+				     IEEE80211_CHAN_DISABLED))
+		return -EINVAL;
+
+	mutex_lock(&local->chanctx_mtx);
+	conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
+					 lockdep_is_held(&local->chanctx_mtx));
+	if (!conf) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ctx = container_of(conf, struct ieee80211_chanctx, conf);
+	if (ctx->refcount != 1) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (sdata->vif.bss_conf.chandef.width != chandef->width) {
+		chanctx_changed = IEEE80211_CHANCTX_CHANGE_WIDTH;
+		*changed |= BSS_CHANGED_BANDWIDTH;
+	}
+
+	sdata->vif.bss_conf.chandef = *chandef;
+	ctx->conf.def = *chandef;
+
+	chanctx_changed |= IEEE80211_CHANCTX_CHANGE_CHANNEL;
+	drv_change_chanctx(local, ctx, chanctx_changed);
+
+	if (!local->use_chanctx) {
+		local->_oper_chandef = *chandef;
+		ieee80211_hw_config(local, 0);
+	}
+
+	ieee80211_recalc_chanctx_chantype(local, ctx);
+	ieee80211_recalc_smps_chanctx(local, ctx);
+	ieee80211_recalc_radar_chanctx(local, ctx);
+
+	ret = 0;
+ out:
+	mutex_unlock(&local->chanctx_mtx);
+	return ret;
+}
+
 int ieee80211_vif_change_bandwidth(struct ieee80211_sub_if_data *sdata,
 				   const struct cfg80211_chan_def *chandef,
 				   u32 *changed)
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index b931c96a596f..b3ea11f3d526 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -1072,4 +1072,17 @@ static inline void drv_ipv6_addr_change(struct ieee80211_local *local,
 }
 #endif
 
+static inline void
+drv_channel_switch_beacon(struct ieee80211_sub_if_data *sdata,
+			  struct cfg80211_chan_def *chandef)
+{
+	struct ieee80211_local *local = sdata->local;
+
+	if (local->ops->channel_switch_beacon) {
+		trace_drv_channel_switch_beacon(local, sdata, chandef);
+		local->ops->channel_switch_beacon(&local->hw, &sdata->vif,
+						  chandef);
+	}
+}
+
 #endif /* __MAC80211_DRIVER_OPS */
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index d779383c52d2..e94c84050e9c 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -259,6 +259,8 @@ struct ieee80211_if_ap {
 	struct beacon_data __rcu *beacon;
 	struct probe_resp __rcu *probe_resp;
 
+	/* to be used after channel switch. */
+	struct cfg80211_beacon_data *next_beacon;
 	struct list_head vlans;
 
 	struct ps_data ps;
@@ -716,6 +718,11 @@ struct ieee80211_sub_if_data {
 
 	struct ieee80211_tx_queue_params tx_conf[IEEE80211_NUM_ACS];
 
+	struct work_struct csa_finalize_work;
+	int csa_counter_offset_beacon;
+	int csa_counter_offset_presp;
+	bool csa_radar_required;
+
 	/* used to reconfigure hardware SM PS */
 	struct work_struct recalc_smps;
 
@@ -1372,6 +1379,9 @@ void ieee80211_roc_notify_destroy(struct ieee80211_roc_work *roc, bool free);
 void ieee80211_sw_roc_work(struct work_struct *work);
 void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc);
 
+/* channel switch handling */
+void ieee80211_csa_finalize_work(struct work_struct *work);
+
 /* interface handling */
 int ieee80211_iface_init(void);
 void ieee80211_iface_exit(void);
@@ -1393,6 +1403,8 @@ void ieee80211_del_virtual_monitor(struct ieee80211_local *local);
 
 bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata);
 void ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata);
+int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata,
+			    struct cfg80211_beacon_data *params);
 
 static inline bool ieee80211_sdata_running(struct ieee80211_sub_if_data *sdata)
 {
@@ -1654,6 +1666,11 @@ int __must_check
 ieee80211_vif_change_bandwidth(struct ieee80211_sub_if_data *sdata,
 			       const struct cfg80211_chan_def *chandef,
 			       u32 *changed);
+/* NOTE: only use ieee80211_vif_change_channel() for channel switch */
+int __must_check
+ieee80211_vif_change_channel(struct ieee80211_sub_if_data *sdata,
+			     const struct cfg80211_chan_def *chandef,
+			     u32 *changed);
 void ieee80211_vif_release_channel(struct ieee80211_sub_if_data *sdata);
 void ieee80211_vif_vlan_copy_chanctx(struct ieee80211_sub_if_data *sdata);
 void ieee80211_vif_copy_chanctx_to_vlans(struct ieee80211_sub_if_data *sdata,
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 4c41c11958c8..7ca534bf4cea 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -274,6 +274,12 @@ static int ieee80211_check_concurrent_iface(struct ieee80211_sub_if_data *sdata,
 			if (iftype == NL80211_IFTYPE_ADHOC &&
 			    nsdata->vif.type == NL80211_IFTYPE_ADHOC)
 				return -EBUSY;
+			/*
+			 * will not add another interface while any channel
+			 * switch is active.
+			 */
+			if (nsdata->vif.csa_active)
+				return -EBUSY;
 
 			/*
 			 * The remaining checks are only performed for interfaces
@@ -804,6 +810,8 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
 	cancel_work_sync(&local->dynamic_ps_enable_work);
 
 	cancel_work_sync(&sdata->recalc_smps);
+	sdata->vif.csa_active = false;
+	cancel_work_sync(&sdata->csa_finalize_work);
 
 	cancel_delayed_work_sync(&sdata->dfs_cac_timer_work);
 
@@ -1267,6 +1275,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
 	skb_queue_head_init(&sdata->skb_queue);
 	INIT_WORK(&sdata->work, ieee80211_iface_work);
 	INIT_WORK(&sdata->recalc_smps, ieee80211_recalc_smps_work);
+	INIT_WORK(&sdata->csa_finalize_work, ieee80211_csa_finalize_work);
 
 	switch (type) {
 	case NL80211_IFTYPE_P2P_GO:
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index c215fafd7a2f..1aba645882bd 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -1906,6 +1906,32 @@ TRACE_EVENT(api_radar_detected,
 	)
 );
 
+TRACE_EVENT(drv_channel_switch_beacon,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sub_if_data *sdata,
+		 struct cfg80211_chan_def *chandef),
+
+	TP_ARGS(local, sdata, chandef),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		VIF_ENTRY
+		CHANDEF_ENTRY
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		VIF_ASSIGN;
+		CHANDEF_ASSIGN(chandef);
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT VIF_PR_FMT " channel switch to " CHANDEF_PR_FMT,
+		LOCAL_PR_ARG, VIF_PR_ARG, CHANDEF_PR_ARG
+	)
+);
+
+
 #ifdef CONFIG_MAC80211_MESSAGE_TRACING
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM mac80211_msg
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index f65873f0c89f..0e42322aa6b1 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -2338,6 +2338,81 @@ static int ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata,
 	return 0;
 }
 
+void ieee80211_csa_finish(struct ieee80211_vif *vif)
+{
+	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+
+	ieee80211_queue_work(&sdata->local->hw,
+			     &sdata->csa_finalize_work);
+}
+EXPORT_SYMBOL(ieee80211_csa_finish);
+
+static void ieee80211_update_csa(struct ieee80211_sub_if_data *sdata,
+				 struct beacon_data *beacon)
+{
+	struct probe_resp *resp;
+	int counter_offset_beacon = sdata->csa_counter_offset_beacon;
+	int counter_offset_presp = sdata->csa_counter_offset_presp;
+
+	/* warn if the driver did not check for/react to csa completeness */
+	if (WARN_ON(((u8 *)beacon->tail)[counter_offset_beacon] == 0))
+		return;
+
+	((u8 *)beacon->tail)[counter_offset_beacon]--;
+
+	if (sdata->vif.type == NL80211_IFTYPE_AP &&
+	    counter_offset_presp) {
+		rcu_read_lock();
+		resp = rcu_dereference(sdata->u.ap.probe_resp);
+
+		/* if nl80211 accepted the offset, this should not happen. */
+		if (WARN_ON(!resp)) {
+			rcu_read_unlock();
+			return;
+		}
+		resp->data[counter_offset_presp]--;
+		rcu_read_unlock();
+	}
+}
+
+bool ieee80211_csa_is_complete(struct ieee80211_vif *vif)
+{
+	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+	struct beacon_data *beacon = NULL;
+	u8 *beacon_data;
+	size_t beacon_data_len;
+	int counter_beacon = sdata->csa_counter_offset_beacon;
+	int ret = false;
+
+	if (!ieee80211_sdata_running(sdata))
+		return false;
+
+	rcu_read_lock();
+	if (vif->type == NL80211_IFTYPE_AP) {
+		struct ieee80211_if_ap *ap = &sdata->u.ap;
+
+		beacon = rcu_dereference(ap->beacon);
+		if (WARN_ON(!beacon || !beacon->tail))
+			goto out;
+		beacon_data = beacon->tail;
+		beacon_data_len = beacon->tail_len;
+	} else {
+		WARN_ON(1);
+		goto out;
+	}
+
+	if (WARN_ON(counter_beacon > beacon_data_len))
+		goto out;
+
+	if (beacon_data[counter_beacon] == 0)
+		ret = true;
+ out:
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(ieee80211_csa_is_complete);
+
 struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw,
 					 struct ieee80211_vif *vif,
 					 u16 *tim_offset, u16 *tim_length)
@@ -2368,6 +2443,9 @@ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw,
 		struct beacon_data *beacon = rcu_dereference(ap->beacon);
 
 		if (beacon) {
+			if (sdata->vif.csa_active)
+				ieee80211_update_csa(sdata, beacon);
+
 			/*
 			 * headroom, head length,
 			 * tail length and maximum TIM length
-- 
cgit v1.2.3


From 439677d766ba9095e5afc4a30147f65bc363b6e7 Mon Sep 17 00:00:00 2001
From: "fan.du" <fan.du@windriver.com>
Date: Thu, 1 Aug 2013 17:44:44 +0800
Subject: ipv6: bump genid when delete/add address

Server           Client
2001:1::803/64  <-> 2001:1::805/64
2001:2::804/64  <-> 2001:2::806/64

Server side fib binary tree looks like this:

                                   (2001:/64)
                                   /
                                  /
                   ffff88002103c380
                 /                 \
     (2)        /                   \
 (2001::803/128)                     ffff880037ac07c0
                                    /               \
                                   /                 \  (3)
                      ffff880037ac0640               (2001::806/128)
                       /             \
             (1)      /               \
        (2001::804/128)               (2001::805/128)

Delete 2001::804/64 won't cause prefix route deleted as well as rt in (3)
destinate to 2001::806 with source address as 2001::804/64. That's because
2001::803/64 is still alive, which make onlink=1 in ipv6_del_addr, this is
where the substantial difference between same prefix configuration and
different prefix configuration :) So packet are still transmitted out to
2001::806 with source address as 2001::804/64.

So bump genid will clear rt in (3), and up layer protocol will eventually
find the right one for themselves.

This problem arised from the discussion in here:
http://marc.info/?l=linux-netdev&m=137404469219410&w=4

Signed-off-by: Fan Du <fan.du@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index cfdcf7b2daf6..e7780d72067c 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4653,6 +4653,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 		break;
 	}
 	atomic_inc(&net->ipv6.dev_addr_genid);
+	rt_genid_bump_ipv6(net);
 }
 
 static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
-- 
cgit v1.2.3


From d27fc78208b53ccdfd6a57d4ac44a459ca66806f Mon Sep 17 00:00:00 2001
From: "fan.du" <fan.du@windriver.com>
Date: Fri, 2 Aug 2013 10:45:13 +0800
Subject: sctp: Don't lookup dst if transport dst is still valid

When sctp sits on IPv6, sctp_transport_dst_check pass cookie as ZERO,
as a result ip6_dst_check always fail out. This behaviour makes
transport->dst useless, because every sctp_packet_transmit must look
for valid dst.

Add a dst_cookie into sctp_transport, and set the cookie whenever we
get new dst for sctp_transport. So dst validness could be checked
against it.

Since I have split genid for IPv4 and IPv6, also delete/add IPv6 address
will also bump IPv6 genid. So issues we discussed in:
http://marc.info/?l=linux-netdev&m=137404469219410&w=4
have all been sloved for this patch.

Signed-off-by: Fan Du <fan.du@windriver.com>
Acked-by: Vlad Yasevich <vyasevich@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sctp.h    | 2 +-
 include/net/sctp/structs.h | 1 +
 net/sctp/ipv6.c            | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 554cf88605fc..cb28df9c2781 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -613,7 +613,7 @@ static inline void sctp_v4_map_v6(union sctp_addr *addr)
  */
 static inline struct dst_entry *sctp_transport_dst_check(struct sctp_transport *t)
 {
-	if (t->dst && !dst_check(t->dst, 0)) {
+	if (t->dst && !dst_check(t->dst, t->dst_cookie)) {
 		dst_release(t->dst);
 		t->dst = NULL;
 	}
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 75c4c16601b6..c0f4e29eedd5 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -946,6 +946,7 @@ struct sctp_transport {
 	__u64 hb_nonce;
 
 	struct rcu_head rcu;
+	u32 dst_cookie;
 };
 
 struct sctp_transport *sctp_transport_new(struct net *, const union sctp_addr *,
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 85d688f59e6a..5a9402e3c0cf 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -351,7 +351,7 @@ out:
 
 		rt = (struct rt6_info *)dst;
 		t->dst = dst;
-
+		t->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
 		pr_debug("rt6_dst:%pI6 rt6_src:%pI6\n", &rt->rt6i_dst.addr,
 			 &fl6->saddr);
 	} else {
-- 
cgit v1.2.3


From 0f75b09c798ed00c30d7d5551b896be883bc2aeb Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Fri, 2 Aug 2013 11:37:39 +0200
Subject: af_packet: when sending ethernet frames, parse header for
 skb->protocol

This may be necessary when the SKB is passed to other layers on the go,
which check the protocol field on their own. An example is a VLAN packet
sent out using AF_PACKET on a bridge interface. The bridging code checks
the SKB size, accounting for any VLAN header only if the protocol field
is set accordingly.

Note that eth_type_trans() sets skb->dev to the passed argument, so this
can be skipped in packet_snd() for ethernet frames, as well.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 4cb28a7f639b..71db35ecba39 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -88,6 +88,7 @@
 #include <linux/virtio_net.h>
 #include <linux/errqueue.h>
 #include <linux/net_tstamp.h>
+#include <linux/if_arp.h>
 
 #ifdef CONFIG_INET
 #include <net/inet_common.h>
@@ -2005,6 +2006,9 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 		if (unlikely(err))
 			return err;
 
+		if (dev->type == ARPHRD_ETHER)
+			skb->protocol = eth_type_trans(skb, dev);
+
 		data += dev->hard_header_len;
 		to_write -= dev->hard_header_len;
 	}
@@ -2324,6 +2328,13 @@ static int packet_snd(struct socket *sock,
 
 	sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
 
+	if (dev->type == ARPHRD_ETHER) {
+		skb->protocol = eth_type_trans(skb, dev);
+	} else {
+		skb->protocol = proto;
+		skb->dev = dev;
+	}
+
 	if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
 		/* Earlier code assumed this would be a VLAN pkt,
 		 * double-check this now that we have the actual
@@ -2338,8 +2349,6 @@ static int packet_snd(struct socket *sock,
 		}
 	}
 
-	skb->protocol = proto;
-	skb->dev = dev;
 	skb->priority = sk->sk_priority;
 	skb->mark = sk->sk_mark;
 
-- 
cgit v1.2.3


From cbd89acb9eb257ed3b2be867142583fdcf7fdc5b Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Fri, 2 Aug 2013 11:37:40 +0200
Subject: af_packet: fix for sending VLAN frames via packet_mmap

Since tpacket_fill_skb() parses the protocol field in ethernet frames'
headers, it's easy to see if any passed frame is a VLAN one and account
for the extended size.

But as the real protocol does not turn up before tpacket_fill_skb()
runs which in turn also checks the frame length, move the max frame
length calculation into the function.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 71db35ecba39..2b1470de9e16 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1924,7 +1924,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 		__be16 proto, unsigned char *addr, int hlen)
 {
 	union tpacket_uhdr ph;
-	int to_write, offset, len, tp_len, nr_frags, len_max;
+	int to_write, offset, len, tp_len, nr_frags, len_max, max_frame_len;
 	struct socket *sock = po->sk.sk_socket;
 	struct page *page;
 	void *data;
@@ -1947,10 +1947,6 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 		tp_len = ph.h1->tp_len;
 		break;
 	}
-	if (unlikely(tp_len > size_max)) {
-		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
-		return -EMSGSIZE;
-	}
 
 	skb_reserve(skb, hlen);
 	skb_reset_network_header(skb);
@@ -2013,6 +2009,18 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 		to_write -= dev->hard_header_len;
 	}
 
+	max_frame_len = dev->mtu + dev->hard_header_len;
+	if (skb->protocol == htons(ETH_P_8021Q))
+		max_frame_len += VLAN_HLEN;
+
+	if (size_max > max_frame_len)
+		size_max = max_frame_len;
+
+	if (unlikely(tp_len > size_max)) {
+		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
+		return -EMSGSIZE;
+	}
+
 	offset = offset_in_page(data);
 	len_max = PAGE_SIZE - offset;
 	len = ((to_write > len_max) ? len_max : to_write);
@@ -2051,7 +2059,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	struct net_device *dev;
 	__be16 proto;
 	bool need_rls_dev = false;
-	int err, reserve = 0;
+	int err;
 	void *ph;
 	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
 	int tp_len, size_max;
@@ -2084,8 +2092,6 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	if (unlikely(dev == NULL))
 		goto out;
 
-	reserve = dev->hard_header_len;
-
 	err = -ENETDOWN;
 	if (unlikely(!(dev->flags & IFF_UP)))
 		goto out_put;
@@ -2093,9 +2099,6 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	size_max = po->tx_ring.frame_size
 		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));
 
-	if (size_max > dev->mtu + reserve)
-		size_max = dev->mtu + reserve;
-
 	do {
 		ph = packet_current_frame(po, &po->tx_ring,
 				TP_STATUS_SEND_REQUEST);
-- 
cgit v1.2.3


From c483e02614551e44ced3fe6eedda8e36d3277ccc Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Fri, 2 Aug 2013 11:37:41 +0200
Subject: af_packet: simplify VLAN frame check in packet_snd

For ethernet frames, eth_type_trans() already parses the header, so one
can skip this when checking the frame size.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 2b1470de9e16..0c0f6c9a90e7 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2333,23 +2333,16 @@ static int packet_snd(struct socket *sock,
 
 	if (dev->type == ARPHRD_ETHER) {
 		skb->protocol = eth_type_trans(skb, dev);
+		if (skb->protocol == htons(ETH_P_8021Q))
+			reserve += VLAN_HLEN;
 	} else {
 		skb->protocol = proto;
 		skb->dev = dev;
 	}
 
 	if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
-		/* Earlier code assumed this would be a VLAN pkt,
-		 * double-check this now that we have the actual
-		 * packet in hand.
-		 */
-		struct ethhdr *ehdr;
-		skb_reset_mac_header(skb);
-		ehdr = eth_hdr(skb);
-		if (ehdr->h_proto != htons(ETH_P_8021Q)) {
-			err = -EMSGSIZE;
-			goto out_free;
-		}
+		err = -EMSGSIZE;
+		goto out_free;
 	}
 
 	skb->priority = sk->sk_priority;
-- 
cgit v1.2.3


From 9cc08af3a1d9d1687cb2ad6063ac1552ec2f695a Mon Sep 17 00:00:00 2001
From: Werner Almesberger <werner@almesberger.net>
Date: Fri, 2 Aug 2013 10:51:19 -0300
Subject: icmpv6_filter: fix "_hdr" incorrectly being a pointer

"_hdr" should hold the ICMPv6 header while "hdr" is the pointer to it.
This worked by accident.

Signed-off-by: Werner Almesberger <werner@almesberger.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/raw.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index c45f7a5c36e9..164fb5f58641 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -108,7 +108,7 @@ found:
  */
 static int icmpv6_filter(const struct sock *sk, const struct sk_buff *skb)
 {
-	struct icmp6hdr *_hdr;
+	struct icmp6hdr _hdr;
 	const struct icmp6hdr *hdr;
 
 	hdr = skb_header_pointer(skb, skb_transport_offset(skb),
-- 
cgit v1.2.3


From d1c53c8e870cdedb6fc9550f41c558bab45b5219 Mon Sep 17 00:00:00 2001
From: Werner Almesberger <werner@almesberger.net>
Date: Fri, 2 Aug 2013 10:51:34 -0300
Subject: icmpv6_filter: allow ICMPv6 messages with bodies < 4 bytes

By using sizeof(_hdr), net/ipv6/raw.c:icmpv6_filter implicitly assumes
that any valid ICMPv6 message is at least eight bytes long, i.e., that
the message body is at least four bytes.

The DIS message of RPL (RFC 6550 section 6.2, from the 6LoWPAN world),
has a minimum length of only six bytes, and is thus blocked by
icmpv6_filter.

RFC 4443 seems to allow even a zero-sized body, making the minimum
allowable message size four bytes.

Signed-off-by: Werner Almesberger <werner@almesberger.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/raw.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 164fb5f58641..c1e533498203 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -63,6 +63,8 @@
 #include <linux/seq_file.h>
 #include <linux/export.h>
 
+#define	ICMPV6_HDRLEN	4	/* ICMPv6 header, RFC 4443 Section 2.1 */
+
 static struct raw_hashinfo raw_v6_hashinfo = {
 	.lock = __RW_LOCK_UNLOCKED(raw_v6_hashinfo.lock),
 };
@@ -111,8 +113,11 @@ static int icmpv6_filter(const struct sock *sk, const struct sk_buff *skb)
 	struct icmp6hdr _hdr;
 	const struct icmp6hdr *hdr;
 
+	/* We require only the four bytes of the ICMPv6 header, not any
+	 * additional bytes of message body in "struct icmp6hdr".
+	 */
 	hdr = skb_header_pointer(skb, skb_transport_offset(skb),
-				 sizeof(_hdr), &_hdr);
+				 ICMPV6_HDRLEN, &_hdr);
 	if (hdr) {
 		const __u32 *data = &raw6_sk(sk)->filter.data[0];
 		unsigned int type = hdr->icmp6_type;
-- 
cgit v1.2.3


From 6ef94cfafba159d6b1a902ccb3349ac6a34ff6ad Mon Sep 17 00:00:00 2001
From: Stefan Tomanek <stefan.tomanek@wertarbyte.de>
Date: Fri, 2 Aug 2013 17:19:56 +0200
Subject: fib_rules: add route suppression based on ifgroup

This change adds the ability to suppress a routing decision based upon the
interface group the selected interface belongs to. This allows it to
exclude specific devices from a routing decision.

Signed-off-by: Stefan Tomanek <stefan.tomanek@wertarbyte.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/fib_rules.h        |  2 ++
 include/uapi/linux/fib_rules.h |  2 +-
 net/core/fib_rules.c           | 10 ++++++++++
 net/ipv4/fib_rules.c           | 23 +++++++++++++++++------
 net/ipv6/fib6_rules.c          | 16 +++++++++++++---
 5 files changed, 43 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index 2f286dce9259..d13c461b4b59 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -18,6 +18,7 @@ struct fib_rule {
 	u32			pref;
 	u32			flags;
 	u32			table;
+	int			suppress_ifgroup;
 	u8			table_prefixlen_min;
 	u8			action;
 	u32			target;
@@ -84,6 +85,7 @@ struct fib_rules_ops {
 	[FRA_FWMASK]	= { .type = NLA_U32 }, \
 	[FRA_TABLE]     = { .type = NLA_U32 }, \
 	[FRA_TABLE_PREFIXLEN_MIN] = { .type = NLA_U8 }, \
+	[FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \
 	[FRA_GOTO]	= { .type = NLA_U32 }
 
 static inline void fib_rule_get(struct fib_rule *rule)
diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h
index 59cd31b3455e..63e31166e85b 100644
--- a/include/uapi/linux/fib_rules.h
+++ b/include/uapi/linux/fib_rules.h
@@ -44,7 +44,7 @@ enum {
 	FRA_FWMARK,	/* mark */
 	FRA_FLOW,	/* flow/class id */
 	FRA_UNUSED6,
-	FRA_UNUSED7,
+	FRA_SUPPRESS_IFGROUP,
 	FRA_TABLE_PREFIXLEN_MIN,
 	FRA_TABLE,	/* Extended table id */
 	FRA_FWMASK,	/* mask for netfilter mark */
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 2ef5040c99c8..5040a61bf28a 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -343,6 +343,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
 	if (tb[FRA_TABLE_PREFIXLEN_MIN])
 		rule->table_prefixlen_min = nla_get_u8(tb[FRA_TABLE_PREFIXLEN_MIN]);
 
+	if (tb[FRA_SUPPRESS_IFGROUP])
+		rule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]);
+
 	if (!tb[FRA_PRIORITY] && ops->default_pref)
 		rule->pref = ops->default_pref(ops);
 
@@ -529,6 +532,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
 			 + nla_total_size(4) /* FRA_PRIORITY */
 			 + nla_total_size(4) /* FRA_TABLE */
 			 + nla_total_size(1) /* FRA_TABLE_PREFIXLEN_MIN */
+			 + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */
 			 + nla_total_size(4) /* FRA_FWMARK */
 			 + nla_total_size(4); /* FRA_FWMASK */
 
@@ -588,6 +592,12 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
 	    (rule->target &&
 	     nla_put_u32(skb, FRA_GOTO, rule->target)))
 		goto nla_put_failure;
+
+	if (rule->suppress_ifgroup != -1) {
+		if (nla_put_u32(skb, FRA_SUPPRESS_IFGROUP, rule->suppress_ifgroup))
+			goto nla_put_failure;
+	}
+
 	if (ops->fill(rule, skb, frh) < 0)
 		goto nla_put_failure;
 
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 9f2906679d1f..b78fd28970c9 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -103,16 +103,27 @@ errout:
 
 static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
 {
+	struct fib_result *result = (struct fib_result *) arg->result;
+	struct net_device *dev = result->fi->fib_dev;
+
 	/* do not accept result if the route does
 	 * not meet the required prefix length
 	 */
-	struct fib_result *result = (struct fib_result *) arg->result;
-	if (result->prefixlen < rule->table_prefixlen_min) {
-		if (!(arg->flags & FIB_LOOKUP_NOREF))
-			fib_info_put(result->fi);
-		return true;
-	}
+	if (result->prefixlen < rule->table_prefixlen_min)
+		goto suppress_route;
+
+	/* do not accept result if the route uses a device
+	 * belonging to a forbidden interface group
+	 */
+	if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup)
+		goto suppress_route;
+
 	return false;
+
+suppress_route:
+	if (!(arg->flags & FIB_LOOKUP_NOREF))
+		fib_info_put(result->fi);
+	return true;
 }
 
 static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 554a4fbabfb3..36283267e2f8 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -122,14 +122,24 @@ out:
 static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
 {
 	struct rt6_info *rt = (struct rt6_info *) arg->result;
+	struct net_device *dev = rt->rt6i_idev->dev;
 	/* do not accept result if the route does
 	 * not meet the required prefix length
 	 */
-	if (rt->rt6i_dst.plen < rule->table_prefixlen_min) {
+	if (rt->rt6i_dst.plen < rule->table_prefixlen_min)
+		goto suppress_route;
+
+	/* do not accept result if the route uses a device
+	 * belonging to a forbidden interface group
+	 */
+	if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup)
+		goto suppress_route;
+
+	return false;
+
+suppress_route:
 		ip6_rt_put(rt);
 		return true;
-	}
-	return false;
 }
 
 static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
-- 
cgit v1.2.3


From 8a849bb7f0ac513c2b7202620f611d94301c4a93 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Fri, 2 Aug 2013 17:32:39 +0200
Subject: net: netlink: minor: remove unused pointer in alloc_pg_vec

Variable ptr is being assigned, but never used, so just remove it.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlink/af_netlink.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 0c61b59175dc..6273772aa30c 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -294,14 +294,14 @@ static void **alloc_pg_vec(struct netlink_sock *nlk,
 {
 	unsigned int block_nr = req->nm_block_nr;
 	unsigned int i;
-	void **pg_vec, *ptr;
+	void **pg_vec;
 
 	pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL);
 	if (pg_vec == NULL)
 		return NULL;
 
 	for (i = 0; i < block_nr; i++) {
-		pg_vec[i] = ptr = alloc_one_pg_vec_page(order);
+		pg_vec[i] = alloc_one_pg_vec_page(order);
 		if (pg_vec[i] == NULL)
 			goto err1;
 	}
-- 
cgit v1.2.3


From 63134803a6369dcf7dddf7f0d5e37b9566b308d2 Mon Sep 17 00:00:00 2001
From: Veaceslav Falico <vfalico@redhat.com>
Date: Fri, 2 Aug 2013 19:07:38 +0200
Subject: neighbour: populate neigh_parms on alloc before calling
 ndo_neigh_setup

dev->ndo_neigh_setup() might need some of the values of neigh_parms, so
populate them before calling it.

Signed-off-by: Veaceslav Falico <vfalico@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/neighbour.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index b7de821f98df..576d46f4a69e 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1441,16 +1441,18 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
 		atomic_set(&p->refcnt, 1);
 		p->reachable_time =
 				neigh_rand_reach_time(p->base_reachable_time);
+		dev_hold(dev);
+		p->dev = dev;
+		write_pnet(&p->net, hold_net(net));
+		p->sysctl_table = NULL;
 
 		if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) {
+			release_net(net);
+			dev_put(dev);
 			kfree(p);
 			return NULL;
 		}
 
-		dev_hold(dev);
-		p->dev = dev;
-		write_pnet(&p->net, hold_net(net));
-		p->sysctl_table = NULL;
 		write_lock_bh(&tbl->lock);
 		p->next		= tbl->parms.next;
 		tbl->parms.next = p;
-- 
cgit v1.2.3


From 0c0667a8548ef2985038a5a1d0fa0f64e2774694 Mon Sep 17 00:00:00 2001
From: Wang Sheng-Hui <shhuiw@gmail.com>
Date: Sat, 3 Aug 2013 16:54:50 +0800
Subject: vlan: cleanup the usage of vlan_dev_priv(dev)

This patch cleanup 2 points for the usage of vlan_dev_priv(dev):
* In vlan_dev.c/vlan_dev_hard_header, we should use the var *vlan directly
  after grabing the pointer at the beginning with
        *vlan = vlan_dev_priv(dev);
  when we need to access the fields of *vlan.
* In vlan.c/register_vlan_device, add the var *vlan pointer
        struct vlan_dev_priv *vlan;
to cleanup the code to access the fields of vlan_dev_priv(new_dev).

Signed-off-by: Wang Sheng-Hui <shhuiw@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/8021q/vlan.c     | 12 +++++++-----
 net/8021q/vlan_dev.c |  6 +++---
 2 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 03a92e117657..61fc573f1142 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -210,6 +210,7 @@ out_vid_del:
 static int register_vlan_device(struct net_device *real_dev, u16 vlan_id)
 {
 	struct net_device *new_dev;
+	struct vlan_dev_priv *vlan;
 	struct net *net = dev_net(real_dev);
 	struct vlan_net *vn = net_generic(net, vlan_net_id);
 	char name[IFNAMSIZ];
@@ -260,11 +261,12 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id)
 	new_dev->mtu = real_dev->mtu;
 	new_dev->priv_flags |= (real_dev->priv_flags & IFF_UNICAST_FLT);
 
-	vlan_dev_priv(new_dev)->vlan_proto = htons(ETH_P_8021Q);
-	vlan_dev_priv(new_dev)->vlan_id = vlan_id;
-	vlan_dev_priv(new_dev)->real_dev = real_dev;
-	vlan_dev_priv(new_dev)->dent = NULL;
-	vlan_dev_priv(new_dev)->flags = VLAN_FLAG_REORDER_HDR;
+	vlan = vlan_dev_priv(new_dev);
+	vlan->vlan_proto = htons(ETH_P_8021Q);
+	vlan->vlan_id = vlan_id;
+	vlan->real_dev = real_dev;
+	vlan->dent = NULL;
+	vlan->flags = VLAN_FLAG_REORDER_HDR;
 
 	new_dev->rtnl_link_ops = &vlan_link_ops;
 	err = register_vlan_dev(new_dev);
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 1cd3d2a406f5..9ab8a7ed99c0 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -107,10 +107,10 @@ static int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev,
 	u16 vlan_tci = 0;
 	int rc;
 
-	if (!(vlan_dev_priv(dev)->flags & VLAN_FLAG_REORDER_HDR)) {
+	if (!(vlan->flags & VLAN_FLAG_REORDER_HDR)) {
 		vhdr = (struct vlan_hdr *) skb_push(skb, VLAN_HLEN);
 
-		vlan_tci = vlan_dev_priv(dev)->vlan_id;
+		vlan_tci = vlan->vlan_id;
 		vlan_tci |= vlan_dev_get_egress_qos_mask(dev, skb);
 		vhdr->h_vlan_TCI = htons(vlan_tci);
 
@@ -133,7 +133,7 @@ static int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev,
 		saddr = dev->dev_addr;
 
 	/* Now make the underlying real hard header */
-	dev = vlan_dev_priv(dev)->real_dev;
+	dev = vlan->real_dev;
 	rc = dev_hard_header(skb, dev, type, daddr, saddr, len + vhdrlen);
 	if (rc > 0)
 		rc += vhdrlen;
-- 
cgit v1.2.3


From 73f5698e77219bfc3ea1903759fe8e20ab5b285e Mon Sep 17 00:00:00 2001
From: Stefan Tomanek <stefan.tomanek@wertarbyte.de>
Date: Sat, 3 Aug 2013 14:14:43 +0200
Subject: fib_rules: fix suppressor names and default values

This change brings the suppressor attribute names into line; it also changes
the data types to provide a more consistent interface.

While -1 indicates that the suppressor is not enabled, values >= 0 for
suppress_prefixlen or suppress_ifgroup  reject routing decisions violating the
constraint.

This changes the previously presented behaviour of suppress_prefixlen, where a
prefix length _less_ than the attribute value was rejected. After this change,
a prefix length less than *or* equal to the value is considered a violation of
the rule constraint.

It also changes the default values for default and newly added rules (disabling
any suppression for those).

Signed-off-by: Stefan Tomanek <stefan.tomanek@wertarbyte.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/fib_rules.h        |  4 ++--
 include/uapi/linux/fib_rules.h |  2 +-
 net/core/fib_rules.c           | 15 +++++++++++----
 net/ipv4/fib_rules.c           |  2 +-
 net/ipv6/fib6_rules.c          |  2 +-
 5 files changed, 16 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index d13c461b4b59..9d0fcbaa9cbb 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -19,7 +19,7 @@ struct fib_rule {
 	u32			flags;
 	u32			table;
 	int			suppress_ifgroup;
-	u8			table_prefixlen_min;
+	int			suppress_prefixlen;
 	u8			action;
 	u32			target;
 	struct fib_rule __rcu	*ctarget;
@@ -84,7 +84,7 @@ struct fib_rules_ops {
 	[FRA_FWMARK]	= { .type = NLA_U32 }, \
 	[FRA_FWMASK]	= { .type = NLA_U32 }, \
 	[FRA_TABLE]     = { .type = NLA_U32 }, \
-	[FRA_TABLE_PREFIXLEN_MIN] = { .type = NLA_U8 }, \
+	[FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, \
 	[FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \
 	[FRA_GOTO]	= { .type = NLA_U32 }
 
diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h
index 63e31166e85b..2b82d7e30974 100644
--- a/include/uapi/linux/fib_rules.h
+++ b/include/uapi/linux/fib_rules.h
@@ -45,7 +45,7 @@ enum {
 	FRA_FLOW,	/* flow/class id */
 	FRA_UNUSED6,
 	FRA_SUPPRESS_IFGROUP,
-	FRA_TABLE_PREFIXLEN_MIN,
+	FRA_SUPPRESS_PREFIXLEN,
 	FRA_TABLE,	/* Extended table id */
 	FRA_FWMASK,	/* mask for netfilter mark */
 	FRA_OIFNAME,
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 5040a61bf28a..2e654138433c 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -33,6 +33,9 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
 	r->flags = flags;
 	r->fr_net = hold_net(ops->fro_net);
 
+	r->suppress_prefixlen = -1;
+	r->suppress_ifgroup = -1;
+
 	/* The lock is not required here, the list in unreacheable
 	 * at the moment this function is called */
 	list_add_tail(&r->list, &ops->rules_list);
@@ -340,11 +343,15 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
 	rule->action = frh->action;
 	rule->flags = frh->flags;
 	rule->table = frh_get_table(frh, tb);
-	if (tb[FRA_TABLE_PREFIXLEN_MIN])
-		rule->table_prefixlen_min = nla_get_u8(tb[FRA_TABLE_PREFIXLEN_MIN]);
+	if (tb[FRA_SUPPRESS_PREFIXLEN])
+		rule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]);
+	else
+		rule->suppress_prefixlen = -1;
 
 	if (tb[FRA_SUPPRESS_IFGROUP])
 		rule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]);
+	else
+		rule->suppress_ifgroup = -1;
 
 	if (!tb[FRA_PRIORITY] && ops->default_pref)
 		rule->pref = ops->default_pref(ops);
@@ -531,7 +538,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
 			 + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */
 			 + nla_total_size(4) /* FRA_PRIORITY */
 			 + nla_total_size(4) /* FRA_TABLE */
-			 + nla_total_size(1) /* FRA_TABLE_PREFIXLEN_MIN */
+			 + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */
 			 + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */
 			 + nla_total_size(4) /* FRA_FWMARK */
 			 + nla_total_size(4); /* FRA_FWMASK */
@@ -558,7 +565,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
 	frh->table = rule->table;
 	if (nla_put_u32(skb, FRA_TABLE, rule->table))
 		goto nla_put_failure;
-	if (nla_put_u8(skb, FRA_TABLE_PREFIXLEN_MIN, rule->table_prefixlen_min))
+	if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen))
 		goto nla_put_failure;
 	frh->res1 = 0;
 	frh->res2 = 0;
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index b78fd28970c9..523be38e37de 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -109,7 +109,7 @@ static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg
 	/* do not accept result if the route does
 	 * not meet the required prefix length
 	 */
-	if (result->prefixlen < rule->table_prefixlen_min)
+	if (result->prefixlen <= rule->suppress_prefixlen)
 		goto suppress_route;
 
 	/* do not accept result if the route uses a device
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 36283267e2f8..a6c58ce43d34 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -126,7 +126,7 @@ static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg
 	/* do not accept result if the route does
 	 * not meet the required prefix length
 	 */
-	if (rt->rt6i_dst.plen < rule->table_prefixlen_min)
+	if (rt->rt6i_dst.plen <= rule->suppress_prefixlen)
 		goto suppress_route;
 
 	/* do not accept result if the route uses a device
-- 
cgit v1.2.3


From 762a3d89ebf5873f71b3839449ac6562049ef1ce Mon Sep 17 00:00:00 2001
From: stephen hemminger <stephen@networkplumber.org>
Date: Sun, 4 Aug 2013 17:19:38 -0700
Subject: bridge: fix rcu check warning in multicast port group

Use of RCU here with out marked pointer and function doesn't match prototype
with sparse.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_mdb.c     | 3 ++-
 net/bridge/br_private.h | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 0daae3ec2355..e4d5cd43b7fb 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -61,7 +61,8 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
 
 	for (i = 0; i < mdb->max; i++) {
 		struct net_bridge_mdb_entry *mp;
-		struct net_bridge_port_group *p, **pp;
+		struct net_bridge_port_group *p;
+		struct net_bridge_port_group __rcu **pp;
 		struct net_bridge_port *port;
 
 		hlist_for_each_entry_rcu(mp, &mdb->mhash[i], hlist[mdb->ver]) {
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 1514c9f4f102..d41283c57952 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -466,7 +466,7 @@ extern void br_multicast_free_pg(struct rcu_head *head);
 extern struct net_bridge_port_group *br_multicast_new_port_group(
 				struct net_bridge_port *port,
 				struct br_ip *group,
-				struct net_bridge_port_group *next,
+				struct net_bridge_port_group __rcu *next,
 				unsigned char state);
 extern void br_mdb_init(void);
 extern void br_mdb_uninit(void);
-- 
cgit v1.2.3


From e473fcb472574de978e47f980aeca510020a1286 Mon Sep 17 00:00:00 2001
From: Mathias Krause <minipli@googlemail.com>
Date: Wed, 26 Jun 2013 23:56:58 +0200
Subject: xfrm: constify mark argument of xfrm_find_acq()

The mark argument is read only, so constify it. Also make dummy_mark in
af_key const -- only used as dummy argument for this very function.

Signed-off-by: Mathias Krause <minipli@googlemail.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h    |  2 +-
 net/key/af_key.c      |  2 +-
 net/xfrm/xfrm_state.c | 12 +++++++-----
 3 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 94ce082b29dc..89d3d8ae204e 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1548,7 +1548,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8, int dir, u32
 int xfrm_policy_flush(struct net *net, u8 type, struct xfrm_audit *audit_info);
 u32 xfrm_get_acqseq(void);
 extern int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi);
-struct xfrm_state *xfrm_find_acq(struct net *net, struct xfrm_mark *mark,
+struct xfrm_state *xfrm_find_acq(struct net *net, const struct xfrm_mark *mark,
 				 u8 mode, u32 reqid, u8 proto,
 				 const xfrm_address_t *daddr,
 				 const xfrm_address_t *saddr, int create,
diff --git a/net/key/af_key.c b/net/key/af_key.c
index ab8bd2cabfa0..4089a210e852 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -45,7 +45,7 @@ struct netns_pfkey {
 static DEFINE_MUTEX(pfkey_mutex);
 
 #define DUMMY_MARK 0
-static struct xfrm_mark dummy_mark = {0, 0};
+static const struct xfrm_mark dummy_mark = {0, 0};
 struct pfkey_sock {
 	/* struct sock must be the first member of struct pfkey_sock */
 	struct sock	sk;
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 78f66fa92449..b2cd806c0be3 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -990,11 +990,13 @@ void xfrm_state_insert(struct xfrm_state *x)
 EXPORT_SYMBOL(xfrm_state_insert);
 
 /* xfrm_state_lock is held */
-static struct xfrm_state *__find_acq_core(struct net *net, struct xfrm_mark *m,
+static struct xfrm_state *__find_acq_core(struct net *net,
+					  const struct xfrm_mark *m,
 					  unsigned short family, u8 mode,
 					  u32 reqid, u8 proto,
 					  const xfrm_address_t *daddr,
-					  const xfrm_address_t *saddr, int create)
+					  const xfrm_address_t *saddr,
+					  int create)
 {
 	unsigned int h = xfrm_dst_hash(net, daddr, saddr, reqid, family);
 	struct xfrm_state *x;
@@ -1399,9 +1401,9 @@ xfrm_state_lookup_byaddr(struct net *net, u32 mark,
 EXPORT_SYMBOL(xfrm_state_lookup_byaddr);
 
 struct xfrm_state *
-xfrm_find_acq(struct net *net, struct xfrm_mark *mark, u8 mode, u32 reqid, u8 proto,
-	      const xfrm_address_t *daddr, const xfrm_address_t *saddr,
-	      int create, unsigned short family)
+xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid,
+	      u8 proto, const xfrm_address_t *daddr,
+	      const xfrm_address_t *saddr, int create, unsigned short family)
 {
 	struct xfrm_state *x;
 
-- 
cgit v1.2.3


From 8603b9556e1727f0de7e43ef448c85ff93347f27 Mon Sep 17 00:00:00 2001
From: Mathias Krause <minipli@googlemail.com>
Date: Wed, 26 Jun 2013 23:56:59 +0200
Subject: af_key: constify lookup tables

The lookup tables for minimum sizes of extensions and for the pfkey
handler functions are read only, therefore can be const.

Signed-off-by: Mathias Krause <minipli@googlemail.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/key/af_key.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/key/af_key.c b/net/key/af_key.c
index 4089a210e852..d49f67681823 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -338,7 +338,7 @@ static int pfkey_error(const struct sadb_msg *orig, int err, struct sock *sk)
 	return 0;
 }
 
-static u8 sadb_ext_min_len[] = {
+static const u8 sadb_ext_min_len[] = {
 	[SADB_EXT_RESERVED]		= (u8) 0,
 	[SADB_EXT_SA]			= (u8) sizeof(struct sadb_sa),
 	[SADB_EXT_LIFETIME_CURRENT]	= (u8) sizeof(struct sadb_lifetime),
@@ -2737,7 +2737,7 @@ static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, const struct sad
 
 typedef int (*pfkey_handler)(struct sock *sk, struct sk_buff *skb,
 			     const struct sadb_msg *hdr, void * const *ext_hdrs);
-static pfkey_handler pfkey_funcs[SADB_MAX + 1] = {
+static const pfkey_handler pfkey_funcs[SADB_MAX + 1] = {
 	[SADB_RESERVED]		= pfkey_reserved,
 	[SADB_GETSPI]		= pfkey_getspi,
 	[SADB_UPDATE]		= pfkey_add,
-- 
cgit v1.2.3


From d8b3bfc253d8063fcce9c447ecc4cf3b1735b13a Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sun, 4 Aug 2013 11:37:29 +0200
Subject: netfilter: tproxy: fix build with IP6_NF_IPTABLES=n

after commit 93742cf (netfilter: tproxy: remove nf_tproxy_core.h)

CONFIG_IPV6=y
CONFIG_IP6_NF_IPTABLES=n

gives us:

net/netfilter/xt_TPROXY.c: In function 'nf_tproxy_get_sock_v6':
net/netfilter/xt_TPROXY.c:178:4: error: implicit declaration of function 'inet6_lookup_listener'

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_TPROXY.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 851383a7f461..5d8a3a3cd5a7 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -161,7 +161,7 @@ nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
 	return sk;
 }
 
-#if IS_ENABLED(CONFIG_IPV6)
+#ifdef XT_TPROXY_HAVE_IPV6
 static inline struct sock *
 nf_tproxy_get_sock_v6(struct net *net, const u8 protocol,
 		      const struct in6_addr *saddr, const struct in6_addr *daddr,
-- 
cgit v1.2.3


From 0659eea912cf2ce65298784b918904ba0f91f5e1 Mon Sep 17 00:00:00 2001
From: Fan Du <fan.du@windriver.com>
Date: Thu, 1 Aug 2013 18:08:36 +0800
Subject: xfrm: Delete hold_timer when destroy policy

Both policy timer and hold_timer need to be deleted when destroy policy

Signed-off-by: Fan Du <fan.du@windriver.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_policy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index d8da6b8c6ba8..f7078eb5b64e 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -308,7 +308,7 @@ void xfrm_policy_destroy(struct xfrm_policy *policy)
 {
 	BUG_ON(!policy->walk.dead);
 
-	if (del_timer(&policy->timer))
+	if (del_timer(&policy->timer) || del_timer(&policy->polq.hold_timer))
 		BUG();
 
 	security_xfrm_policy_free(policy->security);
-- 
cgit v1.2.3


From 70e3ca79cd0e98d4191aa10b3393c23a9b84a2e8 Mon Sep 17 00:00:00 2001
From: Dragos Foianu <dragos.foianu@gmail.com>
Date: Thu, 11 Jul 2013 09:45:42 +0300
Subject: ipvs: fixed spacing at for statements

found using checkpatch.pl

Signed-off-by: Dragos Foianu <dragos.foianu@gmail.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_lblcr.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 3cd85b2fc67c..5199448697f6 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -414,7 +414,7 @@ static void ip_vs_lblcr_flush(struct ip_vs_service *svc)
 
 	spin_lock_bh(&svc->sched_lock);
 	tbl->dead = 1;
-	for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+	for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) {
 		hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
 			ip_vs_lblcr_free(en);
 		}
@@ -440,7 +440,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
 	struct ip_vs_lblcr_entry *en;
 	struct hlist_node *next;
 
-	for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+	for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) {
 		j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
 
 		spin_lock(&svc->sched_lock);
@@ -495,7 +495,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
 	if (goal > tbl->max_size/2)
 		goal = tbl->max_size/2;
 
-	for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+	for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) {
 		j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
 
 		spin_lock(&svc->sched_lock);
@@ -536,7 +536,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
 	/*
 	 *    Initialize the hash buckets
 	 */
-	for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+	for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) {
 		INIT_HLIST_HEAD(&tbl->bucket[i]);
 	}
 	tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
-- 
cgit v1.2.3


From e7f1935c11269bc53cd52425b1025657adddb839 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 25 Jul 2013 21:45:17 +0200
Subject: wireless: make TU conversion macros available

A few places in the code (mac80211 and iwlmvm) use the same
TU_TO_JIFFIES() macro and could use TU_TO_EXP_TIME() that
mac80211 has. Make these available to everyone and use them.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/iwlwifi/mvm/time-event.c | 7 ++-----
 include/linux/ieee80211.h                     | 4 ++++
 net/mac80211/ieee80211_i.h                    | 3 ---
 3 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/drivers/net/wireless/iwlwifi/mvm/time-event.c b/drivers/net/wireless/iwlwifi/mvm/time-event.c
index ad9bbca99213..9f100363b177 100644
--- a/drivers/net/wireless/iwlwifi/mvm/time-event.c
+++ b/drivers/net/wireless/iwlwifi/mvm/time-event.c
@@ -73,7 +73,6 @@
 #include "iwl-prph.h"
 
 /* A TimeUnit is 1024 microsecond */
-#define TU_TO_JIFFIES(_tu)	(usecs_to_jiffies((_tu) * 1024))
 #define MSEC_TO_TU(_msec)	(_msec*1000/1024)
 
 /*
@@ -191,8 +190,7 @@ static void iwl_mvm_te_handle_notif(struct iwl_mvm *mvm,
 		iwl_mvm_te_clear_data(mvm, te_data);
 	} else if (le32_to_cpu(notif->action) & TE_NOTIF_HOST_EVENT_START) {
 		te_data->running = true;
-		te_data->end_jiffies = jiffies +
-			TU_TO_JIFFIES(te_data->duration);
+		te_data->end_jiffies = TU_TO_EXP_TIME(te_data->duration);
 
 		if (te_data->vif->type == NL80211_IFTYPE_P2P_DEVICE) {
 			set_bit(IWL_MVM_STATUS_ROC_RUNNING, &mvm->status);
@@ -329,8 +327,7 @@ void iwl_mvm_protect_session(struct iwl_mvm *mvm,
 	lockdep_assert_held(&mvm->mutex);
 
 	if (te_data->running &&
-	    time_after(te_data->end_jiffies,
-		       jiffies + TU_TO_JIFFIES(min_duration))) {
+	    time_after(te_data->end_jiffies, TU_TO_EXP_TIME(min_duration))) {
 		IWL_DEBUG_TE(mvm, "We have enough time in the current TE: %u\n",
 			     jiffies_to_msecs(te_data->end_jiffies - jiffies));
 		return;
diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index b3ce299782af..23a8877f4ded 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2288,4 +2288,8 @@ static inline bool ieee80211_check_tim(const struct ieee80211_tim_ie *tim,
 	return !!(tim->virtual_map[index] & mask);
 }
 
+/* convert time units */
+#define TU_TO_JIFFIES(x)	(usecs_to_jiffies((x) * 1024))
+#define TU_TO_EXP_TIME(x)	(jiffies + TU_TO_JIFFIES(x))
+
 #endif /* LINUX_IEEE80211_H */
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index e94c84050e9c..b6186517ec56 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -53,9 +53,6 @@ struct ieee80211_local;
  * increased memory use (about 2 kB of RAM per entry). */
 #define IEEE80211_FRAGMENT_MAX 4
 
-#define TU_TO_JIFFIES(x)	(usecs_to_jiffies((x) * 1024))
-#define TU_TO_EXP_TIME(x)	(jiffies + TU_TO_JIFFIES(x))
-
 /* power level hasn't been configured (or set to automatic) */
 #define IEEE80211_UNSET_POWER_LEVEL	INT_MIN
 
-- 
cgit v1.2.3


From 54e35cc52346149a7bce8a2f622e215ed17bb56d Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Tue, 6 Aug 2013 11:20:23 +0200
Subject: ipvs: ip_vs_sh: ip_vs_sh_get_port: check skb_header_pointer for NULL

skb_header_pointer could return NULL, so check for it as we do it
everywhere else in ipvs code. This fixes a coverity warning.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_sh.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index f16c027df15b..3588faebe529 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -269,14 +269,20 @@ ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
 	switch (iph->protocol) {
 	case IPPROTO_TCP:
 		th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
+		if (unlikely(th == NULL))
+			return 0;
 		port = th->source;
 		break;
 	case IPPROTO_UDP:
 		uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph);
+		if (unlikely(uh == NULL))
+			return 0;
 		port = uh->source;
 		break;
 	case IPPROTO_SCTP:
 		sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph);
+		if (unlikely(sh == NULL))
+			return 0;
 		port = sh->source;
 		break;
 	default:
-- 
cgit v1.2.3


From af83fde7513b6c4acd869ad4fb93893704439470 Mon Sep 17 00:00:00 2001
From: Fan Du <fan.du@windriver.com>
Date: Tue, 6 Aug 2013 15:50:56 +0800
Subject: xfrm: Remove rebundant address family checking

present_and_same_family has checked addresses family validness for both
SADB_EXT_ADDRESS_SRC and SADB_EXT_ADDRESS_DST in the beginning.
Thereafter pfkey_sadb_addr2xfrm_addr doesn't need to do the checking again.

Signed-off-by: Fan Du <fan.du@windriver.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/key/af_key.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'net')

diff --git a/net/key/af_key.c b/net/key/af_key.c
index d49f67681823..9d585370c5b4 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1196,10 +1196,6 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net,
 
 	x->props.family = pfkey_sadb_addr2xfrm_addr((struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
 						    &x->props.saddr);
-	if (!x->props.family) {
-		err = -EAFNOSUPPORT;
-		goto out;
-	}
 	pfkey_sadb_addr2xfrm_addr((struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_DST-1],
 				  &x->id.daddr);
 
@@ -2205,10 +2201,6 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, const struct sadb_
 
 	sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1];
 	xp->family = pfkey_sadb_addr2xfrm_addr(sa, &xp->selector.saddr);
-	if (!xp->family) {
-		err = -EINVAL;
-		goto out;
-	}
 	xp->selector.family = xp->family;
 	xp->selector.prefixlen_s = sa->sadb_address_prefixlen;
 	xp->selector.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto);
-- 
cgit v1.2.3


From 6261d983f226f0a6a8d4d32b57a032bc23a5ebb6 Mon Sep 17 00:00:00 2001
From: stephen hemminger <stephen@networkplumber.org>
Date: Mon, 5 Aug 2013 22:51:37 -0700
Subject: ip_tunnel: embed hash list head

The IP tunnel hash heads can be embedded in the per-net structure
since it is a fixed size. Reduce the size so that the total structure
fits in a page size. The original size was overly large, even NETDEV_HASHBITS
is only 8 bits!

Also, add some white space for readability.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Acked-by: Pravin B Shelar <pshelar@nicira.com>.
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_tunnels.h |  4 ++--
 net/ipv4/ip_tunnel.c     | 13 ++++++-------
 2 files changed, 8 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 781b3cf86a2f..c6acd9f8f877 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -86,12 +86,12 @@ struct tnl_ptk_info {
 #define PACKET_RCVD	0
 #define PACKET_REJECT	1
 
-#define IP_TNL_HASH_BITS   10
+#define IP_TNL_HASH_BITS   7
 #define IP_TNL_HASH_SIZE   (1 << IP_TNL_HASH_BITS)
 
 struct ip_tunnel_net {
-	struct hlist_head *tunnels;
 	struct net_device *fb_tunnel_dev;
+	struct hlist_head tunnels[IP_TNL_HASH_SIZE];
 };
 
 #ifdef CONFIG_INET
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index ca1cb2d5f6e2..9fdf8a6d95f3 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -838,15 +838,16 @@ int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
 {
 	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
 	struct ip_tunnel_parm parms;
+	unsigned int i;
 
-	itn->tunnels = kzalloc(IP_TNL_HASH_SIZE * sizeof(struct hlist_head), GFP_KERNEL);
-	if (!itn->tunnels)
-		return -ENOMEM;
+	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
+		INIT_HLIST_HEAD(&itn->tunnels[i]);
 
 	if (!ops) {
 		itn->fb_tunnel_dev = NULL;
 		return 0;
 	}
+
 	memset(&parms, 0, sizeof(parms));
 	if (devname)
 		strlcpy(parms.name, devname, IFNAMSIZ);
@@ -854,10 +855,9 @@ int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
 	rtnl_lock();
 	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
 	rtnl_unlock();
-	if (IS_ERR(itn->fb_tunnel_dev)) {
-		kfree(itn->tunnels);
+
+	if (IS_ERR(itn->fb_tunnel_dev))
 		return PTR_ERR(itn->fb_tunnel_dev);
-	}
 
 	return 0;
 }
@@ -887,7 +887,6 @@ void ip_tunnel_delete_net(struct ip_tunnel_net *itn)
 	ip_tunnel_destroy(itn, &list);
 	unregister_netdevice_many(&list);
 	rtnl_unlock();
-	kfree(itn->tunnels);
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
 
-- 
cgit v1.2.3


From b4bf07771faaf959b0a916d35b1b930c030e30a8 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Tue, 6 Aug 2013 17:45:03 +0800
Subject: net: move iov_pages() to net/core/iovec.c

To let it be reused and reduce code duplication.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macvtap.c  | 23 -----------------------
 drivers/net/tun.c      | 23 -----------------------
 include/linux/socket.h |  2 ++
 net/core/iovec.c       | 24 ++++++++++++++++++++++++
 4 files changed, 26 insertions(+), 46 deletions(-)

(limited to 'net')

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index a98fb0ed6aef..dfec20df17ba 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -698,29 +698,6 @@ static int macvtap_skb_to_vnet_hdr(const struct sk_buff *skb,
 	return 0;
 }
 
-static unsigned long iov_pages(const struct iovec *iv, int offset,
-			       unsigned long nr_segs)
-{
-	unsigned long seg, base;
-	int pages = 0, len, size;
-
-	while (nr_segs && (offset >= iv->iov_len)) {
-		offset -= iv->iov_len;
-		++iv;
-		--nr_segs;
-	}
-
-	for (seg = 0; seg < nr_segs; seg++) {
-		base = (unsigned long)iv[seg].iov_base + offset;
-		len = iv[seg].iov_len - offset;
-		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
-		pages += size;
-		offset = 0;
-	}
-
-	return pages;
-}
-
 /* Get packet from user space buffer */
 static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
 				const struct iovec *iv, unsigned long total_len,
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 5dce262f538e..18527ef0cc75 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1041,29 +1041,6 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
 	return 0;
 }
 
-static unsigned long iov_pages(const struct iovec *iv, int offset,
-			       unsigned long nr_segs)
-{
-	unsigned long seg, base;
-	int pages = 0, len, size;
-
-	while (nr_segs && (offset >= iv->iov_len)) {
-		offset -= iv->iov_len;
-		++iv;
-		--nr_segs;
-	}
-
-	for (seg = 0; seg < nr_segs; seg++) {
-		base = (unsigned long)iv[seg].iov_base + offset;
-		len = iv[seg].iov_len - offset;
-		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
-		pages += size;
-		offset = 0;
-	}
-
-	return pages;
-}
-
 /* Get packet from user space buffer */
 static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 			    void *msg_control, const struct iovec *iv,
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 230c04bda3e2..445ef7519dc2 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -313,6 +313,8 @@ extern int csum_partial_copy_fromiovecend(unsigned char *kdata,
 					  struct iovec *iov, 
 					  int offset, 
 					  unsigned int len, __wsum *csump);
+extern unsigned long iov_pages(const struct iovec *iov, int offset,
+			       unsigned long nr_segs);
 
 extern int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *address, int mode);
 extern int memcpy_toiovecend(const struct iovec *v, unsigned char *kdata,
diff --git a/net/core/iovec.c b/net/core/iovec.c
index de178e462682..b77eeecc0011 100644
--- a/net/core/iovec.c
+++ b/net/core/iovec.c
@@ -212,3 +212,27 @@ out_fault:
 	goto out;
 }
 EXPORT_SYMBOL(csum_partial_copy_fromiovecend);
+
+unsigned long iov_pages(const struct iovec *iov, int offset,
+			unsigned long nr_segs)
+{
+	unsigned long seg, base;
+	int pages = 0, len, size;
+
+	while (nr_segs && (offset >= iov->iov_len)) {
+		offset -= iov->iov_len;
+		++iov;
+		--nr_segs;
+	}
+
+	for (seg = 0; seg < nr_segs; seg++) {
+		base = (unsigned long)iov[seg].iov_base + offset;
+		len = iov[seg].iov_len - offset;
+		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+		pages += size;
+		offset = 0;
+	}
+
+	return pages;
+}
+EXPORT_SYMBOL(iov_pages);
-- 
cgit v1.2.3


From c3bdeb5c7cc073ccf5ff9624642022a8613a956e Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Tue, 6 Aug 2013 17:45:04 +0800
Subject: net: move zerocopy_sg_from_iovec() to net/core/datagram.c

To let it be reused and reduce code duplication. Also document this function.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macvtap.c  | 80 -------------------------------------------
 drivers/net/tun.c      | 80 -------------------------------------------
 include/linux/skbuff.h |  4 +++
 net/core/datagram.c    | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 97 insertions(+), 160 deletions(-)

(limited to 'net')

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index dfec20df17ba..182364abfa35 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -536,86 +536,6 @@ static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad,
 	return skb;
 }
 
-/* set skb frags from iovec, this can move to core network code for reuse */
-static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
-				  int offset, size_t count)
-{
-	int len = iov_length(from, count) - offset;
-	int copy = skb_headlen(skb);
-	int size, offset1 = 0;
-	int i = 0;
-
-	/* Skip over from offset */
-	while (count && (offset >= from->iov_len)) {
-		offset -= from->iov_len;
-		++from;
-		--count;
-	}
-
-	/* copy up to skb headlen */
-	while (count && (copy > 0)) {
-		size = min_t(unsigned int, copy, from->iov_len - offset);
-		if (copy_from_user(skb->data + offset1, from->iov_base + offset,
-				   size))
-			return -EFAULT;
-		if (copy > size) {
-			++from;
-			--count;
-			offset = 0;
-		} else
-			offset += size;
-		copy -= size;
-		offset1 += size;
-	}
-
-	if (len == offset1)
-		return 0;
-
-	while (count--) {
-		struct page *page[MAX_SKB_FRAGS];
-		int num_pages;
-		unsigned long base;
-		unsigned long truesize;
-
-		len = from->iov_len - offset;
-		if (!len) {
-			offset = 0;
-			++from;
-			continue;
-		}
-		base = (unsigned long)from->iov_base + offset;
-		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
-		if (i + size > MAX_SKB_FRAGS)
-			return -EMSGSIZE;
-		num_pages = get_user_pages_fast(base, size, 0, &page[i]);
-		if (num_pages != size) {
-			int j;
-
-			for (j = 0; j < num_pages; j++)
-				put_page(page[i + j]);
-			return -EFAULT;
-		}
-		truesize = size * PAGE_SIZE;
-		skb->data_len += len;
-		skb->len += len;
-		skb->truesize += truesize;
-		atomic_add(truesize, &skb->sk->sk_wmem_alloc);
-		while (len) {
-			int off = base & ~PAGE_MASK;
-			int size = min_t(int, len, PAGE_SIZE - off);
-			__skb_fill_page_desc(skb, i, page[i], off, size);
-			skb_shinfo(skb)->nr_frags++;
-			/* increase sk_wmem_alloc */
-			base += size;
-			len -= size;
-			i++;
-		}
-		offset = 0;
-		++from;
-	}
-	return 0;
-}
-
 /*
  * macvtap_skb_from_vnet_hdr and macvtap_skb_to_vnet_hdr should
  * be shared with the tun/tap driver.
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 18527ef0cc75..b1630479d4a1 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -961,86 +961,6 @@ static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
 	return skb;
 }
 
-/* set skb frags from iovec, this can move to core network code for reuse */
-static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
-				  int offset, size_t count)
-{
-	int len = iov_length(from, count) - offset;
-	int copy = skb_headlen(skb);
-	int size, offset1 = 0;
-	int i = 0;
-
-	/* Skip over from offset */
-	while (count && (offset >= from->iov_len)) {
-		offset -= from->iov_len;
-		++from;
-		--count;
-	}
-
-	/* copy up to skb headlen */
-	while (count && (copy > 0)) {
-		size = min_t(unsigned int, copy, from->iov_len - offset);
-		if (copy_from_user(skb->data + offset1, from->iov_base + offset,
-				   size))
-			return -EFAULT;
-		if (copy > size) {
-			++from;
-			--count;
-			offset = 0;
-		} else
-			offset += size;
-		copy -= size;
-		offset1 += size;
-	}
-
-	if (len == offset1)
-		return 0;
-
-	while (count--) {
-		struct page *page[MAX_SKB_FRAGS];
-		int num_pages;
-		unsigned long base;
-		unsigned long truesize;
-
-		len = from->iov_len - offset;
-		if (!len) {
-			offset = 0;
-			++from;
-			continue;
-		}
-		base = (unsigned long)from->iov_base + offset;
-		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
-		if (i + size > MAX_SKB_FRAGS)
-			return -EMSGSIZE;
-		num_pages = get_user_pages_fast(base, size, 0, &page[i]);
-		if (num_pages != size) {
-			int j;
-
-			for (j = 0; j < num_pages; j++)
-				put_page(page[i + j]);
-			return -EFAULT;
-		}
-		truesize = size * PAGE_SIZE;
-		skb->data_len += len;
-		skb->len += len;
-		skb->truesize += truesize;
-		atomic_add(truesize, &skb->sk->sk_wmem_alloc);
-		while (len) {
-			int off = base & ~PAGE_MASK;
-			int size = min_t(int, len, PAGE_SIZE - off);
-			__skb_fill_page_desc(skb, i, page[i], off, size);
-			skb_shinfo(skb)->nr_frags++;
-			/* increase sk_wmem_alloc */
-			base += size;
-			len -= size;
-			i++;
-		}
-		offset = 0;
-		++from;
-	}
-	return 0;
-}
-
 /* Get packet from user space buffer */
 static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 			    void *msg_control, const struct iovec *iv,
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ae46475d4568..5ac96f31d546 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2359,6 +2359,10 @@ extern int	       skb_copy_datagram_from_iovec(struct sk_buff *skb,
 						    const struct iovec *from,
 						    int from_offset,
 						    int len);
+extern int	       zerocopy_sg_from_iovec(struct sk_buff *skb,
+					      const struct iovec *frm,
+					      int offset,
+					      size_t count);
 extern int	       skb_copy_datagram_const_iovec(const struct sk_buff *from,
 						     int offset,
 						     const struct iovec *to,
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 8ab48cd89559..f987faef21ce 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -573,6 +573,99 @@ fault:
 }
 EXPORT_SYMBOL(skb_copy_datagram_from_iovec);
 
+/**
+ *	zerocopy_sg_from_iovec - Build a zerocopy datagram from an iovec
+ *	@skb: buffer to copy
+ *	@from: io vector to copy to
+ *	@offset: offset in the io vector to start copying from
+ *	@count: amount of vectors to copy to buffer from
+ *
+ *	The function will first copy up to headlen, and then pin the userspace
+ *	pages and build frags through them.
+ *
+ *	Returns 0, -EFAULT or -EMSGSIZE.
+ *	Note: the iovec is not modified during the copy
+ */
+int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
+				  int offset, size_t count)
+{
+	int len = iov_length(from, count) - offset;
+	int copy = skb_headlen(skb);
+	int size, offset1 = 0;
+	int i = 0;
+
+	/* Skip over from offset */
+	while (count && (offset >= from->iov_len)) {
+		offset -= from->iov_len;
+		++from;
+		--count;
+	}
+
+	/* copy up to skb headlen */
+	while (count && (copy > 0)) {
+		size = min_t(unsigned int, copy, from->iov_len - offset);
+		if (copy_from_user(skb->data + offset1, from->iov_base + offset,
+				   size))
+			return -EFAULT;
+		if (copy > size) {
+			++from;
+			--count;
+			offset = 0;
+		} else
+			offset += size;
+		copy -= size;
+		offset1 += size;
+	}
+
+	if (len == offset1)
+		return 0;
+
+	while (count--) {
+		struct page *page[MAX_SKB_FRAGS];
+		int num_pages;
+		unsigned long base;
+		unsigned long truesize;
+
+		len = from->iov_len - offset;
+		if (!len) {
+			offset = 0;
+			++from;
+			continue;
+		}
+		base = (unsigned long)from->iov_base + offset;
+		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+		if (i + size > MAX_SKB_FRAGS)
+			return -EMSGSIZE;
+		num_pages = get_user_pages_fast(base, size, 0, &page[i]);
+		if (num_pages != size) {
+			int j;
+
+			for (j = 0; j < num_pages; j++)
+				put_page(page[i + j]);
+			return -EFAULT;
+		}
+		truesize = size * PAGE_SIZE;
+		skb->data_len += len;
+		skb->len += len;
+		skb->truesize += truesize;
+		atomic_add(truesize, &skb->sk->sk_wmem_alloc);
+		while (len) {
+			int off = base & ~PAGE_MASK;
+			int size = min_t(int, len, PAGE_SIZE - off);
+			__skb_fill_page_desc(skb, i, page[i], off, size);
+			skb_shinfo(skb)->nr_frags++;
+			/* increase sk_wmem_alloc */
+			base += size;
+			len -= size;
+			i++;
+		}
+		offset = 0;
+		++from;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(zerocopy_sg_from_iovec);
+
 static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
 				      u8 __user *to, int len,
 				      __wsum *csump)
-- 
cgit v1.2.3


From 0a57ec62dfa4b639d86aba60c236904def1d0356 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Tue, 6 Aug 2013 17:45:05 +0800
Subject: net: use skb_fill_page_desc() in zerocopy_sg_from_iovec()

Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/datagram.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/datagram.c b/net/core/datagram.c
index f987faef21ce..bec08672644a 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -652,8 +652,7 @@ int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
 		while (len) {
 			int off = base & ~PAGE_MASK;
 			int size = min_t(int, len, PAGE_SIZE - off);
-			__skb_fill_page_desc(skb, i, page[i], off, size);
-			skb_shinfo(skb)->nr_frags++;
+			skb_fill_page_desc(skb, i, page[i], off, size);
 			/* increase sk_wmem_alloc */
 			base += size;
 			len -= size;
-- 
cgit v1.2.3


From 234a4267085e85d0cc5552c340833bb2d82a50e7 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Tue, 6 Aug 2013 17:45:06 +0800
Subject: net: remove the useless comment in zerocopy_sg_from_iovec()

Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/datagram.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/core/datagram.c b/net/core/datagram.c
index bec08672644a..a8a795cfeff5 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -653,7 +653,6 @@ int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
 			int off = base & ~PAGE_MASK;
 			int size = min_t(int, len, PAGE_SIZE - off);
 			skb_fill_page_desc(skb, i, page[i], off, size);
-			/* increase sk_wmem_alloc */
 			base += size;
 			len -= size;
 			i++;
-- 
cgit v1.2.3


From 0433547aa7443cefc89d9b533169bdc50f1206b8 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Tue, 6 Aug 2013 17:45:07 +0800
Subject: net: use release_pages() in zerocopy_sg_from_iovec()

To reduce the duplicated codes.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/datagram.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/core/datagram.c b/net/core/datagram.c
index a8a795cfeff5..badcd93e5dd8 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -48,6 +48,7 @@
 #include <linux/highmem.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
+#include <linux/pagemap.h>
 
 #include <net/protocol.h>
 #include <linux/skbuff.h>
@@ -638,10 +639,7 @@ int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
 			return -EMSGSIZE;
 		num_pages = get_user_pages_fast(base, size, 0, &page[i]);
 		if (num_pages != size) {
-			int j;
-
-			for (j = 0; j < num_pages; j++)
-				put_page(page[i + j]);
+			release_pages(&page[i], num_pages, 0);
 			return -EFAULT;
 		}
 		truesize = size * PAGE_SIZE;
-- 
cgit v1.2.3


From 3d9953a2ef2182d56e268742259b11dedb8e281d Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Tue, 6 Aug 2013 17:45:08 +0800
Subject: net: use skb_copy_datagram_from_iovec() in zerocopy_sg_from_iovec()

Use skb_copy_datagram_from_iovec() to avoid code duplication and make it easy to
be read. Also we can do the skipping inside the zero-copy loop.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/datagram.c | 37 ++++++++++---------------------------
 1 file changed, 10 insertions(+), 27 deletions(-)

(limited to 'net')

diff --git a/net/core/datagram.c b/net/core/datagram.c
index badcd93e5dd8..af814e764206 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -591,48 +591,31 @@ int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
 				  int offset, size_t count)
 {
 	int len = iov_length(from, count) - offset;
-	int copy = skb_headlen(skb);
-	int size, offset1 = 0;
+	int copy = min_t(int, skb_headlen(skb), len);
+	int size;
 	int i = 0;
 
-	/* Skip over from offset */
-	while (count && (offset >= from->iov_len)) {
-		offset -= from->iov_len;
-		++from;
-		--count;
-	}
-
 	/* copy up to skb headlen */
-	while (count && (copy > 0)) {
-		size = min_t(unsigned int, copy, from->iov_len - offset);
-		if (copy_from_user(skb->data + offset1, from->iov_base + offset,
-				   size))
-			return -EFAULT;
-		if (copy > size) {
-			++from;
-			--count;
-			offset = 0;
-		} else
-			offset += size;
-		copy -= size;
-		offset1 += size;
-	}
+	if (skb_copy_datagram_from_iovec(skb, 0, from, offset, copy))
+		return -EFAULT;
 
-	if (len == offset1)
+	if (len == copy)
 		return 0;
 
+	offset += copy;
 	while (count--) {
 		struct page *page[MAX_SKB_FRAGS];
 		int num_pages;
 		unsigned long base;
 		unsigned long truesize;
 
-		len = from->iov_len - offset;
-		if (!len) {
-			offset = 0;
+		/* Skip over from offset and copied */
+		if (offset >= from->iov_len) {
+			offset -= from->iov_len;
 			++from;
 			continue;
 		}
+		len = from->iov_len - offset;
 		base = (unsigned long)from->iov_base + offset;
 		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
 		if (i + size > MAX_SKB_FRAGS)
-- 
cgit v1.2.3


From 09effa67a18d893fc4e6f81a3659fc9efef1445e Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 7 Aug 2013 17:11:00 -0700
Subject: packet: Revert recent header parsing changes.

This reverts commits:

0f75b09c798ed00c30d7d5551b896be883bc2aeb
cbd89acb9eb257ed3b2be867142583fdcf7fdc5b
c483e02614551e44ced3fe6eedda8e36d3277ccc

Amongst other things, it's modifies the SKB header
to pull the ethernet headers off via eth_type_trans()
on the output path which is bogus.

It's causing serious regressions for people.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 53 +++++++++++++++++++++++---------------------------
 1 file changed, 24 insertions(+), 29 deletions(-)

(limited to 'net')

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 0c0f6c9a90e7..4cb28a7f639b 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -88,7 +88,6 @@
 #include <linux/virtio_net.h>
 #include <linux/errqueue.h>
 #include <linux/net_tstamp.h>
-#include <linux/if_arp.h>
 
 #ifdef CONFIG_INET
 #include <net/inet_common.h>
@@ -1924,7 +1923,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 		__be16 proto, unsigned char *addr, int hlen)
 {
 	union tpacket_uhdr ph;
-	int to_write, offset, len, tp_len, nr_frags, len_max, max_frame_len;
+	int to_write, offset, len, tp_len, nr_frags, len_max;
 	struct socket *sock = po->sk.sk_socket;
 	struct page *page;
 	void *data;
@@ -1947,6 +1946,10 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 		tp_len = ph.h1->tp_len;
 		break;
 	}
+	if (unlikely(tp_len > size_max)) {
+		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
+		return -EMSGSIZE;
+	}
 
 	skb_reserve(skb, hlen);
 	skb_reset_network_header(skb);
@@ -2002,25 +2005,10 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 		if (unlikely(err))
 			return err;
 
-		if (dev->type == ARPHRD_ETHER)
-			skb->protocol = eth_type_trans(skb, dev);
-
 		data += dev->hard_header_len;
 		to_write -= dev->hard_header_len;
 	}
 
-	max_frame_len = dev->mtu + dev->hard_header_len;
-	if (skb->protocol == htons(ETH_P_8021Q))
-		max_frame_len += VLAN_HLEN;
-
-	if (size_max > max_frame_len)
-		size_max = max_frame_len;
-
-	if (unlikely(tp_len > size_max)) {
-		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
-		return -EMSGSIZE;
-	}
-
 	offset = offset_in_page(data);
 	len_max = PAGE_SIZE - offset;
 	len = ((to_write > len_max) ? len_max : to_write);
@@ -2059,7 +2047,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	struct net_device *dev;
 	__be16 proto;
 	bool need_rls_dev = false;
-	int err;
+	int err, reserve = 0;
 	void *ph;
 	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
 	int tp_len, size_max;
@@ -2092,6 +2080,8 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	if (unlikely(dev == NULL))
 		goto out;
 
+	reserve = dev->hard_header_len;
+
 	err = -ENETDOWN;
 	if (unlikely(!(dev->flags & IFF_UP)))
 		goto out_put;
@@ -2099,6 +2089,9 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	size_max = po->tx_ring.frame_size
 		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));
 
+	if (size_max > dev->mtu + reserve)
+		size_max = dev->mtu + reserve;
+
 	do {
 		ph = packet_current_frame(po, &po->tx_ring,
 				TP_STATUS_SEND_REQUEST);
@@ -2331,20 +2324,22 @@ static int packet_snd(struct socket *sock,
 
 	sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
 
-	if (dev->type == ARPHRD_ETHER) {
-		skb->protocol = eth_type_trans(skb, dev);
-		if (skb->protocol == htons(ETH_P_8021Q))
-			reserve += VLAN_HLEN;
-	} else {
-		skb->protocol = proto;
-		skb->dev = dev;
-	}
-
 	if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
-		err = -EMSGSIZE;
-		goto out_free;
+		/* Earlier code assumed this would be a VLAN pkt,
+		 * double-check this now that we have the actual
+		 * packet in hand.
+		 */
+		struct ethhdr *ehdr;
+		skb_reset_mac_header(skb);
+		ehdr = eth_hdr(skb);
+		if (ehdr->h_proto != htons(ETH_P_8021Q)) {
+			err = -EMSGSIZE;
+			goto out_free;
+		}
 	}
 
+	skb->protocol = proto;
+	skb->dev = dev;
 	skb->priority = sk->sk_priority;
 	skb->mark = sk->sk_mark;
 
-- 
cgit v1.2.3


From 1f07d03e2069df2ecd82301936b598a3b257c6d6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 6 Aug 2013 03:32:11 -0700
Subject: net: add SNMP counters tracking incoming ECN bits

With GRO/LRO processing, there is a problem because Ip[6]InReceives SNMP
counters do not count the number of frames, but number of aggregated
segments.

Its probably too late to change this now.

This patch adds four new counters, tracking number of frames, regardless
of LRO/GRO, and on a per ECN status basis, for IPv4 and IPv6.

Ip[6]NoECTPkts : Number of packets received with NOECT
Ip[6]ECT1Pkts  : Number of packets received with ECT(1)
Ip[6]ECT0Pkts  : Number of packets received with ECT(0)
Ip[6]CEPkts    : Number of packets received with Congestion Experienced

lph37:~# nstat | egrep "Pkts|InReceive"
IpInReceives                    1634137            0.0
Ip6InReceives                   3714107            0.0
Ip6InNoECTPkts                  19205              0.0
Ip6InECT0Pkts                   52651828           0.0
IpExtInNoECTPkts                33630              0.0
IpExtInECT0Pkts                 15581379           0.0
IpExtInCEPkts                   6                  0.0

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/snmp.h | 4 ++++
 net/ipv4/ip_input.c       | 8 ++++++++
 net/ipv4/proc.c           | 7 ++++++-
 net/ipv6/ip6_input.c      | 6 +++++-
 net/ipv6/proc.c           | 4 ++++
 5 files changed, 27 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index af0a674cc677..60601d28da75 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -51,6 +51,10 @@ enum
 	IPSTATS_MIB_INBCASTOCTETS,		/* InBcastOctets */
 	IPSTATS_MIB_OUTBCASTOCTETS,		/* OutBcastOctets */
 	IPSTATS_MIB_CSUMERRORS,			/* InCsumErrors */
+	IPSTATS_MIB_NOECTPKTS,			/* InNoECTPkts */
+	IPSTATS_MIB_ECT1PKTS,			/* InECT1Pkts */
+	IPSTATS_MIB_ECT0PKTS,			/* InECT0Pkts */
+	IPSTATS_MIB_CEPKTS,			/* InCEPkts */
 	__IPSTATS_MIB_MAX
 };
 
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 15e3e683adec..054a3e97d822 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -141,6 +141,7 @@
 #include <net/icmp.h>
 #include <net/raw.h>
 #include <net/checksum.h>
+#include <net/inet_ecn.h>
 #include <linux/netfilter_ipv4.h>
 #include <net/xfrm.h>
 #include <linux/mroute.h>
@@ -410,6 +411,13 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 	if (iph->ihl < 5 || iph->version != 4)
 		goto inhdr_error;
 
+	BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1);
+	BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0);
+	BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE);
+	IP_ADD_STATS_BH(dev_net(dev),
+			IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK),
+			max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
+
 	if (!pskb_may_pull(skb, iph->ihl*4))
 		goto inhdr_error;
 
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 6577a1149a47..5f5fa612647f 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -111,7 +111,7 @@ static const struct snmp_mib snmp4_ipstats_list[] = {
 	SNMP_MIB_SENTINEL
 };
 
-/* Following RFC4293 items are displayed in /proc/net/netstat */
+/* Following items are displayed in /proc/net/netstat */
 static const struct snmp_mib snmp4_ipextstats_list[] = {
 	SNMP_MIB_ITEM("InNoRoutes", IPSTATS_MIB_INNOROUTES),
 	SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS),
@@ -125,7 +125,12 @@ static const struct snmp_mib snmp4_ipextstats_list[] = {
 	SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS),
 	SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS),
 	SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS),
+	/* Non RFC4293 fields */
 	SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS),
+	SNMP_MIB_ITEM("InNoECTPkts", IPSTATS_MIB_NOECTPKTS),
+	SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
+	SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
+	SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 2bab2aa59745..302d6fb1ff2b 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -44,7 +44,7 @@
 #include <net/ip6_route.h>
 #include <net/addrconf.h>
 #include <net/xfrm.h>
-
+#include <net/inet_ecn.h>
 
 
 int ip6_rcv_finish(struct sk_buff *skb)
@@ -109,6 +109,10 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
 	if (hdr->version != 6)
 		goto err;
 
+	IP6_ADD_STATS_BH(dev_net(dev), idev,
+			 IPSTATS_MIB_NOECTPKTS +
+				(ipv6_get_dsfield(hdr) & INET_ECN_MASK),
+			 max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
 	/*
 	 * RFC4291 2.5.3
 	 * A packet received on an interface with a destination address
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 51c3285b5d9b..091d066a57b3 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -91,6 +91,10 @@ static const struct snmp_mib snmp6_ipstats_list[] = {
 	SNMP_MIB_ITEM("Ip6InBcastOctets", IPSTATS_MIB_INBCASTOCTETS),
 	SNMP_MIB_ITEM("Ip6OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS),
 	/* IPSTATS_MIB_CSUMERRORS is not relevant in IPv6 (no checksum) */
+	SNMP_MIB_ITEM("Ip6InNoECTPkts", IPSTATS_MIB_NOECTPKTS),
+	SNMP_MIB_ITEM("Ip6InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
+	SNMP_MIB_ITEM("Ip6InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
+	SNMP_MIB_ITEM("Ip6InCEPkts", IPSTATS_MIB_CEPKTS),
 	SNMP_MIB_SENTINEL
 };
 
-- 
cgit v1.2.3


From c655bc6896b94ee0223393f26155c6daf1e2d148 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 29 Jul 2013 15:41:55 +0200
Subject: netfilter: nf_conntrack: don't send destroy events from iterator

Let nf_ct_delete handle delivery of the DESTROY event.

Based on earlier patch from Pablo Neira.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack.h |  4 +++-
 net/ipv4/netfilter/ipt_MASQUERADE.c  |  2 +-
 net/ipv6/netfilter/ip6t_MASQUERADE.c |  2 +-
 net/netfilter/nf_conntrack_core.c    | 36 ++++--------------------------------
 net/netfilter/nf_conntrack_proto.c   |  4 ++--
 net/netfilter/nf_nat_core.c          |  6 +++---
 6 files changed, 14 insertions(+), 40 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index e5eb8b62538c..0c1288a50e8b 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -248,7 +248,9 @@ extern void nf_ct_untracked_status_or(unsigned long bits);
 
 /* Iterate over all conntracks: if iter returns true, it's deleted. */
 extern void
-nf_ct_iterate_cleanup(struct net *net, int (*iter)(struct nf_conn *i, void *data), void *data);
+nf_ct_iterate_cleanup(struct net *net,
+		      int (*iter)(struct nf_conn *i, void *data),
+		      void *data, u32 portid, int report);
 extern void nf_conntrack_free(struct nf_conn *ct);
 extern struct nf_conn *
 nf_conntrack_alloc(struct net *net, u16 zone,
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 30e4de940567..00352ce0f0de 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -118,7 +118,7 @@ static int masq_device_event(struct notifier_block *this,
 		NF_CT_ASSERT(dev->ifindex != 0);
 
 		nf_ct_iterate_cleanup(net, device_cmp,
-				      (void *)(long)dev->ifindex);
+				      (void *)(long)dev->ifindex, 0, 0);
 	}
 
 	return NOTIFY_DONE;
diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c
index 47bff6107519..3e4e92d5e157 100644
--- a/net/ipv6/netfilter/ip6t_MASQUERADE.c
+++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c
@@ -76,7 +76,7 @@ static int masq_device_event(struct notifier_block *this,
 
 	if (event == NETDEV_DOWN)
 		nf_ct_iterate_cleanup(net, device_cmp,
-				      (void *)(long)dev->ifindex);
+				      (void *)(long)dev->ifindex, 0, 0);
 
 	return NOTIFY_DONE;
 }
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 0934611ff9f3..da6f1787a102 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1246,7 +1246,7 @@ found:
 
 void nf_ct_iterate_cleanup(struct net *net,
 			   int (*iter)(struct nf_conn *i, void *data),
-			   void *data)
+			   void *data, u32 portid, int report)
 {
 	struct nf_conn *ct;
 	unsigned int bucket = 0;
@@ -1254,7 +1254,7 @@ void nf_ct_iterate_cleanup(struct net *net,
 	while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
 		/* Time to push up daises... */
 		if (del_timer(&ct->timeout))
-			death_by_timeout((unsigned long)ct);
+			nf_ct_delete(ct, portid, report);
 
 		/* ... else the timer will get him soon. */
 
@@ -1263,30 +1263,6 @@ void nf_ct_iterate_cleanup(struct net *net,
 }
 EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);
 
-struct __nf_ct_flush_report {
-	u32 portid;
-	int report;
-};
-
-static int kill_report(struct nf_conn *i, void *data)
-{
-	struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data;
-	struct nf_conn_tstamp *tstamp;
-
-	tstamp = nf_conn_tstamp_find(i);
-	if (tstamp && tstamp->stop == 0)
-		tstamp->stop = ktime_to_ns(ktime_get_real());
-
-	/* If we fail to deliver the event, death_by_timeout() will retry */
-	if (nf_conntrack_event_report(IPCT_DESTROY, i,
-				      fr->portid, fr->report) < 0)
-		return 1;
-
-	/* Avoid the delivery of the destroy event in death_by_timeout(). */
-	set_bit(IPS_DYING_BIT, &i->status);
-	return 1;
-}
-
 static int kill_all(struct nf_conn *i, void *data)
 {
 	return 1;
@@ -1304,11 +1280,7 @@ EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);
 
 void nf_conntrack_flush_report(struct net *net, u32 portid, int report)
 {
-	struct __nf_ct_flush_report fr = {
-		.portid	= portid,
-		.report = report,
-	};
-	nf_ct_iterate_cleanup(net, kill_report, &fr);
+	nf_ct_iterate_cleanup(net, kill_all, NULL, portid, report);
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_flush_report);
 
@@ -1389,7 +1361,7 @@ void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
 i_see_dead_people:
 	busy = 0;
 	list_for_each_entry(net, net_exit_list, exit_list) {
-		nf_ct_iterate_cleanup(net, kill_all, NULL);
+		nf_ct_iterate_cleanup(net, kill_all, NULL, 0, 0);
 		nf_ct_release_dying_list(net);
 		if (atomic_read(&net->ct.count) != 0)
 			busy = 1;
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 0ab9636ac57e..ce3004156eeb 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -281,7 +281,7 @@ void nf_ct_l3proto_pernet_unregister(struct net *net,
 	nf_ct_l3proto_unregister_sysctl(net, proto);
 
 	/* Remove all contrack entries for this protocol */
-	nf_ct_iterate_cleanup(net, kill_l3proto, proto);
+	nf_ct_iterate_cleanup(net, kill_l3proto, proto, 0, 0);
 }
 EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_unregister);
 
@@ -476,7 +476,7 @@ void nf_ct_l4proto_pernet_unregister(struct net *net,
 	nf_ct_l4proto_unregister_sysctl(net, pn, l4proto);
 
 	/* Remove all contrack entries for this protocol */
-	nf_ct_iterate_cleanup(net, kill_l4proto, l4proto);
+	nf_ct_iterate_cleanup(net, kill_l4proto, l4proto, 0, 0);
 }
 EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister);
 
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 038eee5c8f85..6ff808375b5e 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -497,7 +497,7 @@ static void nf_nat_l4proto_clean(u8 l3proto, u8 l4proto)
 
 	rtnl_lock();
 	for_each_net(net)
-		nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean);
+		nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean, 0, 0);
 	rtnl_unlock();
 }
 
@@ -511,7 +511,7 @@ static void nf_nat_l3proto_clean(u8 l3proto)
 	rtnl_lock();
 
 	for_each_net(net)
-		nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean);
+		nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean, 0, 0);
 	rtnl_unlock();
 }
 
@@ -749,7 +749,7 @@ static void __net_exit nf_nat_net_exit(struct net *net)
 {
 	struct nf_nat_proto_clean clean = {};
 
-	nf_ct_iterate_cleanup(net, &nf_nat_proto_remove, &clean);
+	nf_ct_iterate_cleanup(net, &nf_nat_proto_remove, &clean, 0, 0);
 	synchronize_rcu();
 	nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size);
 }
-- 
cgit v1.2.3


From 0670307992a69a2028e01e6a5fb5f4ab23167954 Mon Sep 17 00:00:00 2001
From: Chun-Yeow Yeoh <yeohchunyeow@cozybit.com>
Date: Tue, 6 Aug 2013 15:39:56 -0700
Subject: mac80211: allow lowest basic rate for unicast management for mesh

Allow lowest basic rate to be used for unicast management frame in
mesh. Otherwise, the lowest supported rate is used for unicast
management frame, such as 1Mbps for 2.4GHz and 6Mbps for 5GHz. Rename
the rc_send_low_broadcast to re_send_low_basicrate since now it is
also applied to unicast management frame in mesh.

Signed-off-by: Chun-Yeow Yeoh <yeohchunyeow@cozybit.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rate.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c
index ba63ac851c2b..e126605cec66 100644
--- a/net/mac80211/rate.c
+++ b/net/mac80211/rate.c
@@ -210,7 +210,7 @@ static bool rc_no_data_or_no_ack_use_min(struct ieee80211_tx_rate_control *txrc)
 		!ieee80211_is_data(fc);
 }
 
-static void rc_send_low_broadcast(s8 *idx, u32 basic_rates,
+static void rc_send_low_basicrate(s8 *idx, u32 basic_rates,
 				  struct ieee80211_supported_band *sband)
 {
 	u8 i;
@@ -263,28 +263,37 @@ static void __rate_control_send_low(struct ieee80211_hw *hw,
 }
 
 
-bool rate_control_send_low(struct ieee80211_sta *sta,
+bool rate_control_send_low(struct ieee80211_sta *pubsta,
 			   void *priv_sta,
 			   struct ieee80211_tx_rate_control *txrc)
 {
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb);
 	struct ieee80211_supported_band *sband = txrc->sband;
+	struct sta_info *sta;
 	int mcast_rate;
+	bool use_basicrate = false;
 
-	if (!sta || !priv_sta || rc_no_data_or_no_ack_use_min(txrc)) {
-		__rate_control_send_low(txrc->hw, sband, sta, info);
+	if (!pubsta || !priv_sta || rc_no_data_or_no_ack_use_min(txrc)) {
+		__rate_control_send_low(txrc->hw, sband, pubsta, info);
 
-		if (!sta && txrc->bss) {
+		if (!pubsta && txrc->bss) {
 			mcast_rate = txrc->bss_conf->mcast_rate[sband->band];
 			if (mcast_rate > 0) {
 				info->control.rates[0].idx = mcast_rate - 1;
 				return true;
 			}
+			use_basicrate = true;
+		} else if (pubsta) {
+			sta = container_of(pubsta, struct sta_info, sta);
+			if (ieee80211_vif_is_mesh(&sta->sdata->vif))
+				use_basicrate = true;
+		}
 
-			rc_send_low_broadcast(&info->control.rates[0].idx,
+		if (use_basicrate)
+			rc_send_low_basicrate(&info->control.rates[0].idx,
 					      txrc->bss_conf->basic_rates,
 					      sband);
-		}
+
 		return true;
 	}
 	return false;
-- 
cgit v1.2.3


From ab1e8ad3b463fd15b99a9b3980ec0f84294f6207 Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <simon.wunderlich@s2003.tu-chemnitz.de>
Date: Fri, 2 Aug 2013 17:38:01 +0200
Subject: mac80211: fix ieee80211_sta_process_chanswitch for 5/10 MHz channels

Signed-off-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: Mathias Kretschmer <mathias.kretschmer@fokus.fraunhofer.de>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/mlme.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 211246b46819..45a87ee3f124 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1102,6 +1102,15 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
 	case -1:
 		cfg80211_chandef_create(&new_chandef, new_chan,
 					NL80211_CHAN_NO_HT);
+		/* keep width for 5/10 MHz channels */
+		switch (sdata->vif.bss_conf.chandef.width) {
+		case NL80211_CHAN_WIDTH_5:
+		case NL80211_CHAN_WIDTH_10:
+			new_chandef.width = sdata->vif.bss_conf.chandef.width;
+			break;
+		default:
+			break;
+		}
 		break;
 	}
 
-- 
cgit v1.2.3


From cab70040dfd95ee32144f02fade64f0cb94f31a0 Mon Sep 17 00:00:00 2001
From: William Manley <william.manley@youview.com>
Date: Tue, 6 Aug 2013 19:03:13 +0100
Subject: net: igmp: Reduce Unsolicited report interval to 1s when using IGMPv3

If an IGMP join packet is lost you will not receive data sent to the
multicast group so if no data arrives from that multicast group in a
period of time after the IGMP join a second IGMP join will be sent.  The
delay between joins is the "IGMP Unsolicited Report Interval".

Previously this value was hard coded to be chosen randomly between 0-10s.
This can be too long for some use-cases, such as IPTV as it can cause
channel change to be slow in the presence of packet loss.

The value 10s has come from IGMPv2 RFC2236, which was reduced to 1s in
IGMPv3 RFC3376.  This patch makes the kernel use the 1s value from the
later RFC if we are operating in IGMPv3 mode.  IGMPv2 behaviour is
unaffected.

Tested with Wireshark and a simple program to join a (non-existent)
multicast group.  The distribution of timings for the second join differ
based upon setting /proc/sys/net/ipv4/conf/eth0/force_igmp_version.

Signed-off-by: William Manley <william.manley@youview.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Benjamin LaHaise <bcrl@kvack.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/igmp.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index ef7618630f36..ceb19ab9cf9d 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -114,7 +114,8 @@
 
 #define IGMP_V1_Router_Present_Timeout		(400*HZ)
 #define IGMP_V2_Router_Present_Timeout		(400*HZ)
-#define IGMP_Unsolicited_Report_Interval	(10*HZ)
+#define IGMP_V2_Unsolicited_Report_Interval	(10*HZ)
+#define IGMP_V3_Unsolicited_Report_Interval	(1*HZ)
 #define IGMP_Query_Response_Interval		(10*HZ)
 #define IGMP_Unsolicited_Report_Count		2
 
@@ -139,6 +140,14 @@
 	 ((in_dev)->mr_v2_seen && \
 	  time_before(jiffies, (in_dev)->mr_v2_seen)))
 
+static int unsolicited_report_interval(struct in_device *in_dev)
+{
+	if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
+		return IGMP_V2_Unsolicited_Report_Interval;
+	else /* v3 */
+		return IGMP_V3_Unsolicited_Report_Interval;
+}
+
 static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im);
 static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr);
 static void igmpv3_clear_delrec(struct in_device *in_dev);
@@ -722,7 +731,8 @@ static void igmp_ifc_timer_expire(unsigned long data)
 	igmpv3_send_cr(in_dev);
 	if (in_dev->mr_ifc_count) {
 		in_dev->mr_ifc_count--;
-		igmp_ifc_start_timer(in_dev, IGMP_Unsolicited_Report_Interval);
+		igmp_ifc_start_timer(in_dev,
+				     unsolicited_report_interval(in_dev));
 	}
 	__in_dev_put(in_dev);
 }
@@ -747,7 +757,7 @@ static void igmp_timer_expire(unsigned long data)
 
 	if (im->unsolicit_count) {
 		im->unsolicit_count--;
-		igmp_start_timer(im, IGMP_Unsolicited_Report_Interval);
+		igmp_start_timer(im, unsolicited_report_interval(in_dev));
 	}
 	im->reporter = 1;
 	spin_unlock(&im->lock);
-- 
cgit v1.2.3


From 5c6fe01c1fe3140a8fb088d2a9c32548b731c35e Mon Sep 17 00:00:00 2001
From: William Manley <william.manley@youview.com>
Date: Tue, 6 Aug 2013 19:03:14 +0100
Subject: net: igmp: Don't flush routing cache when force_igmp_version is
 modified

The procfs knob /proc/sys/net/ipv4/conf/*/force_igmp_version allows the
IGMP protocol version to use to be explicitly set.  As a side effect this
caused the routing cache to be flushed as it was declared as a
DEVINET_SYSCTL_FLUSHING_ENTRY.  Flushing is unnecessary and this patch
makes it so flushing does not occur.

Requested by Hannes Frederic Sowa as he was reviewing other patches
adding procfs entries.

Suggested-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: William Manley <william.manley@youview.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Benjamin LaHaise <bcrl@kvack.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/inetdevice.h | 2 +-
 net/ipv4/devinet.c         | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index b99cd23f3474..0a4a6cb35338 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -27,9 +27,9 @@ enum
 	IPV4_DEVCONF_TAG,
 	IPV4_DEVCONF_ARPFILTER,
 	IPV4_DEVCONF_MEDIUM_ID,
+	IPV4_DEVCONF_FORCE_IGMP_VERSION,
 	IPV4_DEVCONF_NOXFRM,
 	IPV4_DEVCONF_NOPOLICY,
-	IPV4_DEVCONF_FORCE_IGMP_VERSION,
 	IPV4_DEVCONF_ARP_ANNOUNCE,
 	IPV4_DEVCONF_ARP_IGNORE,
 	IPV4_DEVCONF_PROMOTE_SECONDARIES,
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 43923dc77440..87d47ce9fa33 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -2094,11 +2094,11 @@ static struct devinet_sysctl_table {
 		DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"),
 		DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"),
 		DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"),
+		DEVINET_SYSCTL_RW_ENTRY(FORCE_IGMP_VERSION,
+					"force_igmp_version"),
 
 		DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
 		DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
-		DEVINET_SYSCTL_FLUSHING_ENTRY(FORCE_IGMP_VERSION,
-					      "force_igmp_version"),
 		DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES,
 					      "promote_secondaries"),
 		DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET,
-- 
cgit v1.2.3


From 2690048c01f32bf45d1c1e1ab3079bc10ad2aea7 Mon Sep 17 00:00:00 2001
From: William Manley <william.manley@youview.com>
Date: Tue, 6 Aug 2013 19:03:15 +0100
Subject: net: igmp: Allow user-space configuration of igmp unsolicited report
 interval

Adds the new procfs knobs:

    /proc/sys/net/ipv4/conf/*/igmpv2_unsolicited_report_interval
    /proc/sys/net/ipv4/conf/*/igmpv3_unsolicited_report_interval

Which will allow userspace configuration of the IGMP unsolicited report
interval (see below) in milliseconds.  The defaults are 10000ms for IGMPv2
and 1000ms for IGMPv3 in accordance with RFC2236 and RFC3376.

Background:

If an IGMP join packet is lost you will not receive data sent to the
multicast group so if no data arrives from that multicast group in a
period of time after the IGMP join a second IGMP join will be sent.  The
delay between joins is the "IGMP Unsolicited Report Interval".

Prior to this patch this value was hard coded in the kernel to 10s for
IGMPv2 and 1s for IGMPv3.  10s is unsuitable for some use-cases, such as
IPTV as it can cause channel change to be slow in the presence of packet
loss.

This patch allows the value to be overridden from userspace for both
IGMPv2 and IGMPv3 such that it can be tuned accoding to the network.

Tested with Wireshark and a simple program to join a (non-existent)
multicast group.  The distribution of timings for the second join differ
based upon setting the procfs knobs.

igmpvX_unsolicited_report_interval is intended to follow the pattern
established by force_igmp_version, and while a procfs entry has been added
a corresponding sysctl knob has not as it is my understanding that sysctl
is deprecated[1].

[1]: http://lwn.net/Articles/247243/

Signed-off-by: William Manley <william.manley@youview.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Benjamin LaHaise <bcrl@kvack.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/inetdevice.h |  2 ++
 net/ipv4/devinet.c         |  8 ++++++++
 net/ipv4/igmp.c            | 19 +++++++++++++++++--
 3 files changed, 27 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index 0a4a6cb35338..c796ce26c7c0 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -28,6 +28,8 @@ enum
 	IPV4_DEVCONF_ARPFILTER,
 	IPV4_DEVCONF_MEDIUM_ID,
 	IPV4_DEVCONF_FORCE_IGMP_VERSION,
+	IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL,
+	IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL,
 	IPV4_DEVCONF_NOXFRM,
 	IPV4_DEVCONF_NOPOLICY,
 	IPV4_DEVCONF_ARP_ANNOUNCE,
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 87d47ce9fa33..a1b5bcbd04ae 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -73,6 +73,8 @@ static struct ipv4_devconf ipv4_devconf = {
 		[IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,
 		[IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
 		[IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
+		[IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/,
+		[IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] =  1000 /*ms*/,
 	},
 };
 
@@ -83,6 +85,8 @@ static struct ipv4_devconf ipv4_devconf_dflt = {
 		[IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
 		[IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
 		[IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1,
+		[IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/,
+		[IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] =  1000 /*ms*/,
 	},
 };
 
@@ -2096,6 +2100,10 @@ static struct devinet_sysctl_table {
 		DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"),
 		DEVINET_SYSCTL_RW_ENTRY(FORCE_IGMP_VERSION,
 					"force_igmp_version"),
+		DEVINET_SYSCTL_RW_ENTRY(IGMPV2_UNSOLICITED_REPORT_INTERVAL,
+					"igmpv2_unsolicited_report_interval"),
+		DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL,
+					"igmpv3_unsolicited_report_interval"),
 
 		DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
 		DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index ceb19ab9cf9d..d6c0e64ec97f 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -142,10 +142,25 @@
 
 static int unsolicited_report_interval(struct in_device *in_dev)
 {
+	int interval_ms, interval_jiffies;
+
 	if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
-		return IGMP_V2_Unsolicited_Report_Interval;
+		interval_ms = IN_DEV_CONF_GET(
+			in_dev,
+			IGMPV2_UNSOLICITED_REPORT_INTERVAL);
 	else /* v3 */
-		return IGMP_V3_Unsolicited_Report_Interval;
+		interval_ms = IN_DEV_CONF_GET(
+			in_dev,
+			IGMPV3_UNSOLICITED_REPORT_INTERVAL);
+
+	interval_jiffies = msecs_to_jiffies(interval_ms);
+
+	/* _timer functions can't handle a delay of 0 jiffies so ensure
+	 *  we always return a positive value.
+	 */
+	if (interval_jiffies <= 0)
+		interval_jiffies = 1;
+	return interval_jiffies;
 }
 
 static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im);
-- 
cgit v1.2.3


From cda5f98e36576596b9230483ec52bff3cc97eb21 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Tue, 6 Aug 2013 21:18:12 +0200
Subject: net: sctp: convert sctp_checksum_disable module param into sctp
 sysctl

Get rid of the last module parameter for SCTP and make this
configurable via sysctl for SCTP like all the rest of SCTP's
configuration knobs.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  8 ++++++++
 include/net/netns/sctp.h               |  3 +++
 include/net/sctp/structs.h             |  5 -----
 net/sctp/input.c                       |  4 ++--
 net/sctp/output.c                      |  5 ++++-
 net/sctp/protocol.c                    |  5 +++--
 net/sctp/sysctl.c                      | 10 +++++++++-
 7 files changed, 29 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 36be26b2ef7a..1b27b0b8687f 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1507,6 +1507,14 @@ sack_timeout - INTEGER
 
 	Default: 200
 
+checksum_disable - BOOLEAN
+	Disable SCTP checksum computing and verification for debugging purpose.
+
+	1: Disable checksumming
+	0: Enable checksumming
+
+	Default: 0
+
 valid_cookie_life - INTEGER
 	The default lifetime of the SCTP cookie (in milliseconds).  The cookie
 	is used during association establishment.
diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
index 3573a81815ad..ebfdf1e7d402 100644
--- a/include/net/netns/sctp.h
+++ b/include/net/netns/sctp.h
@@ -129,6 +129,9 @@ struct netns_sctp {
 
 	/* Threshold for autoclose timeout, in seconds. */
 	unsigned long max_autoclose;
+
+	/* Flag to disable SCTP checksumming. */
+	int checksum_disable;
 };
 
 #endif /* __NETNS_SCTP_H__ */
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index d9c93a77b1a9..06ebeaaaa9aa 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -141,10 +141,6 @@ extern struct sctp_globals {
 	/* This is the sctp port control hash.	*/
 	int port_hashsize;
 	struct sctp_bind_hashbucket *port_hashtable;
-
-	/* Flag to indicate whether computing and verifying checksum
-	 * is disabled. */
-        bool checksum_disable;
 } sctp_globals;
 
 #define sctp_max_instreams		(sctp_globals.max_instreams)
@@ -156,7 +152,6 @@ extern struct sctp_globals {
 #define sctp_assoc_hashtable		(sctp_globals.assoc_hashtable)
 #define sctp_port_hashsize		(sctp_globals.port_hashsize)
 #define sctp_port_hashtable		(sctp_globals.port_hashtable)
-#define sctp_checksum_disable		(sctp_globals.checksum_disable)
 
 /* SCTP Socket type: UDP or TCP style. */
 typedef enum {
diff --git a/net/sctp/input.c b/net/sctp/input.c
index fa91aff02388..b9a25e18bb2b 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -140,8 +140,8 @@ int sctp_rcv(struct sk_buff *skb)
 	__skb_pull(skb, skb_transport_offset(skb));
 	if (skb->len < sizeof(struct sctphdr))
 		goto discard_it;
-	if (!sctp_checksum_disable && !skb_csum_unnecessary(skb) &&
-		  sctp_rcv_checksum(net, skb) < 0)
+	if (!net->sctp.checksum_disable && !skb_csum_unnecessary(skb) &&
+	    sctp_rcv_checksum(net, skb) < 0)
 		goto discard_it;
 
 	skb_pull(skb, sizeof(struct sctphdr));
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 5a55c55d71ad..cdb5f4914e17 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -395,6 +395,7 @@ int sctp_packet_transmit(struct sctp_packet *packet)
 	int padding;		/* How much padding do we need?  */
 	__u8 has_data = 0;
 	struct dst_entry *dst = tp->dst;
+	struct net *net;
 	unsigned char *auth = NULL;	/* pointer to auth in skb data */
 	__u32 cksum_buf_len = sizeof(struct sctphdr);
 
@@ -541,7 +542,9 @@ int sctp_packet_transmit(struct sctp_packet *packet)
 	 * Note: Adler-32 is no longer applicable, as has been replaced
 	 * by CRC32-C as described in <draft-ietf-tsvwg-sctpcsum-02.txt>.
 	 */
-	if (!sctp_checksum_disable) {
+	net = dev_net(dst->dev);
+
+	if (!net->sctp.checksum_disable) {
 		if (!(dst->dev->features & NETIF_F_SCTP_CSUM)) {
 			__u32 crc32 = sctp_start_cksum((__u8 *)sh, cksum_buf_len);
 
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index b52ec2510101..a570a6365f87 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1193,6 +1193,9 @@ static int __net_init sctp_net_init(struct net *net)
 	/* Whether Cookie Preservative is enabled(1) or not(0) */
 	net->sctp.cookie_preserve_enable 	= 1;
 
+	/* Whether SCTP checksumming is disabled(1) or not(0) */
+	net->sctp.checksum_disable		= 0;
+
 	/* Default sctp sockets to use md5 as their hmac alg */
 #if defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5)
 	net->sctp.sctp_hmac_alg			= "md5";
@@ -1549,6 +1552,4 @@ MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-132");
 MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-132");
 MODULE_AUTHOR("Linux Kernel SCTP developers <linux-sctp@vger.kernel.org>");
 MODULE_DESCRIPTION("Support for the SCTP protocol (RFC2960)");
-module_param_named(no_checksums, sctp_checksum_disable, bool, 0644);
-MODULE_PARM_DESC(no_checksums, "Disable checksums computing and verification");
 MODULE_LICENSE("GPL");
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 190674702b20..754809a7183d 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -296,7 +296,15 @@ static struct ctl_table sctp_net_table[] = {
 		.extra1		= &max_autoclose_min,
 		.extra2		= &max_autoclose_max,
 	},
-
+	{
+		.procname	= "checksum_disable",
+		.data		= &init_net.sctp.checksum_disable,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
 	{ /* sentinel */ }
 };
 
-- 
cgit v1.2.3


From 477143e3fece3dc12629bb1ebd7b47e8e6e72b2b Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Tue, 6 Aug 2013 21:18:13 +0200
Subject: net: sctp: trivial: update bug report in header comment

With the restructuring of the lksctp.org site, we only allow bug
reports through the SCTP mailing list linux-sctp@vger.kernel.org,
not via SF, as SF is only used for web hosting and nothing more.
While at it, also remove the obvious statement that bugs will be
fixed and incooperated into the kernel.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Acked-by: Vlad Yasevich <vyasevich@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/auth.h      |  6 ------
 include/net/sctp/checksum.h  |  6 ------
 include/net/sctp/command.h   | 18 ++++++++----------
 include/net/sctp/constants.h |  6 ------
 include/net/sctp/sctp.h      |  6 ------
 include/net/sctp/sm.h        |  6 ------
 include/net/sctp/structs.h   |  6 ------
 include/net/sctp/tsnmap.h    |  6 ------
 include/net/sctp/ulpevent.h  |  6 ------
 include/net/sctp/ulpqueue.h  |  6 ------
 net/sctp/associola.c         |  6 ------
 net/sctp/auth.c              |  6 ------
 net/sctp/bind_addr.c         |  6 ------
 net/sctp/chunk.c             |  6 ------
 net/sctp/command.c           |  6 ------
 net/sctp/debug.c             |  6 ------
 net/sctp/endpointola.c       |  6 ------
 net/sctp/input.c             |  6 ------
 net/sctp/inqueue.c           |  6 ------
 net/sctp/ipv6.c              |  6 ------
 net/sctp/objcnt.c            |  6 ------
 net/sctp/output.c            |  6 ------
 net/sctp/outqueue.c          |  6 ------
 net/sctp/primitive.c         |  6 ------
 net/sctp/proc.c              |  6 ------
 net/sctp/protocol.c          |  6 ------
 net/sctp/sm_make_chunk.c     |  6 ------
 net/sctp/sm_sideeffect.c     |  6 ------
 net/sctp/sm_statefuns.c      |  6 ------
 net/sctp/sm_statetable.c     |  6 ------
 net/sctp/socket.c            |  6 ------
 net/sctp/ssnmap.c            |  6 ------
 net/sctp/sysctl.c            |  6 ------
 net/sctp/transport.c         |  6 ------
 net/sctp/tsnmap.c            |  6 ------
 net/sctp/ulpevent.c          |  6 ------
 net/sctp/ulpqueue.c          |  6 ------
 37 files changed, 8 insertions(+), 226 deletions(-)

(limited to 'net')

diff --git a/include/net/sctp/auth.h b/include/net/sctp/auth.h
index 754c511f4cf9..aa80bef3c9d5 100644
--- a/include/net/sctp/auth.h
+++ b/include/net/sctp/auth.h
@@ -24,14 +24,8 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *   Vlad Yasevich     <vladislav.yasevich@hp.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #ifndef __sctp_auth_h__
diff --git a/include/net/sctp/checksum.h b/include/net/sctp/checksum.h
index 483e6303458d..259924d63ba6 100644
--- a/include/net/sctp/checksum.h
+++ b/include/net/sctp/checksum.h
@@ -27,9 +27,6 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    Dinakaran Joseph
  *    Jon Grimm <jgrimm@us.ibm.com>
@@ -37,9 +34,6 @@
  *
  * Rewritten to use libcrc32c by:
  *    Vlad Yasevich <vladislav.yasevich@hp.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #ifndef __sctp_checksum_h__
diff --git a/include/net/sctp/command.h b/include/net/sctp/command.h
index 35247271e557..832f2191489c 100644
--- a/include/net/sctp/command.h
+++ b/include/net/sctp/command.h
@@ -23,19 +23,17 @@
  * the Free Software Foundation, 59 Temple Place - Suite 330,
  * Boston, MA 02111-1307, USA.
  *
- * Please send any bug reports or fixes you make to one of the
- * following email addresses:
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * La Monte H.P. Yarroll <piggy@acm.org>
- * Karl Knutson <karl@athena.chicago.il.us>
- * Ardelle Fan <ardelle.fan@intel.com>
- * Sridhar Samudrala <sri@us.ibm.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
+ * Written or modified by:
+ *   La Monte H.P. Yarroll <piggy@acm.org>
+ *   Karl Knutson <karl@athena.chicago.il.us>
+ *   Ardelle Fan <ardelle.fan@intel.com>
+ *   Sridhar Samudrala <sri@us.ibm.com>
  */
 
-
 #ifndef __net_sctp_command_h__
 #define __net_sctp_command_h__
 
diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index a1e7aa5c8bbf..2f0a565a0fd5 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -27,9 +27,6 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *   La Monte H.P. Yarroll <piggy@acm.org>
  *   Karl Knutson          <karl@athena.chicago.il.us>
@@ -39,9 +36,6 @@
  *   Xingang Guo           <xingang.guo@intel.com>
  *   Sridhar Samudrala     <samudrala@us.ibm.com>
  *   Daisy Chang           <daisyc@us.ibm.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #ifndef __sctp_constants_h__
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index cb28df9c2781..3794c5ad20fe 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -29,9 +29,6 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    La Monte H.P. Yarroll <piggy@acm.org>
  *    Xingang Guo           <xingang.guo@intel.com>
@@ -41,9 +38,6 @@
  *    Ardelle Fan           <ardelle.fan@intel.com>
  *    Ryan Layer            <rmlayer@us.ibm.com>
  *    Kevin Gao             <kevin.gao@intel.com> 
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #ifndef __net_sctp_h__
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 2aa66dda4a5a..4ef75af340b6 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -29,9 +29,6 @@
  * email addresses:
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    La Monte H.P. Yarroll <piggy@acm.org>
  *    Karl Knutson <karl@athena.chicago.il.us>
@@ -42,9 +39,6 @@
  *    Daisy Chang <daisyc@us.ibm.com>
  *    Ardelle Fan <ardelle.fan@intel.com>
  *    Kevin Gao <kevin.gao@intel.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #include <linux/types.h>
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 06ebeaaaa9aa..78eedf0cd4b3 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -27,9 +27,6 @@
  * email addresses:
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    Randall Stewart	    <randall@sctp.chicago.il.us>
  *    Ken Morneau	    <kmorneau@cisco.com>
@@ -46,9 +43,6 @@
  *    Ryan Layer	    <rmlayer@us.ibm.com>
  *    Anup Pemmaiah	    <pemmaiah@cc.usu.edu>
  *    Kevin Gao             <kevin.gao@intel.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #ifndef __sctp_structs_h__
diff --git a/include/net/sctp/tsnmap.h b/include/net/sctp/tsnmap.h
index c361d6f7e0cb..54bbbe547303 100644
--- a/include/net/sctp/tsnmap.h
+++ b/include/net/sctp/tsnmap.h
@@ -30,17 +30,11 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *   Jon Grimm             <jgrimm@us.ibm.com>
  *   La Monte H.P. Yarroll <piggy@acm.org>
  *   Karl Knutson          <karl@athena.chicago.il.us>
  *   Sridhar Samudrala     <sri@us.ibm.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 #include <net/sctp/constants.h>
 
diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h
index 34f0d34d22ce..27b9f5c90153 100644
--- a/include/net/sctp/ulpevent.h
+++ b/include/net/sctp/ulpevent.h
@@ -33,17 +33,11 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *   Jon Grimm             <jgrimm@us.ibm.com>
  *   La Monte H.P. Yarroll <piggy@acm.org>
  *   Karl Knutson          <karl@athena.chicago.il.us>
  *   Sridhar Samudrala     <sri@us.ibm.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #ifndef __sctp_ulpevent_h__
diff --git a/include/net/sctp/ulpqueue.h b/include/net/sctp/ulpqueue.h
index add3237a9fdb..b0cf5d54d717 100644
--- a/include/net/sctp/ulpqueue.h
+++ b/include/net/sctp/ulpqueue.h
@@ -32,16 +32,10 @@
  * email addresses:
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *   Jon Grimm             <jgrimm@us.ibm.com>
  *   La Monte H.P. Yarroll <piggy@acm.org>
  *   Sridhar Samudrala     <sri@us.ibm.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #ifndef __sctp_ulpqueue_h__
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index e425ba0bd79e..e051920816e6 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -30,9 +30,6 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    La Monte H.P. Yarroll <piggy@acm.org>
  *    Karl Knutson          <karl@athena.chicago.il.us>
@@ -43,9 +40,6 @@
  *    Daisy Chang	    <daisyc@us.ibm.com>
  *    Ryan Layer	    <rmlayer@us.ibm.com>
  *    Kevin Gao             <kevin.gao@intel.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index 3aab967081be..8c4fa5dec824 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -24,14 +24,8 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *   Vlad Yasevich     <vladislav.yasevich@hp.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #include <linux/slab.h>
diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
index f34ce8bc1395..077bb070052b 100644
--- a/net/sctp/bind_addr.c
+++ b/net/sctp/bind_addr.c
@@ -29,17 +29,11 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    La Monte H.P. Yarroll <piggy@acm.org>
  *    Karl Knutson          <karl@athena.chicago.il.us>
  *    Jon Grimm             <jgrimm@us.ibm.com>
  *    Daisy Chang           <daisyc@us.ibm.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #include <linux/types.h>
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index b50b90c16a86..bd0bdd0ba8f1 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -26,15 +26,9 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    Jon Grimm             <jgrimm@us.ibm.com>
  *    Sridhar Samudrala     <sri@us.ibm.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/net/sctp/command.c b/net/sctp/command.c
index f4bebdadf007..3d9a9ff69c03 100644
--- a/net/sctp/command.c
+++ b/net/sctp/command.c
@@ -27,15 +27,9 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    La Monte H.P. Yarroll <piggy@acm.org>
  *    Karl Knutson <karl@athena.chicago.il.us>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #include <linux/types.h>
diff --git a/net/sctp/debug.c b/net/sctp/debug.c
index 44aa4e7543a9..e89015d8935a 100644
--- a/net/sctp/debug.c
+++ b/net/sctp/debug.c
@@ -30,9 +30,6 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    La Monte H.P. Yarroll <piggy@acm.org>
  *    Karl Knutson          <karl@athena.chicago.il.us>
@@ -40,9 +37,6 @@
  *    Jon Grimm             <jgrimm@us.ibm.com>
  *    Daisy Chang	    <daisyc@us.ibm.com>
  *    Sridhar Samudrala	    <sri@us.ibm.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #include <net/sctp/sctp.h>
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 825b7543a436..09b8daac87c8 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -31,18 +31,12 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    La Monte H.P. Yarroll <piggy@acm.org>
  *    Karl Knutson <karl@athena.chicago.il.us>
  *    Jon Grimm <jgrimm@austin.ibm.com>
  *    Daisy Chang <daisyc@us.ibm.com>
  *    Dajiang Zhang <dajiang.zhang@nokia.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #include <linux/types.h>
diff --git a/net/sctp/input.c b/net/sctp/input.c
index b9a25e18bb2b..2873e221e187 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -31,9 +31,6 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    La Monte H.P. Yarroll <piggy@acm.org>
  *    Karl Knutson <karl@athena.chicago.il.us>
@@ -43,9 +40,6 @@
  *    Daisy Chang <daisyc@us.ibm.com>
  *    Sridhar Samudrala <sri@us.ibm.com>
  *    Ardelle Fan <ardelle.fan@intel.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #include <linux/types.h>
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
index e70f60ae92ef..5856932fdc38 100644
--- a/net/sctp/inqueue.c
+++ b/net/sctp/inqueue.c
@@ -32,15 +32,9 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    La Monte H.P. Yarroll <piggy@acm.org>
  *    Karl Knutson <karl@athena.chicago.il.us>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 5a9402e3c0cf..da613ceae28c 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -29,9 +29,6 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    Le Yanqun		    <yanqun.le@nokia.com>
  *    Hui Huang		    <hui.huang@nokia.com>
@@ -42,9 +39,6 @@
  *
  * Based on:
  *	linux/net/ipv6/tcp_ipv6.c
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c
index aec346cbed67..5ea573b37648 100644
--- a/net/sctp/objcnt.c
+++ b/net/sctp/objcnt.c
@@ -28,14 +28,8 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    Jon Grimm             <jgrimm@us.ibm.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/net/sctp/output.c b/net/sctp/output.c
index cdb5f4914e17..e35b84cc1d9f 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -28,17 +28,11 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    La Monte H.P. Yarroll <piggy@acm.org>
  *    Karl Knutson          <karl@athena.chicago.il.us>
  *    Jon Grimm             <jgrimm@austin.ibm.com>
  *    Sridhar Samudrala     <sri@us.ibm.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 51313233e635..94df75877869 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -30,9 +30,6 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    La Monte H.P. Yarroll <piggy@acm.org>
  *    Karl Knutson          <karl@athena.chicago.il.us>
@@ -41,9 +38,6 @@
  *    Hui Huang 	    <hui.huang@nokia.com>
  *    Sridhar Samudrala     <sri@us.ibm.com>
  *    Jon Grimm             <jgrimm@us.ibm.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/net/sctp/primitive.c b/net/sctp/primitive.c
index 31fa437da117..ce1ffd811775 100644
--- a/net/sctp/primitive.c
+++ b/net/sctp/primitive.c
@@ -31,18 +31,12 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    La Monte H.P. Yarroll <piggy@acm.org>
  *    Narasimha Budihal     <narasimha@refcode.org>
  *    Karl Knutson          <karl@athena.chicago.il.us>
  *    Ardelle Fan	    <ardelle.fan@intel.com>
  *    Kevin Gao             <kevin.gao@intel.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #include <linux/types.h>
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index aff0cac45b64..82432bfb742f 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -24,14 +24,8 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    Sridhar Samudrala <sri@us.ibm.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #include <linux/types.h>
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index a570a6365f87..54482977a48f 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -31,9 +31,6 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    La Monte H.P. Yarroll <piggy@acm.org>
  *    Karl Knutson <karl@athena.chicago.il.us>
@@ -41,9 +38,6 @@
  *    Sridhar Samudrala <sri@us.ibm.com>
  *    Daisy Chang <daisyc@us.ibm.com>
  *    Ardelle Fan <ardelle.fan@intel.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 780a2d4b98a2..9c131338ccfa 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -31,9 +31,6 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    La Monte H.P. Yarroll <piggy@acm.org>
  *    Karl Knutson          <karl@athena.chicago.il.us>
@@ -45,9 +42,6 @@
  *    Daisy Chang	    <daisyc@us.ibm.com>
  *    Ardelle Fan	    <ardelle.fan@intel.com>
  *    Kevin Gao             <kevin.gao@intel.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index f1f3aac3f3bb..666c66842799 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -30,9 +30,6 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    La Monte H.P. Yarroll <piggy@acm.org>
  *    Karl Knutson          <karl@athena.chicago.il.us>
@@ -42,9 +39,6 @@
  *    Daisy Chang	    <daisyc@us.ibm.com>
  *    Sridhar Samudrala	    <sri@us.ibm.com>
  *    Ardelle Fan	    <ardelle.fan@intel.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 93271f0966c6..dfe3f36ff2aa 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -30,9 +30,6 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    La Monte H.P. Yarroll <piggy@acm.org>
  *    Karl Knutson          <karl@athena.chicago.il.us>
@@ -45,9 +42,6 @@
  *    Ardelle Fan	    <ardelle.fan@intel.com>
  *    Ryan Layer	    <rmlayer@us.ibm.com>
  *    Kevin Gao		    <kevin.gao@intel.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c
index 64ea5fd87eb2..c5999b2dde7d 100644
--- a/net/sctp/sm_statetable.c
+++ b/net/sctp/sm_statetable.c
@@ -30,9 +30,6 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    La Monte H.P. Yarroll <piggy@acm.org>
  *    Karl Knutson          <karl@athena.chicago.il.us>
@@ -41,9 +38,6 @@
  *    Daisy Chang	    <daisyc@us.ibm.com>
  *    Ardelle Fan	    <ardelle.fan@intel.com>
  *    Sridhar Samudrala	    <sri@us.ibm.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 02457123bdee..d5d5882a2891 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -36,9 +36,6 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    La Monte H.P. Yarroll <piggy@acm.org>
  *    Narasimha Budihal     <narsi@refcode.org>
@@ -52,9 +49,6 @@
  *    Ryan Layer	    <rmlayer@us.ibm.com>
  *    Anup Pemmaiah         <pemmaiah@cc.usu.edu>
  *    Kevin Gao             <kevin.gao@intel.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/net/sctp/ssnmap.c b/net/sctp/ssnmap.c
index 72b5939101bb..6007124aefa0 100644
--- a/net/sctp/ssnmap.c
+++ b/net/sctp/ssnmap.c
@@ -26,14 +26,8 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    Jon Grimm             <jgrimm@us.ibm.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #include <linux/types.h>
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 754809a7183d..1b1ee769b5bf 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -27,18 +27,12 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    Mingqin Liu           <liuming@us.ibm.com>
  *    Jon Grimm             <jgrimm@us.ibm.com>
  *    Ardelle Fan           <ardelle.fan@intel.com>
  *    Ryan Layer            <rmlayer@us.ibm.com>
  *    Sridhar Samudrala     <sri@us.ibm.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #include <net/sctp/structs.h>
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 9602c52aa617..6836ead5c957 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -32,9 +32,6 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    La Monte H.P. Yarroll <piggy@acm.org>
  *    Karl Knutson          <karl@athena.chicago.il.us>
@@ -43,9 +40,6 @@
  *    Hui Huang             <hui.huang@nokia.com>
  *    Sridhar Samudrala	    <sri@us.ibm.com>
  *    Ardelle Fan	    <ardelle.fan@intel.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/net/sctp/tsnmap.c b/net/sctp/tsnmap.c
index 0eff8667d4c7..fbda20028285 100644
--- a/net/sctp/tsnmap.c
+++ b/net/sctp/tsnmap.c
@@ -29,17 +29,11 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    La Monte H.P. Yarroll <piggy@acm.org>
  *    Jon Grimm             <jgrimm@us.ibm.com>
  *    Karl Knutson          <karl@athena.chicago.il.us>
  *    Sridhar Samudrala     <sri@us.ibm.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #include <linux/slab.h>
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index c07624fca7e5..81089ed65456 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -30,17 +30,11 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    Jon Grimm             <jgrimm@us.ibm.com>
  *    La Monte H.P. Yarroll <piggy@acm.org>
  *    Ardelle Fan	    <ardelle.fan@intel.com>
  *    Sridhar Samudrala     <sri@us.ibm.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #include <linux/slab.h>
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 2cdb3010f50f..1c1484ed605d 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -29,16 +29,10 @@
  * email address(es):
  *    lksctp developers <linux-sctp@vger.kernel.org>
  *
- * Or submit a bug report through the following website:
- *    http://www.sf.net/projects/lksctp
- *
  * Written or modified by:
  *    Jon Grimm             <jgrimm@us.ibm.com>
  *    La Monte H.P. Yarroll <piggy@acm.org>
  *    Sridhar Samudrala     <sri@us.ibm.com>
- *
- * Any bugs reported given to us we will try to fix... any fixes shared will
- * be incorporated into the next SCTP release.
  */
 
 #include <linux/slab.h>
-- 
cgit v1.2.3


From 71acc0ddd499cc323199fb1ae350ce9ea0744352 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 9 Aug 2013 13:09:41 -0700
Subject: Revert "net: sctp: convert sctp_checksum_disable module param into
 sctp sysctl"

This reverts commit cda5f98e36576596b9230483ec52bff3cc97eb21.

As per Vlad's request.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  8 --------
 include/net/netns/sctp.h               |  3 ---
 include/net/sctp/structs.h             |  5 +++++
 net/sctp/input.c                       |  4 ++--
 net/sctp/output.c                      |  5 +----
 net/sctp/protocol.c                    |  5 ++---
 net/sctp/sysctl.c                      | 10 +---------
 7 files changed, 11 insertions(+), 29 deletions(-)

(limited to 'net')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 1b27b0b8687f..36be26b2ef7a 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1507,14 +1507,6 @@ sack_timeout - INTEGER
 
 	Default: 200
 
-checksum_disable - BOOLEAN
-	Disable SCTP checksum computing and verification for debugging purpose.
-
-	1: Disable checksumming
-	0: Enable checksumming
-
-	Default: 0
-
 valid_cookie_life - INTEGER
 	The default lifetime of the SCTP cookie (in milliseconds).  The cookie
 	is used during association establishment.
diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
index ebfdf1e7d402..3573a81815ad 100644
--- a/include/net/netns/sctp.h
+++ b/include/net/netns/sctp.h
@@ -129,9 +129,6 @@ struct netns_sctp {
 
 	/* Threshold for autoclose timeout, in seconds. */
 	unsigned long max_autoclose;
-
-	/* Flag to disable SCTP checksumming. */
-	int checksum_disable;
 };
 
 #endif /* __NETNS_SCTP_H__ */
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 78eedf0cd4b3..422db6cc3214 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -135,6 +135,10 @@ extern struct sctp_globals {
 	/* This is the sctp port control hash.	*/
 	int port_hashsize;
 	struct sctp_bind_hashbucket *port_hashtable;
+
+	/* Flag to indicate whether computing and verifying checksum
+	 * is disabled. */
+        bool checksum_disable;
 } sctp_globals;
 
 #define sctp_max_instreams		(sctp_globals.max_instreams)
@@ -146,6 +150,7 @@ extern struct sctp_globals {
 #define sctp_assoc_hashtable		(sctp_globals.assoc_hashtable)
 #define sctp_port_hashsize		(sctp_globals.port_hashsize)
 #define sctp_port_hashtable		(sctp_globals.port_hashtable)
+#define sctp_checksum_disable		(sctp_globals.checksum_disable)
 
 /* SCTP Socket type: UDP or TCP style. */
 typedef enum {
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 2873e221e187..5f2068679f83 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -134,8 +134,8 @@ int sctp_rcv(struct sk_buff *skb)
 	__skb_pull(skb, skb_transport_offset(skb));
 	if (skb->len < sizeof(struct sctphdr))
 		goto discard_it;
-	if (!net->sctp.checksum_disable && !skb_csum_unnecessary(skb) &&
-	    sctp_rcv_checksum(net, skb) < 0)
+	if (!sctp_checksum_disable && !skb_csum_unnecessary(skb) &&
+		  sctp_rcv_checksum(net, skb) < 0)
 		goto discard_it;
 
 	skb_pull(skb, sizeof(struct sctphdr));
diff --git a/net/sctp/output.c b/net/sctp/output.c
index e35b84cc1d9f..0ac3a65daccb 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -389,7 +389,6 @@ int sctp_packet_transmit(struct sctp_packet *packet)
 	int padding;		/* How much padding do we need?  */
 	__u8 has_data = 0;
 	struct dst_entry *dst = tp->dst;
-	struct net *net;
 	unsigned char *auth = NULL;	/* pointer to auth in skb data */
 	__u32 cksum_buf_len = sizeof(struct sctphdr);
 
@@ -536,9 +535,7 @@ int sctp_packet_transmit(struct sctp_packet *packet)
 	 * Note: Adler-32 is no longer applicable, as has been replaced
 	 * by CRC32-C as described in <draft-ietf-tsvwg-sctpcsum-02.txt>.
 	 */
-	net = dev_net(dst->dev);
-
-	if (!net->sctp.checksum_disable) {
+	if (!sctp_checksum_disable) {
 		if (!(dst->dev->features & NETIF_F_SCTP_CSUM)) {
 			__u32 crc32 = sctp_start_cksum((__u8 *)sh, cksum_buf_len);
 
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 54482977a48f..5e17092f4ada 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1187,9 +1187,6 @@ static int __net_init sctp_net_init(struct net *net)
 	/* Whether Cookie Preservative is enabled(1) or not(0) */
 	net->sctp.cookie_preserve_enable 	= 1;
 
-	/* Whether SCTP checksumming is disabled(1) or not(0) */
-	net->sctp.checksum_disable		= 0;
-
 	/* Default sctp sockets to use md5 as their hmac alg */
 #if defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5)
 	net->sctp.sctp_hmac_alg			= "md5";
@@ -1546,4 +1543,6 @@ MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-132");
 MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-132");
 MODULE_AUTHOR("Linux Kernel SCTP developers <linux-sctp@vger.kernel.org>");
 MODULE_DESCRIPTION("Support for the SCTP protocol (RFC2960)");
+module_param_named(no_checksums, sctp_checksum_disable, bool, 0644);
+MODULE_PARM_DESC(no_checksums, "Disable checksums computing and verification");
 MODULE_LICENSE("GPL");
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 1b1ee769b5bf..6b36561a1b3b 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -290,15 +290,7 @@ static struct ctl_table sctp_net_table[] = {
 		.extra1		= &max_autoclose_min,
 		.extra2		= &max_autoclose_max,
 	},
-	{
-		.procname	= "checksum_disable",
-		.data		= &init_net.sctp.checksum_disable,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
-	},
+
 	{ /* sentinel */ }
 };
 
-- 
cgit v1.2.3


From 149479d019e06df5a7f4096f95c00cfb1380309c Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 8 Aug 2013 14:06:22 -0700
Subject: tcp: add server ip to encrypt cookie in fast open

Encrypt the cookie with both server and client IPv4 addresses,
such that multi-homed server will grant different cookies
based on both the source and destination IPs. No client change
is needed since cookie is opaque to the client.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h       |  3 ++-
 net/ipv4/tcp_fastopen.c | 13 ++++++-------
 net/ipv4/tcp_ipv4.c     | 10 +++++++---
 3 files changed, 15 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 27b652f379ef..09cb5c11ceea 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1300,7 +1300,8 @@ void tcp_free_fastopen_req(struct tcp_sock *tp);
 
 extern struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
 int tcp_fastopen_reset_cipher(void *key, unsigned int len);
-void tcp_fastopen_cookie_gen(__be32 addr, struct tcp_fastopen_cookie *foc);
+extern void tcp_fastopen_cookie_gen(__be32 src, __be32 dst,
+				    struct tcp_fastopen_cookie *foc);
 
 #define TCP_FASTOPEN_KEY_LENGTH 16
 
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 8f7ef0ad80e5..ab7bd35bb312 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -58,23 +58,22 @@ error:		kfree(ctx);
 	return err;
 }
 
-/* Computes the fastopen cookie for the peer.
- * The peer address is a 128 bits long (pad with zeros for IPv4).
+/* Computes the fastopen cookie for the IP path.
+ * The path is a 128 bits long (pad with zeros for IPv4).
  *
  * The caller must check foc->len to determine if a valid cookie
  * has been generated successfully.
 */
-void tcp_fastopen_cookie_gen(__be32 addr, struct tcp_fastopen_cookie *foc)
+void tcp_fastopen_cookie_gen(__be32 src, __be32 dst,
+			     struct tcp_fastopen_cookie *foc)
 {
-	__be32 peer_addr[4] = { addr, 0, 0, 0 };
+	__be32 path[4] = { src, dst, 0, 0 };
 	struct tcp_fastopen_context *ctx;
 
 	rcu_read_lock();
 	ctx = rcu_dereference(tcp_fastopen_ctx);
 	if (ctx) {
-		crypto_cipher_encrypt_one(ctx->tfm,
-					  foc->val,
-					  (__u8 *)peer_addr);
+		crypto_cipher_encrypt_one(ctx->tfm, foc->val, (__u8 *)path);
 		foc->len = TCP_FASTOPEN_COOKIE_SIZE;
 	}
 	rcu_read_unlock();
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 280efe5f19c1..ec2702882d8d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1316,9 +1316,11 @@ static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
 		tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 		return true;
 	}
+
 	if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
 		if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
-			tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
+			tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
+						ip_hdr(skb)->daddr, valid_foc);
 			if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
 			    memcmp(&foc->val[0], &valid_foc->val[0],
 			    TCP_FASTOPEN_COOKIE_SIZE) != 0)
@@ -1329,14 +1331,16 @@ static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
 		tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 		return true;
 	} else if (foc->len == 0) { /* Client requesting a cookie */
-		tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
+		tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
+					ip_hdr(skb)->daddr, valid_foc);
 		NET_INC_STATS_BH(sock_net(sk),
 		    LINUX_MIB_TCPFASTOPENCOOKIEREQD);
 	} else {
 		/* Client sent a cookie with wrong size. Treat it
 		 * the same as invalid and return a valid one.
 		 */
-		tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
+		tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
+					ip_hdr(skb)->daddr, valid_foc);
 	}
 	return false;
 }
-- 
cgit v1.2.3


From e370a7236321773245c5522d8bb299380830d3b2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 8 Aug 2013 14:37:32 -0700
Subject: af_unix: improve STREAM behavior with fragmented memory

unix_stream_sendmsg() currently uses order-2 allocations,
and we had numerous reports this can fail.

The __GFP_REPEAT flag present in sock_alloc_send_pskb() is
not helping.

This patch extends the work done in commit eb6a24816b247c
("af_unix: reduce high order page allocations) for
datagram sockets.

This opens the possibility of zero copy IO (splice() and
friends)

The trick is to not use skb_pull() anymore in recvmsg() path,
and instead add a @consumed field in UNIXCB() to track amount
of already read payload in the skb.

There is a performance regression for large sends
because of extra page allocations that will be addressed
in a follow-up patch, allowing sock_alloc_send_pskb()
to attempt high order page allocations.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/af_unix.h |  1 +
 net/unix/af_unix.c    | 65 ++++++++++++++++++++++++---------------------------
 2 files changed, 31 insertions(+), 35 deletions(-)

(limited to 'net')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 05442df4ca89..a175ba4a7adb 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -35,6 +35,7 @@ struct unix_skb_parms {
 #ifdef CONFIG_SECURITY_NETWORK
 	u32			secid;		/* Security ID		*/
 #endif
+	u32			consumed;
 };
 
 #define UNIXCB(skb) 	(*(struct unix_skb_parms *)&((skb)->cb))
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index c4ce243824bb..99dc760cdd95 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1596,6 +1596,10 @@ out:
 	return err;
 }
 
+/* We use paged skbs for stream sockets, and limit occupancy to 32768
+ * bytes, and a minimun of a full page.
+ */
+#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
 
 static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 			       struct msghdr *msg, size_t len)
@@ -1609,6 +1613,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	struct scm_cookie tmp_scm;
 	bool fds_sent = false;
 	int max_level;
+	int data_len;
 
 	if (NULL == siocb->scm)
 		siocb->scm = &tmp_scm;
@@ -1635,40 +1640,21 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		goto pipe_err;
 
 	while (sent < len) {
-		/*
-		 *	Optimisation for the fact that under 0.01% of X
-		 *	messages typically need breaking up.
-		 */
-
-		size = len-sent;
+		size = len - sent;
 
 		/* Keep two messages in the pipe so it schedules better */
-		if (size > ((sk->sk_sndbuf >> 1) - 64))
-			size = (sk->sk_sndbuf >> 1) - 64;
+		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
 
-		if (size > SKB_MAX_ALLOC)
-			size = SKB_MAX_ALLOC;
-
-		/*
-		 *	Grab a buffer
-		 */
+		/* allow fallback to order-0 allocations */
+		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
 
-		skb = sock_alloc_send_skb(sk, size, msg->msg_flags&MSG_DONTWAIT,
-					  &err);
+		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
 
-		if (skb == NULL)
+		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
+					   msg->msg_flags & MSG_DONTWAIT, &err);
+		if (!skb)
 			goto out_err;
 
-		/*
-		 *	If you pass two values to the sock_alloc_send_skb
-		 *	it tries to grab the large buffer with GFP_NOFS
-		 *	(which can fail easily), and if it fails grab the
-		 *	fallback size buffer which is under a page and will
-		 *	succeed. [Alan]
-		 */
-		size = min_t(int, size, skb_tailroom(skb));
-
-
 		/* Only send the fds in the first buffer */
 		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
 		if (err < 0) {
@@ -1678,7 +1664,10 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		max_level = err + 1;
 		fds_sent = true;
 
-		err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
+		skb_put(skb, size - data_len);
+		skb->data_len = data_len;
+		skb->len = size;
+		err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, size);
 		if (err) {
 			kfree_skb(skb);
 			goto out_err;
@@ -1890,6 +1879,11 @@ static long unix_stream_data_wait(struct sock *sk, long timeo,
 	return timeo;
 }
 
+static unsigned int unix_skb_len(const struct sk_buff *skb)
+{
+	return skb->len - UNIXCB(skb).consumed;
+}
+
 static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 			       struct msghdr *msg, size_t size,
 			       int flags)
@@ -1977,8 +1971,8 @@ again:
 		}
 
 		skip = sk_peek_offset(sk, flags);
-		while (skip >= skb->len) {
-			skip -= skb->len;
+		while (skip >= unix_skb_len(skb)) {
+			skip -= unix_skb_len(skb);
 			last = skb;
 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
 			if (!skb)
@@ -2005,8 +1999,9 @@ again:
 			sunaddr = NULL;
 		}
 
-		chunk = min_t(unsigned int, skb->len - skip, size);
-		if (memcpy_toiovec(msg->msg_iov, skb->data + skip, chunk)) {
+		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
+		if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed + skip,
+					    msg->msg_iov, chunk)) {
 			if (copied == 0)
 				copied = -EFAULT;
 			break;
@@ -2016,14 +2011,14 @@ again:
 
 		/* Mark read part of skb as used */
 		if (!(flags & MSG_PEEK)) {
-			skb_pull(skb, chunk);
+			UNIXCB(skb).consumed += chunk;
 
 			sk_peek_offset_bwd(sk, chunk);
 
 			if (UNIXCB(skb).fp)
 				unix_detach_fds(siocb->scm, skb);
 
-			if (skb->len)
+			if (unix_skb_len(skb))
 				break;
 
 			skb_unlink(skb, &sk->sk_receive_queue);
@@ -2107,7 +2102,7 @@ long unix_inq_len(struct sock *sk)
 	if (sk->sk_type == SOCK_STREAM ||
 	    sk->sk_type == SOCK_SEQPACKET) {
 		skb_queue_walk(&sk->sk_receive_queue, skb)
-			amount += skb->len;
+			amount += unix_skb_len(skb);
 	} else {
 		skb = skb_peek(&sk->sk_receive_queue);
 		if (skb)
-- 
cgit v1.2.3


From 28d6427109d13b0f447cba5761f88d3548e83605 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 8 Aug 2013 14:38:47 -0700
Subject: net: attempt high order allocations in sock_alloc_send_pskb()

Adding paged frags skbs to af_unix sockets introduced a performance
regression on large sends because of additional page allocations, even
if each skb could carry at least 100% more payload than before.

We can instruct sock_alloc_send_pskb() to attempt high order
allocations.

Most of the time, it does a single page allocation instead of 8.

I added an additional parameter to sock_alloc_send_pskb() to
let other users to opt-in for this new feature on followup patches.

Tested:

Before patch :

$ netperf -t STREAM_STREAM
STREAM STREAM TEST
Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec

 2304  212992  212992    10.00    46861.15

After patch :

$ netperf -t STREAM_STREAM
STREAM STREAM TEST
Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec

 2304  212992  212992    10.00    57981.11

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macvtap.c  |   2 +-
 drivers/net/tun.c      |   2 +-
 include/net/sock.h     |   3 +-
 net/core/sock.c        | 100 +++++++++++++++++++++++++------------------------
 net/packet/af_packet.c |   2 +-
 net/unix/af_unix.c     |   6 ++-
 6 files changed, 60 insertions(+), 55 deletions(-)

(limited to 'net')

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 182364abfa35..8f6056d9ba97 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -524,7 +524,7 @@ static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad,
 		linear = len;
 
 	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
-				   err);
+				   err, 0);
 	if (!skb)
 		return NULL;
 
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index b1630479d4a1..978d8654b14a 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -949,7 +949,7 @@ static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
 		linear = len;
 
 	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
-				   &err);
+				   &err, 0);
 	if (!skb)
 		return ERR_PTR(err);
 
diff --git a/include/net/sock.h b/include/net/sock.h
index ab6a8b75207e..e4bbcbfd07ea 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1539,7 +1539,8 @@ extern struct sk_buff		*sock_alloc_send_pskb(struct sock *sk,
 						      unsigned long header_len,
 						      unsigned long data_len,
 						      int noblock,
-						      int *errcode);
+						      int *errcode,
+						      int max_page_order);
 extern void *sock_kmalloc(struct sock *sk, int size,
 			  gfp_t priority);
 extern void sock_kfree_s(struct sock *sk, void *mem, int size);
diff --git a/net/core/sock.c b/net/core/sock.c
index 83667de45ec9..5b6beba494a3 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1741,24 +1741,23 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)
 
 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
 				     unsigned long data_len, int noblock,
-				     int *errcode)
+				     int *errcode, int max_page_order)
 {
-	struct sk_buff *skb;
+	struct sk_buff *skb = NULL;
+	unsigned long chunk;
 	gfp_t gfp_mask;
 	long timeo;
 	int err;
 	int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
+	struct page *page;
+	int i;
 
 	err = -EMSGSIZE;
 	if (npages > MAX_SKB_FRAGS)
 		goto failure;
 
-	gfp_mask = sk->sk_allocation;
-	if (gfp_mask & __GFP_WAIT)
-		gfp_mask |= __GFP_REPEAT;
-
 	timeo = sock_sndtimeo(sk, noblock);
-	while (1) {
+	while (!skb) {
 		err = sock_error(sk);
 		if (err != 0)
 			goto failure;
@@ -1767,50 +1766,52 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
 		if (sk->sk_shutdown & SEND_SHUTDOWN)
 			goto failure;
 
-		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
-			skb = alloc_skb(header_len, gfp_mask);
-			if (skb) {
-				int i;
-
-				/* No pages, we're done... */
-				if (!data_len)
-					break;
-
-				skb->truesize += data_len;
-				skb_shinfo(skb)->nr_frags = npages;
-				for (i = 0; i < npages; i++) {
-					struct page *page;
-
-					page = alloc_pages(sk->sk_allocation, 0);
-					if (!page) {
-						err = -ENOBUFS;
-						skb_shinfo(skb)->nr_frags = i;
-						kfree_skb(skb);
-						goto failure;
-					}
-
-					__skb_fill_page_desc(skb, i,
-							page, 0,
-							(data_len >= PAGE_SIZE ?
-							 PAGE_SIZE :
-							 data_len));
-					data_len -= PAGE_SIZE;
-				}
+		if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) {
+			set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+			err = -EAGAIN;
+			if (!timeo)
+				goto failure;
+			if (signal_pending(current))
+				goto interrupted;
+			timeo = sock_wait_for_wmem(sk, timeo);
+			continue;
+		}
 
-				/* Full success... */
-				break;
-			}
-			err = -ENOBUFS;
+		err = -ENOBUFS;
+		gfp_mask = sk->sk_allocation;
+		if (gfp_mask & __GFP_WAIT)
+			gfp_mask |= __GFP_REPEAT;
+
+		skb = alloc_skb(header_len, gfp_mask);
+		if (!skb)
 			goto failure;
+
+		skb->truesize += data_len;
+
+		for (i = 0; npages > 0; i++) {
+			int order = max_page_order;
+
+			while (order) {
+				if (npages >= 1 << order) {
+					page = alloc_pages(sk->sk_allocation |
+							   __GFP_COMP | __GFP_NOWARN,
+							   order);
+					if (page)
+						goto fill_page;
+				}
+				order--;
+			}
+			page = alloc_page(sk->sk_allocation);
+			if (!page)
+				goto failure;
+fill_page:
+			chunk = min_t(unsigned long, data_len,
+				      PAGE_SIZE << order);
+			skb_fill_page_desc(skb, i, page, 0, chunk);
+			data_len -= chunk;
+			npages -= 1 << order;
 		}
-		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
-		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-		err = -EAGAIN;
-		if (!timeo)
-			goto failure;
-		if (signal_pending(current))
-			goto interrupted;
-		timeo = sock_wait_for_wmem(sk, timeo);
 	}
 
 	skb_set_owner_w(skb, sk);
@@ -1819,6 +1820,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
 interrupted:
 	err = sock_intr_errno(timeo);
 failure:
+	kfree_skb(skb);
 	*errcode = err;
 	return NULL;
 }
@@ -1827,7 +1829,7 @@ EXPORT_SYMBOL(sock_alloc_send_pskb);
 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
 				    int noblock, int *errcode)
 {
-	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
+	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
 }
 EXPORT_SYMBOL(sock_alloc_send_skb);
 
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 4cb28a7f639b..6c53dd9f5ccc 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2181,7 +2181,7 @@ static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
 		linear = len;
 
 	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
-				   err);
+				   err, 0);
 	if (!skb)
 		return NULL;
 
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 99dc760cdd95..fee9e3397cd1 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1479,7 +1479,8 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
 				 MAX_SKB_FRAGS * PAGE_SIZE);
 
 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
-				   msg->msg_flags & MSG_DONTWAIT, &err);
+				   msg->msg_flags & MSG_DONTWAIT, &err,
+				   PAGE_ALLOC_COSTLY_ORDER);
 	if (skb == NULL)
 		goto out;
 
@@ -1651,7 +1652,8 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
 
 		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
-					   msg->msg_flags & MSG_DONTWAIT, &err);
+					   msg->msg_flags & MSG_DONTWAIT, &err,
+					   get_order(UNIX_SKB_FRAGS_SZ));
 		if (!skb)
 			goto out_err;
 
-- 
cgit v1.2.3


From f3dfd20860db3d0c400dd83a378176a28d3662db Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 11 Aug 2013 21:54:48 -0700
Subject: af_unix: fix bug on large send()

commit e370a723632 ("af_unix: improve STREAM behavior with fragmented
memory") added a bug on large send() because the
skb_copy_datagram_from_iovec() call always start from the beginning
of iovec.

We must instead use the @sent variable to properly skip the
already processed part.

Reported-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index fee9e3397cd1..86de99ad2976 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1669,7 +1669,8 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		skb_put(skb, size - data_len);
 		skb->data_len = data_len;
 		skb->len = size;
-		err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, size);
+		err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov,
+						   sent, size);
 		if (err) {
 			kfree_skb(skb);
 			goto out_err;
-- 
cgit v1.2.3


From 1da5fcc86d71040c5b294ca5611ae6c86bfa815c Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 6 Aug 2013 14:10:48 +0200
Subject: nl80211: clean up CQM settings code

Clean up the CQM settings code a bit and while at it
enforce that when setting the threshold to 0 (disable)
the hysteresis is also set to 0 to avoid confusion.
As we haven't enforce it, simply override userspace.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 51 ++++++++++++++++++++++++--------------------------
 1 file changed, 24 insertions(+), 27 deletions(-)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index f7cb12178bd2..c2a40a2e56bd 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -7562,14 +7562,12 @@ static int nl80211_set_cqm_txe(struct genl_info *info,
 			       u32 rate, u32 pkts, u32 intvl)
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
-	struct wireless_dev *wdev;
 	struct net_device *dev = info->user_ptr[1];
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
 
 	if (rate > 100 || intvl > NL80211_CQM_TXE_MAX_INTVL)
 		return -EINVAL;
 
-	wdev = dev->ieee80211_ptr;
-
 	if (!rdev->ops->set_cqm_txe_config)
 		return -EOPNOTSUPP;
 
@@ -7584,13 +7582,15 @@ static int nl80211_set_cqm_rssi(struct genl_info *info,
 				s32 threshold, u32 hysteresis)
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
-	struct wireless_dev *wdev;
 	struct net_device *dev = info->user_ptr[1];
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
 
 	if (threshold > 0)
 		return -EINVAL;
 
-	wdev = dev->ieee80211_ptr;
+	/* disabling - hysteresis should also be zero then */
+	if (threshold == 0)
+		hysteresis = 0;
 
 	if (!rdev->ops->set_cqm_rssi_config)
 		return -EOPNOTSUPP;
@@ -7609,36 +7609,33 @@ static int nl80211_set_cqm(struct sk_buff *skb, struct genl_info *info)
 	int err;
 
 	cqm = info->attrs[NL80211_ATTR_CQM];
-	if (!cqm) {
-		err = -EINVAL;
-		goto out;
-	}
+	if (!cqm)
+		return -EINVAL;
 
 	err = nla_parse_nested(attrs, NL80211_ATTR_CQM_MAX, cqm,
 			       nl80211_attr_cqm_policy);
 	if (err)
-		goto out;
+		return err;
 
 	if (attrs[NL80211_ATTR_CQM_RSSI_THOLD] &&
 	    attrs[NL80211_ATTR_CQM_RSSI_HYST]) {
-		s32 threshold;
-		u32 hysteresis;
-		threshold = nla_get_u32(attrs[NL80211_ATTR_CQM_RSSI_THOLD]);
-		hysteresis = nla_get_u32(attrs[NL80211_ATTR_CQM_RSSI_HYST]);
-		err = nl80211_set_cqm_rssi(info, threshold, hysteresis);
-	} else if (attrs[NL80211_ATTR_CQM_TXE_RATE] &&
-		   attrs[NL80211_ATTR_CQM_TXE_PKTS] &&
-		   attrs[NL80211_ATTR_CQM_TXE_INTVL]) {
-		u32 rate, pkts, intvl;
-		rate = nla_get_u32(attrs[NL80211_ATTR_CQM_TXE_RATE]);
-		pkts = nla_get_u32(attrs[NL80211_ATTR_CQM_TXE_PKTS]);
-		intvl = nla_get_u32(attrs[NL80211_ATTR_CQM_TXE_INTVL]);
-		err = nl80211_set_cqm_txe(info, rate, pkts, intvl);
-	} else
-		err = -EINVAL;
+		s32 threshold = nla_get_s32(attrs[NL80211_ATTR_CQM_RSSI_THOLD]);
+		u32 hysteresis = nla_get_u32(attrs[NL80211_ATTR_CQM_RSSI_HYST]);
 
-out:
-	return err;
+		return nl80211_set_cqm_rssi(info, threshold, hysteresis);
+	}
+
+	if (attrs[NL80211_ATTR_CQM_TXE_RATE] &&
+	    attrs[NL80211_ATTR_CQM_TXE_PKTS] &&
+	    attrs[NL80211_ATTR_CQM_TXE_INTVL]) {
+		u32 rate = nla_get_u32(attrs[NL80211_ATTR_CQM_TXE_RATE]);
+		u32 pkts = nla_get_u32(attrs[NL80211_ATTR_CQM_TXE_PKTS]);
+		u32 intvl = nla_get_u32(attrs[NL80211_ATTR_CQM_TXE_INTVL]);
+
+		return nl80211_set_cqm_txe(info, rate, pkts, intvl);
+	}
+
+	return -EINVAL;
 }
 
 static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info)
-- 
cgit v1.2.3


From af61a165187bb94b1dc7628ef815c23d0eacf40b Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 2 Jul 2013 18:09:12 +0200
Subject: mac80211: add control port protocol TX control flag

A lot of drivers check the frame protocol for ETH_P_PAE,
for various reasons (like making those more reliable).
Add a new flags bitmap to the TX control info and a new
flag indicating the control port protocol is in use to
let all drivers also apply such logic to other control
port protocols, should they be configured.

Also use the new flag in the iwlwifi drivers.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 Documentation/DocBook/80211.tmpl            |  1 +
 drivers/net/wireless/iwlwifi/dvm/tx.c       |  2 +-
 drivers/net/wireless/iwlwifi/iwl-devtrace.h |  7 ++++---
 drivers/net/wireless/iwlwifi/mvm/tx.c       |  9 ++++-----
 include/net/mac80211.h                      | 19 ++++++++++++++++---
 net/mac80211/rc80211_minstrel_ht.c          |  5 +++--
 net/mac80211/tx.c                           |  8 +++++---
 7 files changed, 34 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/Documentation/DocBook/80211.tmpl b/Documentation/DocBook/80211.tmpl
index 49267ea97568..f403ec3c5c9a 100644
--- a/Documentation/DocBook/80211.tmpl
+++ b/Documentation/DocBook/80211.tmpl
@@ -325,6 +325,7 @@
           <title>functions/definitions</title>
 !Finclude/net/mac80211.h ieee80211_rx_status
 !Finclude/net/mac80211.h mac80211_rx_flags
+!Finclude/net/mac80211.h mac80211_tx_info_flags
 !Finclude/net/mac80211.h mac80211_tx_control_flags
 !Finclude/net/mac80211.h mac80211_rate_control_flags
 !Finclude/net/mac80211.h ieee80211_tx_rate
diff --git a/drivers/net/wireless/iwlwifi/dvm/tx.c b/drivers/net/wireless/iwlwifi/dvm/tx.c
index 5ee983faa679..f364583b1535 100644
--- a/drivers/net/wireless/iwlwifi/dvm/tx.c
+++ b/drivers/net/wireless/iwlwifi/dvm/tx.c
@@ -87,7 +87,7 @@ static void iwlagn_tx_cmd_build_basic(struct iwl_priv *priv,
 		 priv->lib->bt_params->advanced_bt_coexist &&
 		 (ieee80211_is_auth(fc) || ieee80211_is_assoc_req(fc) ||
 		 ieee80211_is_reassoc_req(fc) ||
-		 skb->protocol == cpu_to_be16(ETH_P_PAE)))
+		 info->control.flags & IEEE80211_TX_CTRL_PORT_CTRL_PROTO))
 		tx_flags |= TX_CMD_FLG_IGNORE_BT;
 
 
diff --git a/drivers/net/wireless/iwlwifi/iwl-devtrace.h b/drivers/net/wireless/iwlwifi/iwl-devtrace.h
index 4491c1c72cc7..684c416d3493 100644
--- a/drivers/net/wireless/iwlwifi/iwl-devtrace.h
+++ b/drivers/net/wireless/iwlwifi/iwl-devtrace.h
@@ -33,10 +33,11 @@
 static inline bool iwl_trace_data(struct sk_buff *skb)
 {
 	struct ieee80211_hdr *hdr = (void *)skb->data;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 
-	if (ieee80211_is_data(hdr->frame_control))
-		return skb->protocol != cpu_to_be16(ETH_P_PAE);
-	return false;
+	if (!ieee80211_is_data(hdr->frame_control))
+		return false;
+	return !(info->control.flags & IEEE80211_TX_CTRL_PORT_CTRL_PROTO);
 }
 
 static inline size_t iwl_rx_trace_len(const struct iwl_trans *trans,
diff --git a/drivers/net/wireless/iwlwifi/mvm/tx.c b/drivers/net/wireless/iwlwifi/mvm/tx.c
index f0e96a927407..d62a6d3053ff 100644
--- a/drivers/net/wireless/iwlwifi/mvm/tx.c
+++ b/drivers/net/wireless/iwlwifi/mvm/tx.c
@@ -91,11 +91,10 @@ static void iwl_mvm_set_tx_cmd(struct iwl_mvm *mvm, struct sk_buff *skb,
 		tx_flags |= TX_CMD_FLG_ACK | TX_CMD_FLG_BAR;
 
 	/* High prio packet (wrt. BT coex) if it is EAPOL, MCAST or MGMT */
-	if (info->band == IEEE80211_BAND_2GHZ        &&
-	    (skb->protocol == cpu_to_be16(ETH_P_PAE)  ||
-	     is_multicast_ether_addr(hdr->addr1)      ||
-	     ieee80211_is_back_req(fc)                ||
-	     ieee80211_is_mgmt(fc)))
+	if (info->band == IEEE80211_BAND_2GHZ &&
+	    (info->control.flags & IEEE80211_TX_CTRL_PORT_CTRL_PROTO ||
+	     is_multicast_ether_addr(hdr->addr1) ||
+	     ieee80211_is_back_req(fc) || ieee80211_is_mgmt(fc)))
 		tx_flags |= TX_CMD_FLG_BT_DIS;
 
 	if (ieee80211_has_morefrags(fc))
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 9cda3728c2cb..b70c00111323 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -375,7 +375,7 @@ struct ieee80211_bss_conf {
 };
 
 /**
- * enum mac80211_tx_control_flags - flags to describe transmission information/status
+ * enum mac80211_tx_info_flags - flags to describe transmission information/status
  *
  * These flags are used with the @flags member of &ieee80211_tx_info.
  *
@@ -471,7 +471,7 @@ struct ieee80211_bss_conf {
  * Note: If you have to add new flags to the enumeration, then don't
  *	 forget to update %IEEE80211_TX_TEMPORARY_FLAGS when necessary.
  */
-enum mac80211_tx_control_flags {
+enum mac80211_tx_info_flags {
 	IEEE80211_TX_CTL_REQ_TX_STATUS		= BIT(0),
 	IEEE80211_TX_CTL_ASSIGN_SEQ		= BIT(1),
 	IEEE80211_TX_CTL_NO_ACK			= BIT(2),
@@ -507,6 +507,18 @@ enum mac80211_tx_control_flags {
 
 #define IEEE80211_TX_CTL_STBC_SHIFT		23
 
+/**
+ * enum mac80211_tx_control_flags - flags to describe transmit control
+ *
+ * @IEEE80211_TX_CTRL_PORT_CTRL_PROTO: this frame is a port control
+ *	protocol frame (e.g. EAP)
+ *
+ * These flags are used in tx_info->control.flags.
+ */
+enum mac80211_tx_control_flags {
+	IEEE80211_TX_CTRL_PORT_CTRL_PROTO	= BIT(0),
+};
+
 /*
  * This definition is used as a mask to clear all temporary flags, which are
  * set by the tx handlers for each transmission attempt by the mac80211 stack.
@@ -680,7 +692,8 @@ struct ieee80211_tx_info {
 			/* NB: vif can be NULL for injected frames */
 			struct ieee80211_vif *vif;
 			struct ieee80211_key_conf *hw_key;
-			/* 8 bytes free */
+			u32 flags;
+			/* 4 bytes free */
 		} control;
 		struct {
 			struct ieee80211_tx_rate rates[IEEE80211_TX_MAX_RATES];
diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index 7475a7a33797..9eff3824f2d1 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -439,12 +439,13 @@ minstrel_aggr_check(struct ieee80211_sta *pubsta, struct sk_buff *skb)
 {
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
 	struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	u16 tid;
 
 	if (unlikely(!ieee80211_is_data_qos(hdr->frame_control)))
 		return;
 
-	if (unlikely(skb->protocol == cpu_to_be16(ETH_P_PAE)))
+	if (unlikely(info->control.flags & IEEE80211_TX_CTRL_PORT_CTRL_PROTO))
 		return;
 
 	tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;
@@ -776,7 +777,7 @@ minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta,
 
 	/* Don't use EAPOL frames for sampling on non-mrr hw */
 	if (mp->hw->max_rates == 1 &&
-	    txrc->skb->protocol == cpu_to_be16(ETH_P_PAE))
+	    (info->control.flags & IEEE80211_TX_CTRL_PORT_CTRL_PROTO))
 		sample_idx = -1;
 	else
 		sample_idx = minstrel_get_sample_rate(mp, mi);
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 0e42322aa6b1..098ae854ad3c 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -539,9 +539,11 @@ ieee80211_tx_h_check_control_port_protocol(struct ieee80211_tx_data *tx)
 {
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
 
-	if (unlikely(tx->sdata->control_port_protocol == tx->skb->protocol &&
-		     tx->sdata->control_port_no_encrypt))
-		info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
+	if (unlikely(tx->sdata->control_port_protocol == tx->skb->protocol)) {
+		if (tx->sdata->control_port_no_encrypt)
+			info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
+		info->control.flags |= IEEE80211_TX_CTRL_PORT_CTRL_PROTO;
+	}
 
 	return TX_CONTINUE;
 }
-- 
cgit v1.2.3


From fc73f11f5fa230f8c687d51b0fddb00433092ce0 Mon Sep 17 00:00:00 2001
From: David Spinadel <david.spinadel@intel.com>
Date: Wed, 31 Jul 2013 18:04:15 +0300
Subject: cfg80211: add wdev to testmode cmd

To allow drivers to implement per-interface testmode operations
more easily, pass a wdev pointer if any identification for one
was given from userspace. Clean up the code a bit while at it.

Signed-off-by: David Spinadel <david.spinadel@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath6kl/testmode.c         |  3 ++-
 drivers/net/wireless/ath/ath6kl/testmode.h         |  7 +++++--
 .../net/wireless/brcm80211/brcmfmac/wl_cfg80211.c  |  4 +++-
 include/net/cfg80211.h                             |  5 +++--
 net/mac80211/cfg.c                                 |  4 +++-
 net/wireless/nl80211.c                             | 23 ++++++++++++++++------
 net/wireless/rdev-ops.h                            |  5 +++--
 net/wireless/trace.h                               |  8 +++++---
 8 files changed, 41 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/drivers/net/wireless/ath/ath6kl/testmode.c b/drivers/net/wireless/ath/ath6kl/testmode.c
index acc9aa832f76..d67170ea1038 100644
--- a/drivers/net/wireless/ath/ath6kl/testmode.c
+++ b/drivers/net/wireless/ath/ath6kl/testmode.c
@@ -66,7 +66,8 @@ nla_put_failure:
 	ath6kl_warn("nla_put failed on testmode rx skb!\n");
 }
 
-int ath6kl_tm_cmd(struct wiphy *wiphy, void *data, int len)
+int ath6kl_tm_cmd(struct wiphy *wiphy, struct wireless_dev *wdev,
+		  void *data, int len)
 {
 	struct ath6kl *ar = wiphy_priv(wiphy);
 	struct nlattr *tb[ATH6KL_TM_ATTR_MAX + 1];
diff --git a/drivers/net/wireless/ath/ath6kl/testmode.h b/drivers/net/wireless/ath/ath6kl/testmode.h
index fe651d6707df..9fbcdec3e208 100644
--- a/drivers/net/wireless/ath/ath6kl/testmode.h
+++ b/drivers/net/wireless/ath/ath6kl/testmode.h
@@ -20,7 +20,8 @@
 #ifdef CONFIG_NL80211_TESTMODE
 
 void ath6kl_tm_rx_event(struct ath6kl *ar, void *buf, size_t buf_len);
-int ath6kl_tm_cmd(struct wiphy *wiphy, void *data, int len);
+int ath6kl_tm_cmd(struct wiphy *wiphy, struct wireless_dev *wdev,
+		  void *data, int len);
 
 #else
 
@@ -29,7 +30,9 @@ static inline void ath6kl_tm_rx_event(struct ath6kl *ar, void *buf,
 {
 }
 
-static inline int ath6kl_tm_cmd(struct wiphy *wiphy, void *data, int len)
+static inline int ath6kl_tm_cmd(struct wiphy *wiphy,
+				struct wireless_dev *wdev,
+				void *data, int len)
 {
 	return 0;
 }
diff --git a/drivers/net/wireless/brcm80211/brcmfmac/wl_cfg80211.c b/drivers/net/wireless/brcm80211/brcmfmac/wl_cfg80211.c
index 277b37ae7126..b7d885020dd7 100644
--- a/drivers/net/wireless/brcm80211/brcmfmac/wl_cfg80211.c
+++ b/drivers/net/wireless/brcm80211/brcmfmac/wl_cfg80211.c
@@ -3152,7 +3152,9 @@ static int brcmf_cfg80211_sched_scan_stop(struct wiphy *wiphy,
 }
 
 #ifdef CONFIG_NL80211_TESTMODE
-static int brcmf_cfg80211_testmode(struct wiphy *wiphy, void *data, int len)
+static int brcmf_cfg80211_testmode(struct wiphy *wiphy,
+				   struct wireless_dev *wdev,
+				   void *data, int len)
 {
 	struct brcmf_cfg80211_info *cfg = wiphy_to_cfg(wiphy);
 	struct net_device *ndev = cfg_to_ndev(cfg);
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index b7495c72061c..9ab7a0690d93 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2081,7 +2081,7 @@ struct cfg80211_update_ft_ies_params {
  * @mgmt_tx_cancel_wait: Cancel the wait time from transmitting a management
  *	frame on another channel
  *
- * @testmode_cmd: run a test mode command
+ * @testmode_cmd: run a test mode command; @wdev may be %NULL
  * @testmode_dump: Implement a test mode dump. The cb->args[2] and up may be
  *	used by the function, but 0 and 1 must not be touched. Additionally,
  *	return error codes other than -ENOBUFS and -ENOENT will terminate the
@@ -2290,7 +2290,8 @@ struct cfg80211_ops {
 	void	(*rfkill_poll)(struct wiphy *wiphy);
 
 #ifdef CONFIG_NL80211_TESTMODE
-	int	(*testmode_cmd)(struct wiphy *wiphy, void *data, int len);
+	int	(*testmode_cmd)(struct wiphy *wiphy, struct wireless_dev *wdev,
+				void *data, int len);
 	int	(*testmode_dump)(struct wiphy *wiphy, struct sk_buff *skb,
 				 struct netlink_callback *cb,
 				 void *data, int len);
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 44449ceb7966..c77916ffe74c 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -2300,7 +2300,9 @@ static void ieee80211_rfkill_poll(struct wiphy *wiphy)
 }
 
 #ifdef CONFIG_NL80211_TESTMODE
-static int ieee80211_testmode_cmd(struct wiphy *wiphy, void *data, int len)
+static int ieee80211_testmode_cmd(struct wiphy *wiphy,
+				  struct wireless_dev *wdev,
+				  void *data, int len)
 {
 	struct ieee80211_local *local = wiphy_priv(wiphy);
 
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index c2a40a2e56bd..334697de5cc0 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -6591,19 +6591,30 @@ static struct genl_multicast_group nl80211_testmode_mcgrp = {
 static int nl80211_testmode_do(struct sk_buff *skb, struct genl_info *info)
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct wireless_dev *wdev =
+		__cfg80211_wdev_from_attrs(genl_info_net(info), info->attrs);
 	int err;
 
+	if (!rdev->ops->testmode_cmd)
+		return -EOPNOTSUPP;
+
+	if (IS_ERR(wdev)) {
+		err = PTR_ERR(wdev);
+		if (err != -EINVAL)
+			return err;
+		wdev = NULL;
+	} else if (wdev->wiphy != &rdev->wiphy) {
+		return -EINVAL;
+	}
+
 	if (!info->attrs[NL80211_ATTR_TESTDATA])
 		return -EINVAL;
 
-	err = -EOPNOTSUPP;
-	if (rdev->ops->testmode_cmd) {
-		rdev->testmode_info = info;
-		err = rdev_testmode_cmd(rdev,
+	rdev->testmode_info = info;
+	err = rdev_testmode_cmd(rdev, wdev,
 				nla_data(info->attrs[NL80211_ATTR_TESTDATA]),
 				nla_len(info->attrs[NL80211_ATTR_TESTDATA]));
-		rdev->testmode_info = NULL;
-	}
+	rdev->testmode_info = NULL;
 
 	return err;
 }
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index de870d4d0bcc..37ce9fdfe934 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -516,11 +516,12 @@ static inline void rdev_rfkill_poll(struct cfg80211_registered_device *rdev)
 
 #ifdef CONFIG_NL80211_TESTMODE
 static inline int rdev_testmode_cmd(struct cfg80211_registered_device *rdev,
+				    struct wireless_dev *wdev,
 				    void *data, int len)
 {
 	int ret;
-	trace_rdev_testmode_cmd(&rdev->wiphy);
-	ret = rdev->ops->testmode_cmd(&rdev->wiphy, data, len);
+	trace_rdev_testmode_cmd(&rdev->wiphy, wdev);
+	ret = rdev->ops->testmode_cmd(&rdev->wiphy, wdev, data, len);
 	trace_rdev_return_int(&rdev->wiphy, ret);
 	return ret;
 }
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index f0ebdcd394ef..ba5f0d6614d5 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -1293,15 +1293,17 @@ TRACE_EVENT(rdev_return_int_int,
 
 #ifdef CONFIG_NL80211_TESTMODE
 TRACE_EVENT(rdev_testmode_cmd,
-	TP_PROTO(struct wiphy *wiphy),
-	TP_ARGS(wiphy),
+	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
+	TP_ARGS(wiphy, wdev),
 	TP_STRUCT__entry(
 		WIPHY_ENTRY
+		WDEV_ENTRY
 	),
 	TP_fast_assign(
 		WIPHY_ASSIGN;
+		WDEV_ASSIGN;
 	),
-	TP_printk(WIPHY_PR_FMT, WIPHY_PR_ARG)
+	TP_printk(WIPHY_PR_FMT WDEV_PR_FMT, WIPHY_PR_ARG, WDEV_PR_ARG)
 );
 
 TRACE_EVENT(rdev_testmode_dump,
-- 
cgit v1.2.3


From 52981cd79461e47fe683febfcbd3d380c72b1c6c Mon Sep 17 00:00:00 2001
From: David Spinadel <david.spinadel@intel.com>
Date: Wed, 31 Jul 2013 18:06:22 +0300
Subject: mac80211: add vif to testmode cmd

Pass the wdev from cfg80211 on to the driver as the vif
if given and it's valid for the driver.

Signed-off-by: David Spinadel <david.spinadel@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/mac80211_hwsim.c     |  1 +
 drivers/net/wireless/ti/wlcore/testmode.c |  3 ++-
 drivers/net/wireless/ti/wlcore/testmode.h |  3 ++-
 include/net/mac80211.h                    |  7 ++++---
 net/mac80211/cfg.c                        | 11 ++++++++++-
 5 files changed, 19 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index 7b2a6229eedb..a0d2aacd5e09 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -1364,6 +1364,7 @@ static const struct nla_policy hwsim_testmode_policy[HWSIM_TM_ATTR_MAX + 1] = {
 static int hwsim_fops_ps_write(void *dat, u64 val);
 
 static int mac80211_hwsim_testmode_cmd(struct ieee80211_hw *hw,
+				       struct ieee80211_vif *vif,
 				       void *data, int len)
 {
 	struct mac80211_hwsim_data *hwsim = hw->priv;
diff --git a/drivers/net/wireless/ti/wlcore/testmode.c b/drivers/net/wireless/ti/wlcore/testmode.c
index f3442762d884..527590f2adfb 100644
--- a/drivers/net/wireless/ti/wlcore/testmode.c
+++ b/drivers/net/wireless/ti/wlcore/testmode.c
@@ -356,7 +356,8 @@ out:
 	return ret;
 }
 
-int wl1271_tm_cmd(struct ieee80211_hw *hw, void *data, int len)
+int wl1271_tm_cmd(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+		  void *data, int len)
 {
 	struct wl1271 *wl = hw->priv;
 	struct nlattr *tb[WL1271_TM_ATTR_MAX + 1];
diff --git a/drivers/net/wireless/ti/wlcore/testmode.h b/drivers/net/wireless/ti/wlcore/testmode.h
index 8071654259ea..61d8434d859a 100644
--- a/drivers/net/wireless/ti/wlcore/testmode.h
+++ b/drivers/net/wireless/ti/wlcore/testmode.h
@@ -26,6 +26,7 @@
 
 #include <net/mac80211.h>
 
-int wl1271_tm_cmd(struct ieee80211_hw *hw, void *data, int len);
+int wl1271_tm_cmd(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+		  void *data, int len);
 
 #endif /* __WL1271_TESTMODE_H__ */
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index b70c00111323..df93c77c97ab 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2516,8 +2516,8 @@ enum ieee80211_roc_type {
  *	in IEEE 802.11-2007 section 17.3.8.6 and modify ACK timeout
  *	accordingly. This callback is not required and may sleep.
  *
- * @testmode_cmd: Implement a cfg80211 test mode command.
- *	The callback can sleep.
+ * @testmode_cmd: Implement a cfg80211 test mode command. The passed @vif may
+ *	be %NULL. The callback can sleep.
  * @testmode_dump: Implement a cfg80211 test mode dump. The callback can sleep.
  *
  * @flush: Flush all pending frames from the hardware queue, making sure
@@ -2778,7 +2778,8 @@ struct ieee80211_ops {
 	void (*rfkill_poll)(struct ieee80211_hw *hw);
 	void (*set_coverage_class)(struct ieee80211_hw *hw, u8 coverage_class);
 #ifdef CONFIG_NL80211_TESTMODE
-	int (*testmode_cmd)(struct ieee80211_hw *hw, void *data, int len);
+	int (*testmode_cmd)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+			    void *data, int len);
 	int (*testmode_dump)(struct ieee80211_hw *hw, struct sk_buff *skb,
 			     struct netlink_callback *cb,
 			     void *data, int len);
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index c77916ffe74c..7aa38ce0b524 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -2305,11 +2305,20 @@ static int ieee80211_testmode_cmd(struct wiphy *wiphy,
 				  void *data, int len)
 {
 	struct ieee80211_local *local = wiphy_priv(wiphy);
+	struct ieee80211_vif *vif = NULL;
 
 	if (!local->ops->testmode_cmd)
 		return -EOPNOTSUPP;
 
-	return local->ops->testmode_cmd(&local->hw, data, len);
+	if (wdev) {
+		struct ieee80211_sub_if_data *sdata;
+
+		sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+		if (sdata->flags & IEEE80211_SDATA_IN_DRIVER)
+			vif = &sdata->vif;
+	}
+
+	return local->ops->testmode_cmd(&local->hw, vif, data, len);
 }
 
 static int ieee80211_testmode_dump(struct wiphy *wiphy,
-- 
cgit v1.2.3


From d8eb741eb374804e864751c7f3919ae50321d831 Mon Sep 17 00:00:00 2001
From: Antonio Quartulli <antonio@open-mesh.com>
Date: Fri, 9 Aug 2013 18:58:32 +0200
Subject: mac80211: ibss - do not scan if not needed when creating an IBSS

In some cases mac80211 will scan before creating an IBSS
even if bssid and frequency have been forced by the user.
This is not needed and leads only to a delay in the IBSS
establishment phase.

Immediately create the cell if both bssid and frequency
(and fixed_freq is set) have been specified.

Signed-off-by: Antonio Quartulli <antonio@open-mesh.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/ibss.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index e08387cdc8fd..79e294e9b5cc 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -891,6 +891,17 @@ static void ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata)
 		return;
 	}
 
+	/* if a fixed bssid and a fixed freq have been provided create the IBSS
+	 * directly and do not waste time scanning
+	 */
+	if (ifibss->fixed_bssid && ifibss->fixed_channel) {
+		sdata_info(sdata, "Created IBSS using preconfigured BSSID %pM\n",
+			   bssid);
+		ieee80211_sta_create_ibss(sdata);
+		return;
+	}
+
+
 	ibss_dbg(sdata, "sta_find_ibss: did not try to join ibss\n");
 
 	/* Selected IBSS not found in current scan results - try to scan */
-- 
cgit v1.2.3


From 0ef71ee1a5b92c038abefd8991d5368e6031d7de Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 7 Aug 2013 19:12:34 +0200
Subject: netfilter: ctnetlink: refactor ctnetlink_create_expect

This patch refactors ctnetlink_create_expect by spliting it in two
chunks. As a result, we have a new function ctnetlink_alloc_expect
to allocate and to setup the expectation from ctnetlink.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_netlink.c | 156 +++++++++++++++++++----------------
 1 file changed, 87 insertions(+), 69 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index e842c0ded79d..9aaa68bbbcdb 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -2735,76 +2735,26 @@ ctnetlink_parse_expect_nat(const struct nlattr *attr,
 #endif
 }
 
-static int
-ctnetlink_create_expect(struct net *net, u16 zone,
-			const struct nlattr * const cda[],
-			u_int8_t u3,
-			u32 portid, int report)
+static struct nf_conntrack_expect *
+ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct,
+		       struct nf_conntrack_helper *helper,
+		       struct nf_conntrack_tuple *tuple,
+		       struct nf_conntrack_tuple *mask)
 {
-	struct nf_conntrack_tuple tuple, mask, master_tuple;
-	struct nf_conntrack_tuple_hash *h = NULL;
+	u_int32_t class = 0;
 	struct nf_conntrack_expect *exp;
-	struct nf_conn *ct;
 	struct nf_conn_help *help;
-	struct nf_conntrack_helper *helper = NULL;
-	u_int32_t class = 0;
-	int err = 0;
-
-	/* caller guarantees that those three CTA_EXPECT_* exist */
-	err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
-	if (err < 0)
-		return err;
-	err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, u3);
-	if (err < 0)
-		return err;
-	err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, u3);
-	if (err < 0)
-		return err;
-
-	/* Look for master conntrack of this expectation */
-	h = nf_conntrack_find_get(net, zone, &master_tuple);
-	if (!h)
-		return -ENOENT;
-	ct = nf_ct_tuplehash_to_ctrack(h);
-
-	/* Look for helper of this expectation */
-	if (cda[CTA_EXPECT_HELP_NAME]) {
-		const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]);
-
-		helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),
-						    nf_ct_protonum(ct));
-		if (helper == NULL) {
-#ifdef CONFIG_MODULES
-			if (request_module("nfct-helper-%s", helpname) < 0) {
-				err = -EOPNOTSUPP;
-				goto out;
-			}
-
-			helper = __nf_conntrack_helper_find(helpname,
-							    nf_ct_l3num(ct),
-							    nf_ct_protonum(ct));
-			if (helper) {
-				err = -EAGAIN;
-				goto out;
-			}
-#endif
-			err = -EOPNOTSUPP;
-			goto out;
-		}
-	}
+	int err;
 
 	if (cda[CTA_EXPECT_CLASS] && helper) {
 		class = ntohl(nla_get_be32(cda[CTA_EXPECT_CLASS]));
-		if (class > helper->expect_class_max) {
-			err = -EINVAL;
-			goto out;
-		}
+		if (class > helper->expect_class_max)
+			return ERR_PTR(-EINVAL);
 	}
 	exp = nf_ct_expect_alloc(ct);
-	if (!exp) {
-		err = -ENOMEM;
-		goto out;
-	}
+	if (!exp)
+		return ERR_PTR(-ENOMEM);
+
 	help = nfct_help(ct);
 	if (!help) {
 		if (!cda[CTA_EXPECT_TIMEOUT]) {
@@ -2842,21 +2792,89 @@ ctnetlink_create_expect(struct net *net, u16 zone,
 	exp->class = class;
 	exp->master = ct;
 	exp->helper = helper;
-	memcpy(&exp->tuple, &tuple, sizeof(struct nf_conntrack_tuple));
-	memcpy(&exp->mask.src.u3, &mask.src.u3, sizeof(exp->mask.src.u3));
-	exp->mask.src.u.all = mask.src.u.all;
+	exp->tuple = *tuple;
+	exp->mask.src.u3 = mask->src.u3;
+	exp->mask.src.u.all = mask->src.u.all;
 
 	if (cda[CTA_EXPECT_NAT]) {
 		err = ctnetlink_parse_expect_nat(cda[CTA_EXPECT_NAT],
-						 exp, u3);
+						 exp, nf_ct_l3num(ct));
 		if (err < 0)
 			goto err_out;
 	}
-	err = nf_ct_expect_related_report(exp, portid, report);
+	return exp;
 err_out:
 	nf_ct_expect_put(exp);
-out:
-	nf_ct_put(nf_ct_tuplehash_to_ctrack(h));
+	return ERR_PTR(err);
+}
+
+static int
+ctnetlink_create_expect(struct net *net, u16 zone,
+			const struct nlattr * const cda[],
+			u_int8_t u3, u32 portid, int report)
+{
+	struct nf_conntrack_tuple tuple, mask, master_tuple;
+	struct nf_conntrack_tuple_hash *h = NULL;
+	struct nf_conntrack_helper *helper = NULL;
+	struct nf_conntrack_expect *exp;
+	struct nf_conn *ct;
+	int err;
+
+	/* caller guarantees that those three CTA_EXPECT_* exist */
+	err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
+	if (err < 0)
+		return err;
+	err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, u3);
+	if (err < 0)
+		return err;
+	err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, u3);
+	if (err < 0)
+		return err;
+
+	/* Look for master conntrack of this expectation */
+	h = nf_conntrack_find_get(net, zone, &master_tuple);
+	if (!h)
+		return -ENOENT;
+	ct = nf_ct_tuplehash_to_ctrack(h);
+
+	if (cda[CTA_EXPECT_HELP_NAME]) {
+		const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]);
+
+		helper = __nf_conntrack_helper_find(helpname, u3,
+						    nf_ct_protonum(ct));
+		if (helper == NULL) {
+#ifdef CONFIG_MODULES
+			if (request_module("nfct-helper-%s", helpname) < 0) {
+				err = -EOPNOTSUPP;
+				goto err_ct;
+			}
+			helper = __nf_conntrack_helper_find(helpname, u3,
+							    nf_ct_protonum(ct));
+			if (helper) {
+				err = -EAGAIN;
+				goto err_ct;
+			}
+#endif
+			err = -EOPNOTSUPP;
+			goto err_ct;
+		}
+	}
+
+	exp = ctnetlink_alloc_expect(cda, ct, helper, &tuple, &mask);
+	if (IS_ERR(exp)) {
+		err = PTR_ERR(exp);
+		goto err_ct;
+	}
+
+	err = nf_ct_expect_related_report(exp, portid, report);
+	if (err < 0)
+		goto err_exp;
+
+	return 0;
+err_exp:
+	nf_ct_expect_put(exp);
+err_ct:
+	nf_ct_put(ct);
 	return err;
 }
 
-- 
cgit v1.2.3


From bd0779370588386e4a67ba5d0b176cfded8e6a53 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 7 Aug 2013 18:13:20 +0200
Subject: netfilter: nfnetlink_queue: allow to attach expectations to
 conntracks

This patch adds the capability to attach expectations via nfnetlink_queue.
This is required by conntrack helpers that trigger expectations based on
the first packet seen like the TFTP and the DHCPv6 user-space helpers.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h                      |  2 +
 include/net/netfilter/nfnetlink_queue.h        |  8 +++
 include/uapi/linux/netfilter/nfnetlink_queue.h |  1 +
 net/netfilter/nf_conntrack_netlink.c           | 95 ++++++++++++++++++++++----
 net/netfilter/nfnetlink_queue_core.c           |  9 ++-
 net/netfilter/nfnetlink_queue_ct.c             | 15 ++++
 6 files changed, 114 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 655d5d198d49..e2cf786be22f 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -325,6 +325,8 @@ struct nfq_ct_hook {
 	size_t (*build_size)(const struct nf_conn *ct);
 	int (*build)(struct sk_buff *skb, struct nf_conn *ct);
 	int (*parse)(const struct nlattr *attr, struct nf_conn *ct);
+	int (*attach_expect)(const struct nlattr *attr, struct nf_conn *ct,
+			     u32 portid, u32 report);
 };
 extern struct nfq_ct_hook __rcu *nfq_ct_hook;
 
diff --git a/include/net/netfilter/nfnetlink_queue.h b/include/net/netfilter/nfnetlink_queue.h
index 86267a529514..aff88ba91391 100644
--- a/include/net/netfilter/nfnetlink_queue.h
+++ b/include/net/netfilter/nfnetlink_queue.h
@@ -15,6 +15,8 @@ int nfqnl_ct_put(struct sk_buff *skb, struct nf_conn *ct,
 		 enum ip_conntrack_info ctinfo);
 void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
 			 enum ip_conntrack_info ctinfo, int diff);
+int nfqnl_attach_expect(struct nf_conn *ct, const struct nlattr *attr,
+			u32 portid, u32 report);
 #else
 inline struct nf_conn *
 nfqnl_ct_get(struct sk_buff *entskb, size_t *size, enum ip_conntrack_info *ctinfo)
@@ -39,5 +41,11 @@ inline void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
 				enum ip_conntrack_info ctinfo, int diff)
 {
 }
+
+inline int nfqnl_attach_expect(struct nf_conn *ct, const struct nlattr *attr,
+			       u32 portid, u32 report)
+{
+	return 0;
+}
 #endif /* NF_CONNTRACK */
 #endif
diff --git a/include/uapi/linux/netfilter/nfnetlink_queue.h b/include/uapi/linux/netfilter/nfnetlink_queue.h
index 3a9b92147339..0132bad79de7 100644
--- a/include/uapi/linux/netfilter/nfnetlink_queue.h
+++ b/include/uapi/linux/netfilter/nfnetlink_queue.h
@@ -46,6 +46,7 @@ enum nfqnl_attr_type {
 	NFQA_CT_INFO,			/* enum ip_conntrack_info */
 	NFQA_CAP_LEN,			/* __u32 length of captured packet */
 	NFQA_SKB_INFO,			/* __u32 skb meta information */
+	NFQA_EXP,			/* nf_conntrack_netlink.h */
 
 	__NFQA_MAX
 };
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 9aaa68bbbcdb..fa61fea63234 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1987,6 +1987,27 @@ out:
 	return err == -EAGAIN ? -ENOBUFS : err;
 }
 
+static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = {
+	[CTA_EXPECT_MASTER]	= { .type = NLA_NESTED },
+	[CTA_EXPECT_TUPLE]	= { .type = NLA_NESTED },
+	[CTA_EXPECT_MASK]	= { .type = NLA_NESTED },
+	[CTA_EXPECT_TIMEOUT]	= { .type = NLA_U32 },
+	[CTA_EXPECT_ID]		= { .type = NLA_U32 },
+	[CTA_EXPECT_HELP_NAME]	= { .type = NLA_NUL_STRING,
+				    .len = NF_CT_HELPER_NAME_LEN - 1 },
+	[CTA_EXPECT_ZONE]	= { .type = NLA_U16 },
+	[CTA_EXPECT_FLAGS]	= { .type = NLA_U32 },
+	[CTA_EXPECT_CLASS]	= { .type = NLA_U32 },
+	[CTA_EXPECT_NAT]	= { .type = NLA_NESTED },
+	[CTA_EXPECT_FN]		= { .type = NLA_NUL_STRING },
+};
+
+static struct nf_conntrack_expect *
+ctnetlink_alloc_expect(const struct nlattr *const cda[], struct nf_conn *ct,
+		       struct nf_conntrack_helper *helper,
+		       struct nf_conntrack_tuple *tuple,
+		       struct nf_conntrack_tuple *mask);
+
 #ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT
 static size_t
 ctnetlink_nfqueue_build_size(const struct nf_conn *ct)
@@ -2127,10 +2148,69 @@ ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct)
 	return ret;
 }
 
+static int ctnetlink_nfqueue_exp_parse(const struct nlattr * const *cda,
+				       const struct nf_conn *ct,
+				       struct nf_conntrack_tuple *tuple,
+				       struct nf_conntrack_tuple *mask)
+{
+	int err;
+
+	err = ctnetlink_parse_tuple(cda, tuple, CTA_EXPECT_TUPLE,
+				    nf_ct_l3num(ct));
+	if (err < 0)
+		return err;
+
+	return ctnetlink_parse_tuple(cda, mask, CTA_EXPECT_MASK,
+				     nf_ct_l3num(ct));
+}
+
+static int
+ctnetlink_nfqueue_attach_expect(const struct nlattr *attr, struct nf_conn *ct,
+				u32 portid, u32 report)
+{
+	struct nlattr *cda[CTA_EXPECT_MAX+1];
+	struct nf_conntrack_tuple tuple, mask;
+	struct nf_conntrack_helper *helper;
+	struct nf_conntrack_expect *exp;
+	int err;
+
+	err = nla_parse_nested(cda, CTA_EXPECT_MAX, attr, exp_nla_policy);
+	if (err < 0)
+		return err;
+
+	err = ctnetlink_nfqueue_exp_parse((const struct nlattr * const *)cda,
+					  ct, &tuple, &mask);
+	if (err < 0)
+		return err;
+
+	if (cda[CTA_EXPECT_HELP_NAME]) {
+		const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]);
+
+		helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),
+						    nf_ct_protonum(ct));
+		if (helper == NULL)
+			return -EOPNOTSUPP;
+	}
+
+	exp = ctnetlink_alloc_expect((const struct nlattr * const *)cda, ct,
+				     helper, &tuple, &mask);
+	if (IS_ERR(exp))
+		return PTR_ERR(exp);
+
+	err = nf_ct_expect_related_report(exp, portid, report);
+	if (err < 0) {
+		nf_ct_expect_put(exp);
+		return err;
+	}
+
+	return 0;
+}
+
 static struct nfq_ct_hook ctnetlink_nfqueue_hook = {
 	.build_size	= ctnetlink_nfqueue_build_size,
 	.build		= ctnetlink_nfqueue_build,
 	.parse		= ctnetlink_nfqueue_parse,
+	.attach_expect	= ctnetlink_nfqueue_attach_expect,
 };
 #endif /* CONFIG_NETFILTER_NETLINK_QUEUE_CT */
 
@@ -2498,21 +2578,6 @@ static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb,
 	return err;
 }
 
-static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = {
-	[CTA_EXPECT_MASTER]	= { .type = NLA_NESTED },
-	[CTA_EXPECT_TUPLE]	= { .type = NLA_NESTED },
-	[CTA_EXPECT_MASK]	= { .type = NLA_NESTED },
-	[CTA_EXPECT_TIMEOUT]	= { .type = NLA_U32 },
-	[CTA_EXPECT_ID]		= { .type = NLA_U32 },
-	[CTA_EXPECT_HELP_NAME]	= { .type = NLA_NUL_STRING,
-				    .len = NF_CT_HELPER_NAME_LEN - 1 },
-	[CTA_EXPECT_ZONE]	= { .type = NLA_U16 },
-	[CTA_EXPECT_FLAGS]	= { .type = NLA_U32 },
-	[CTA_EXPECT_CLASS]	= { .type = NLA_U32 },
-	[CTA_EXPECT_NAT]	= { .type = NLA_NESTED },
-	[CTA_EXPECT_FN]		= { .type = NLA_NUL_STRING },
-};
-
 static int
 ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
 		     const struct nlmsghdr *nlh,
diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c
index ec9de12aa488..e8c9f3bb779c 100644
--- a/net/netfilter/nfnetlink_queue_core.c
+++ b/net/netfilter/nfnetlink_queue_core.c
@@ -859,6 +859,7 @@ static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = {
 	[NFQA_MARK]		= { .type = NLA_U32 },
 	[NFQA_PAYLOAD]		= { .type = NLA_UNSPEC },
 	[NFQA_CT]		= { .type = NLA_UNSPEC },
+	[NFQA_EXP]		= { .type = NLA_UNSPEC },
 };
 
 static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = {
@@ -987,8 +988,14 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
 	if (entry == NULL)
 		return -ENOENT;
 
-	if (nfqa[NFQA_CT])
+	if (nfqa[NFQA_CT]) {
 		ct = nfqnl_ct_parse(entry->skb, nfqa[NFQA_CT], &ctinfo);
+		if (ct && nfqa[NFQA_EXP]) {
+			nfqnl_attach_expect(ct, nfqa[NFQA_EXP],
+					    NETLINK_CB(skb).portid,
+					    nlmsg_report(nlh));
+		}
+	}
 
 	if (nfqa[NFQA_PAYLOAD]) {
 		u16 payload_len = nla_len(nfqa[NFQA_PAYLOAD]);
diff --git a/net/netfilter/nfnetlink_queue_ct.c b/net/netfilter/nfnetlink_queue_ct.c
index ab61d66bc0b9..be893039966d 100644
--- a/net/netfilter/nfnetlink_queue_ct.c
+++ b/net/netfilter/nfnetlink_queue_ct.c
@@ -96,3 +96,18 @@ void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
 	if ((ct->status & IPS_NAT_MASK) && diff)
 		nfq_nat_ct->seq_adjust(skb, ct, ctinfo, diff);
 }
+
+int nfqnl_attach_expect(struct nf_conn *ct, const struct nlattr *attr,
+			u32 portid, u32 report)
+{
+	struct nfq_ct_hook *nfq_ct;
+
+	if (nf_ct_is_untracked(ct))
+		return 0;
+
+	nfq_ct = rcu_dereference(nfq_ct_hook);
+	if (nfq_ct == NULL)
+		return -EOPNOTSUPP;
+
+	return nfq_ct->attach_expect(attr, ct, portid, report);
+}
-- 
cgit v1.2.3


From 072017b41e49e2a8e8a4e0258837a614bb5daa8d Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vyasevich@gmail.com>
Date: Fri, 9 Aug 2013 22:05:36 -0400
Subject: net: sctp: Add rudimentary infrastructure to account for control
 chunks

This patch adds a base infrastructure that allows SCTP to do
memory accounting for control chunks.  Real accounting code will
follow.

This patch alos fixes the following triggered bug ...

[  553.109742] kernel BUG at include/linux/skbuff.h:1813!
[  553.109766] invalid opcode: 0000 [#1] SMP
[  553.109789] Modules linked in: sctp libcrc32c rfcomm [...]
[  553.110259]  uinput i915 i2c_algo_bit drm_kms_helper e1000e drm ptp
pps_core i2c_core wmi video sunrpc
[  553.110320] CPU: 0 PID: 1636 Comm: lt-test_1_to_1_ Not tainted
3.11.0-rc3+ #2
[  553.110350] Hardware name: LENOVO 74597D6/74597D6, BIOS 6DET60WW
(3.10 ) 09/17/2009
[  553.110381] task: ffff88020a01dd40 ti: ffff880204ed0000 task.ti:
ffff880204ed0000
[  553.110411] RIP: 0010:[<ffffffffa0698017>]  [<ffffffffa0698017>]
skb_orphan.part.9+0x4/0x6 [sctp]
[  553.110459] RSP: 0018:ffff880204ed1bb8  EFLAGS: 00010286
[  553.110483] RAX: ffff8802086f5a40 RBX: ffff880204303300 RCX:
0000000000000000
[  553.110487] RDX: ffff880204303c28 RSI: ffff8802086f5a40 RDI:
ffff880202158000
[  553.110487] RBP: ffff880204ed1bb8 R08: 0000000000000000 R09:
0000000000000000
[  553.110487] R10: ffff88022f2d9a04 R11: ffff880233001600 R12:
0000000000000000
[  553.110487] R13: ffff880204303c00 R14: ffff8802293d0000 R15:
ffff880202158000
[  553.110487] FS:  00007f31b31fe740(0000) GS:ffff88023bc00000(0000)
knlGS:0000000000000000
[  553.110487] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[  553.110487] CR2: 000000379980e3e0 CR3: 000000020d225000 CR4:
00000000000407f0
[  553.110487] Stack:
[  553.110487]  ffff880204ed1ca8 ffffffffa068d7fc 0000000000000000
0000000000000000
[  553.110487]  0000000000000000 ffff8802293d0000 ffff880202158000
ffffffff81cb7900
[  553.110487]  0000000000000000 0000400000001c68 ffff8802086f5a40
000000000000000f
[  553.110487] Call Trace:
[  553.110487]  [<ffffffffa068d7fc>] sctp_sendmsg+0x6bc/0xc80 [sctp]
[  553.110487]  [<ffffffff8128f185>] ? sock_has_perm+0x75/0x90
[  553.110487]  [<ffffffff815a3593>] inet_sendmsg+0x63/0xb0
[  553.110487]  [<ffffffff8128f2b3>] ? selinux_socket_sendmsg+0x23/0x30
[  553.110487]  [<ffffffff8151c5d6>] sock_sendmsg+0xa6/0xd0
[  553.110487]  [<ffffffff81637b05>] ? _raw_spin_unlock_bh+0x15/0x20
[  553.110487]  [<ffffffff8151cd38>] SYSC_sendto+0x128/0x180
[  553.110487]  [<ffffffff8151ce6b>] ? SYSC_connect+0xdb/0x100
[  553.110487]  [<ffffffffa0690031>] ? sctp_inet_listen+0x71/0x1f0
[sctp]
[  553.110487]  [<ffffffff8151d35e>] SyS_sendto+0xe/0x10
[  553.110487]  [<ffffffff81640202>] system_call_fastpath+0x16/0x1b
[  553.110487] Code: e0 48 c7 c7 00 22 6a a0 e8 67 a3 f0 e0 48 c7 [...]
[  553.110487] RIP  [<ffffffffa0698017>] skb_orphan.part.9+0x4/0x6
[sctp]
[  553.110487]  RSP <ffff880204ed1bb8>
[  553.121578] ---[ end trace 46c20c5903ef5be2 ]---

The approach taken here is to split data and control chunks
creation a  bit.  Data chunks already have memory accounting
so noting needs to happen.  For control chunks, add stubs handlers.

Signed-off-by: Vlad Yasevich <vyasevich@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sm_make_chunk.c | 99 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 69 insertions(+), 30 deletions(-)

(limited to 'net')

diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 9c131338ccfa..01e97836ca6c 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -62,8 +62,12 @@
 #include <net/sctp/sctp.h>
 #include <net/sctp/sm.h>
 
-static struct sctp_chunk *sctp_make_chunk(const struct sctp_association *asoc,
-					  __u8 type, __u8 flags, int paylen);
+static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc,
+					    __u8 type, __u8 flags, int paylen);
+static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc,
+					 __u8 flags, int paylen);
+static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc,
+					   __u8 type, __u8 flags, int paylen);
 static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep,
 					const struct sctp_association *asoc,
 					const struct sctp_chunk *init_chunk,
@@ -76,6 +80,28 @@ static int sctp_process_param(struct sctp_association *asoc,
 static void *sctp_addto_param(struct sctp_chunk *chunk, int len,
 			      const void *data);
 
+/* Control chunk destructor */
+static void sctp_control_release_owner(struct sk_buff *skb)
+{
+	/*TODO: do memory release */
+}
+
+static void sctp_control_set_owner_w(struct sctp_chunk *chunk)
+{
+	struct sctp_association *asoc = chunk->asoc;
+	struct sk_buff *skb = chunk->skb;
+
+	/* TODO: properly account for control chunks.
+	 * To do it right we'll need:
+	 *  1) endpoint if association isn't known.
+	 *  2) proper memory accounting.
+	 *
+	 *  For now don't do anything for now.
+	 */
+	skb->sk = asoc ? asoc->base.sk : NULL;
+	skb->destructor = sctp_control_release_owner;
+}
+
 /* What was the inbound interface for this chunk? */
 int sctp_chunk_iif(const struct sctp_chunk *chunk)
 {
@@ -290,7 +316,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
 	 * PLEASE DO NOT FIXME [This version does not support Host Name.]
 	 */
 
-	retval = sctp_make_chunk(asoc, SCTP_CID_INIT, 0, chunksize);
+	retval = sctp_make_control(asoc, SCTP_CID_INIT, 0, chunksize);
 	if (!retval)
 		goto nodata;
 
@@ -437,7 +463,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
 					num_ext);
 
 	/* Now allocate and fill out the chunk.  */
-	retval = sctp_make_chunk(asoc, SCTP_CID_INIT_ACK, 0, chunksize);
+	retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize);
 	if (!retval)
 		goto nomem_chunk;
 
@@ -542,7 +568,7 @@ struct sctp_chunk *sctp_make_cookie_echo(const struct sctp_association *asoc,
 	cookie_len = asoc->peer.cookie_len;
 
 	/* Build a cookie echo chunk.  */
-	retval = sctp_make_chunk(asoc, SCTP_CID_COOKIE_ECHO, 0, cookie_len);
+	retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ECHO, 0, cookie_len);
 	if (!retval)
 		goto nodata;
 	retval->subh.cookie_hdr =
@@ -587,7 +613,7 @@ struct sctp_chunk *sctp_make_cookie_ack(const struct sctp_association *asoc,
 {
 	struct sctp_chunk *retval;
 
-	retval = sctp_make_chunk(asoc, SCTP_CID_COOKIE_ACK, 0, 0);
+	retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ACK, 0, 0);
 
 	/* RFC 2960 6.4 Multi-homed SCTP Endpoints
 	 *
@@ -635,8 +661,8 @@ struct sctp_chunk *sctp_make_cwr(const struct sctp_association *asoc,
 	sctp_cwrhdr_t cwr;
 
 	cwr.lowest_tsn = htonl(lowest_tsn);
-	retval = sctp_make_chunk(asoc, SCTP_CID_ECN_CWR, 0,
-				 sizeof(sctp_cwrhdr_t));
+	retval = sctp_make_control(asoc, SCTP_CID_ECN_CWR, 0,
+				   sizeof(sctp_cwrhdr_t));
 
 	if (!retval)
 		goto nodata;
@@ -669,8 +695,8 @@ struct sctp_chunk *sctp_make_ecne(const struct sctp_association *asoc,
 	sctp_ecnehdr_t ecne;
 
 	ecne.lowest_tsn = htonl(lowest_tsn);
-	retval = sctp_make_chunk(asoc, SCTP_CID_ECN_ECNE, 0,
-				 sizeof(sctp_ecnehdr_t));
+	retval = sctp_make_control(asoc, SCTP_CID_ECN_ECNE, 0,
+				   sizeof(sctp_ecnehdr_t));
 	if (!retval)
 		goto nodata;
 	retval->subh.ecne_hdr =
@@ -706,7 +732,7 @@ struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc,
 		dp.ssn = htons(ssn);
 
 	chunk_len = sizeof(dp) + data_len;
-	retval = sctp_make_chunk(asoc, SCTP_CID_DATA, flags, chunk_len);
+	retval = sctp_make_data(asoc, flags, chunk_len);
 	if (!retval)
 		goto nodata;
 
@@ -753,7 +779,7 @@ struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc)
 		+ sizeof(__u32) * num_dup_tsns;
 
 	/* Create the chunk.  */
-	retval = sctp_make_chunk(asoc, SCTP_CID_SACK, 0, len);
+	retval = sctp_make_control(asoc, SCTP_CID_SACK, 0, len);
 	if (!retval)
 		goto nodata;
 
@@ -832,8 +858,8 @@ struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc,
 	ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map);
 	shut.cum_tsn_ack = htonl(ctsn);
 
-	retval = sctp_make_chunk(asoc, SCTP_CID_SHUTDOWN, 0,
-				 sizeof(sctp_shutdownhdr_t));
+	retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN, 0,
+				   sizeof(sctp_shutdownhdr_t));
 	if (!retval)
 		goto nodata;
 
@@ -851,7 +877,7 @@ struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc,
 {
 	struct sctp_chunk *retval;
 
-	retval = sctp_make_chunk(asoc, SCTP_CID_SHUTDOWN_ACK, 0, 0);
+	retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_ACK, 0, 0);
 
 	/* RFC 2960 6.4 Multi-homed SCTP Endpoints
 	 *
@@ -880,7 +906,7 @@ struct sctp_chunk *sctp_make_shutdown_complete(
 	 */
 	flags |= asoc ? 0 : SCTP_CHUNK_FLAG_T;
 
-	retval = sctp_make_chunk(asoc, SCTP_CID_SHUTDOWN_COMPLETE, flags, 0);
+	retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_COMPLETE, flags, 0);
 
 	/* RFC 2960 6.4 Multi-homed SCTP Endpoints
 	 *
@@ -919,7 +945,7 @@ struct sctp_chunk *sctp_make_abort(const struct sctp_association *asoc,
 			flags = SCTP_CHUNK_FLAG_T;
 	}
 
-	retval = sctp_make_chunk(asoc, SCTP_CID_ABORT, flags, hint);
+	retval = sctp_make_control(asoc, SCTP_CID_ABORT, flags, hint);
 
 	/* RFC 2960 6.4 Multi-homed SCTP Endpoints
 	 *
@@ -1111,7 +1137,7 @@ struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc,
 	struct sctp_chunk *retval;
 	sctp_sender_hb_info_t hbinfo;
 
-	retval = sctp_make_chunk(asoc, SCTP_CID_HEARTBEAT, 0, sizeof(hbinfo));
+	retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT, 0, sizeof(hbinfo));
 
 	if (!retval)
 		goto nodata;
@@ -1139,7 +1165,7 @@ struct sctp_chunk *sctp_make_heartbeat_ack(const struct sctp_association *asoc,
 {
 	struct sctp_chunk *retval;
 
-	retval  = sctp_make_chunk(asoc, SCTP_CID_HEARTBEAT_ACK, 0, paylen);
+	retval  = sctp_make_control(asoc, SCTP_CID_HEARTBEAT_ACK, 0, paylen);
 	if (!retval)
 		goto nodata;
 
@@ -1171,8 +1197,8 @@ static struct sctp_chunk *sctp_make_op_error_space(
 {
 	struct sctp_chunk *retval;
 
-	retval = sctp_make_chunk(asoc, SCTP_CID_ERROR, 0,
-				 sizeof(sctp_errhdr_t) + size);
+	retval = sctp_make_control(asoc, SCTP_CID_ERROR, 0,
+				   sizeof(sctp_errhdr_t) + size);
 	if (!retval)
 		goto nodata;
 
@@ -1242,7 +1268,7 @@ struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc)
 	if (unlikely(!hmac_desc))
 		return NULL;
 
-	retval = sctp_make_chunk(asoc, SCTP_CID_AUTH, 0,
+	retval = sctp_make_control(asoc, SCTP_CID_AUTH, 0,
 			hmac_desc->hmac_len + sizeof(sctp_authhdr_t));
 	if (!retval)
 		return NULL;
@@ -1345,8 +1371,8 @@ const union sctp_addr *sctp_source(const struct sctp_chunk *chunk)
 /* Create a new chunk, setting the type and flags headers from the
  * arguments, reserving enough space for a 'paylen' byte payload.
  */
-static struct sctp_chunk *sctp_make_chunk(const struct sctp_association *asoc,
-					  __u8 type, __u8 flags, int paylen)
+static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc,
+					    __u8 type, __u8 flags, int paylen)
 {
 	struct sctp_chunk *retval;
 	sctp_chunkhdr_t *chunk_hdr;
@@ -1379,14 +1405,27 @@ static struct sctp_chunk *sctp_make_chunk(const struct sctp_association *asoc,
 	if (sctp_auth_send_cid(type, asoc))
 		retval->auth = 1;
 
-	/* Set the skb to the belonging sock for accounting.  */
-	skb->sk = sk;
-
 	return retval;
 nodata:
 	return NULL;
 }
 
+static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc,
+					 __u8 flags, int paylen)
+{
+	return _sctp_make_chunk(asoc, SCTP_CID_DATA, flags, paylen);
+}
+
+static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc,
+					    __u8 type, __u8 flags, int paylen)
+{
+	struct sctp_chunk *chunk = _sctp_make_chunk(asoc, type, flags, paylen);
+
+	if (chunk)
+		sctp_control_set_owner_w(chunk);
+
+	return chunk;
+}
 
 /* Release the memory occupied by a chunk.  */
 static void sctp_chunk_destroy(struct sctp_chunk *chunk)
@@ -2727,7 +2766,7 @@ static struct sctp_chunk *sctp_make_asconf(struct sctp_association *asoc,
 	length += addrlen;
 
 	/* Create the chunk.  */
-	retval = sctp_make_chunk(asoc, SCTP_CID_ASCONF, 0, length);
+	retval = sctp_make_control(asoc, SCTP_CID_ASCONF, 0, length);
 	if (!retval)
 		return NULL;
 
@@ -2911,7 +2950,7 @@ static struct sctp_chunk *sctp_make_asconf_ack(const struct sctp_association *as
 	int			length = sizeof(asconf) + vparam_len;
 
 	/* Create the chunk.  */
-	retval = sctp_make_chunk(asoc, SCTP_CID_ASCONF_ACK, 0, length);
+	retval = sctp_make_control(asoc, SCTP_CID_ASCONF_ACK, 0, length);
 	if (!retval)
 		return NULL;
 
@@ -3442,7 +3481,7 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc,
 
 	hint = (nstreams + 1) * sizeof(__u32);
 
-	retval = sctp_make_chunk(asoc, SCTP_CID_FWD_TSN, 0, hint);
+	retval = sctp_make_control(asoc, SCTP_CID_FWD_TSN, 0, hint);
 
 	if (!retval)
 		return NULL;
-- 
cgit v1.2.3


From 369f4d503ac12363f5d11b91f849377875d57598 Mon Sep 17 00:00:00 2001
From: Samuel Ortiz <sameo@linux.intel.com>
Date: Wed, 24 Jul 2013 14:49:22 +0200
Subject: NFC: Fix SE discovery failure warning condition

This is a typo coming from the initial implementation. se_discover fails
when it returns something different than zero and we should only display
a warning in that case.

Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 net/nfc/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/nfc/core.c b/net/nfc/core.c
index 1d074dd1650f..aad7f8f59784 100644
--- a/net/nfc/core.c
+++ b/net/nfc/core.c
@@ -129,7 +129,7 @@ int nfc_dev_up(struct nfc_dev *dev)
 	/* We have to enable the device before discovering SEs */
 	if (dev->ops->discover_se) {
 		rc = dev->ops->discover_se(dev);
-		if (!rc)
+		if (rc)
 			pr_warn("SE discovery failed\n");
 	}
 
-- 
cgit v1.2.3


From ac22ac466a659f1b2e02a2e2ee23fc5c42da2c95 Mon Sep 17 00:00:00 2001
From: Samuel Ortiz <sameo@linux.intel.com>
Date: Wed, 24 Jul 2013 18:10:50 +0200
Subject: NFC: Add a GET_SE netlink API

In order to fetch the discovered secure elements from an NFC controller,
we need to send a netlink command that will dump the list of available
SEs from NFC.

Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 include/uapi/linux/nfc.h |  2 ++
 net/nfc/netlink.c        | 91 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 93 insertions(+)

(limited to 'net')

diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h
index 539d60494c04..029921b067fd 100644
--- a/include/uapi/linux/nfc.h
+++ b/include/uapi/linux/nfc.h
@@ -84,6 +84,7 @@
  * @NFC_EVENT_SE_TRANSACTION: This event is sent when an application running on
  *	a specific SE notifies us about the end of a transaction. The parameter
  *	for this event is the application ID (AID).
+ * @NFC_CMD_GET_SE: Dump all discovered secure elements from an NFC controller.
  */
 enum nfc_commands {
 	NFC_CMD_UNSPEC,
@@ -112,6 +113,7 @@ enum nfc_commands {
 	NFC_EVENT_SE_REMOVED,
 	NFC_EVENT_SE_CONNECTIVITY,
 	NFC_EVENT_SE_TRANSACTION,
+	NFC_CMD_GET_SE,
 /* private: internal use only */
 	__NFC_CMD_AFTER_LAST
 };
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index f16fd59d4160..3b08ef90e045 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -1191,6 +1191,91 @@ static int nfc_genl_disable_se(struct sk_buff *skb, struct genl_info *info)
 	return rc;
 }
 
+static int nfc_genl_send_se(struct sk_buff *msg, struct nfc_dev *dev,
+				u32 portid, u32 seq,
+				struct netlink_callback *cb,
+				int flags)
+{
+	void *hdr;
+	struct nfc_se *se, *n;
+
+	list_for_each_entry_safe(se, n, &dev->secure_elements, list) {
+		hdr = genlmsg_put(msg, portid, seq, &nfc_genl_family, flags,
+				  NFC_CMD_GET_SE);
+		if (!hdr)
+			goto nla_put_failure;
+
+		if (cb)
+			genl_dump_check_consistent(cb, hdr, &nfc_genl_family);
+
+		if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) ||
+		    nla_put_u32(msg, NFC_ATTR_SE_INDEX, se->idx) ||
+		    nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type))
+			goto nla_put_failure;
+
+		if (genlmsg_end(msg, hdr) < 0)
+			goto nla_put_failure;
+	}
+
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+static int nfc_genl_dump_ses(struct sk_buff *skb,
+				 struct netlink_callback *cb)
+{
+	struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0];
+	struct nfc_dev *dev = (struct nfc_dev *) cb->args[1];
+	bool first_call = false;
+
+	if (!iter) {
+		first_call = true;
+		iter = kmalloc(sizeof(struct class_dev_iter), GFP_KERNEL);
+		if (!iter)
+			return -ENOMEM;
+		cb->args[0] = (long) iter;
+	}
+
+	mutex_lock(&nfc_devlist_mutex);
+
+	cb->seq = nfc_devlist_generation;
+
+	if (first_call) {
+		nfc_device_iter_init(iter);
+		dev = nfc_device_iter_next(iter);
+	}
+
+	while (dev) {
+		int rc;
+
+		rc = nfc_genl_send_se(skb, dev, NETLINK_CB(cb->skb).portid,
+					  cb->nlh->nlmsg_seq, cb, NLM_F_MULTI);
+		if (rc < 0)
+			break;
+
+		dev = nfc_device_iter_next(iter);
+	}
+
+	mutex_unlock(&nfc_devlist_mutex);
+
+	cb->args[1] = (long) dev;
+
+	return skb->len;
+}
+
+static int nfc_genl_dump_ses_done(struct netlink_callback *cb)
+{
+	struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0];
+
+	nfc_device_iter_exit(iter);
+	kfree(iter);
+
+	return 0;
+}
+
 static struct genl_ops nfc_genl_ops[] = {
 	{
 		.cmd = NFC_CMD_GET_DEVICE,
@@ -1265,6 +1350,12 @@ static struct genl_ops nfc_genl_ops[] = {
 		.doit = nfc_genl_disable_se,
 		.policy = nfc_genl_policy,
 	},
+	{
+		.cmd = NFC_CMD_GET_SE,
+		.dumpit = nfc_genl_dump_ses,
+		.done = nfc_genl_dump_ses_done,
+		.policy = nfc_genl_policy,
+	},
 };
 
 
-- 
cgit v1.2.3


From ef04158e13e827315680cf8449d9af3bd8dc6280 Mon Sep 17 00:00:00 2001
From: Eric Lapuyade <eric.lapuyade@linux.intel.com>
Date: Fri, 19 Jul 2013 14:56:08 +0200
Subject: NFC: Move nfc_fw_download_done() definition from private to public

This API must be called by NFC drivers, and its prototype was
incorrectly placed.

Signed-off-by: Eric Lapuyade <eric.lapuyade@intel.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 include/net/nfc/nfc.h | 2 ++
 net/nfc/nfc.h         | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h
index 5f286b726bb6..100595560584 100644
--- a/include/net/nfc/nfc.h
+++ b/include/net/nfc/nfc.h
@@ -224,6 +224,8 @@ int nfc_set_remote_general_bytes(struct nfc_dev *dev,
 				 u8 *gt, u8 gt_len);
 u8 *nfc_get_local_general_bytes(struct nfc_dev *dev, size_t *gb_len);
 
+int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name);
+
 int nfc_targets_found(struct nfc_dev *dev,
 		      struct nfc_target *targets, int ntargets);
 int nfc_target_lost(struct nfc_dev *dev, u32 target_idx);
diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h
index 820a7850c36a..4e2e5a787c4a 100644
--- a/net/nfc/nfc.h
+++ b/net/nfc/nfc.h
@@ -126,8 +126,6 @@ static inline void nfc_device_iter_exit(struct class_dev_iter *iter)
 int nfc_fw_download(struct nfc_dev *dev, const char *firmware_name);
 int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name);
 
-int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name);
-
 int nfc_dev_up(struct nfc_dev *dev);
 
 int nfc_dev_down(struct nfc_dev *dev);
-- 
cgit v1.2.3


From 74c181d528bd8b5989f424a489262d0742ca31ae Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Mon, 12 Aug 2013 16:41:25 -0700
Subject: tcp: reset reordering est. selectively on timeout

On timeout the TCP sender unconditionally resets the estimated degree
of network reordering (tp->reordering). The idea behind this is that
the estimate is too large to trigger fast recovery (e.g., due to a IP
path change).

But for example if the sender only had 2 packets outstanding, then a
timeout doesn't tell much about reordering. A sender that learns about
reordering on big writes and loses packets on small writes will end up
falsely retransmitting again and again, especially when reordering is
more likely on big writes.

Therefore the sender should only suspect that tp->reordering is too
high if it could have gone into fast recovery with the (lower) default
estimate.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b61274b666f6..e965cc7b87ff 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1877,8 +1877,13 @@ void tcp_enter_loss(struct sock *sk, int how)
 	}
 	tcp_verify_left_out(tp);
 
-	tp->reordering = min_t(unsigned int, tp->reordering,
-			       sysctl_tcp_reordering);
+	/* Timeout in disordered state after receiving substantial DUPACKs
+	 * suggests that the degree of reordering is over-estimated.
+	 */
+	if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
+	    tp->sacked_out >= sysctl_tcp_reordering)
+		tp->reordering = min_t(unsigned int, tp->reordering,
+				       sysctl_tcp_reordering);
 	tcp_set_ca_state(sk, TCP_CA_Loss);
 	tp->high_seq = tp->snd_nxt;
 	TCP_ECN_queue_cwr(tp);
-- 
cgit v1.2.3


From 352a5f5fb3ad8f829cfd4248fe6119895bda881f Mon Sep 17 00:00:00 2001
From: Eric Lapuyade <eric.lapuyade@linux.intel.com>
Date: Fri, 19 Jul 2013 14:57:55 +0200
Subject: NFC: netlink: Add result of firmware operation to completion event

Result is added as an NFC_ATTR_FIRMWARE_DOWNLOAD_STATUS attribute
containing the standard errno positive value of the completion result.
This event will be sent when the firmare download operation is done and
will contain the operation result.

Signed-off-by: Eric Lapuyade <eric.lapuyade@intel.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 include/net/nfc/nfc.h    |  3 ++-
 include/uapi/linux/nfc.h |  2 ++
 net/nfc/core.c           | 12 ++++++++++--
 net/nfc/netlink.c        |  4 +++-
 net/nfc/nfc.h            |  3 ++-
 5 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h
index 100595560584..f68ee68e4e3e 100644
--- a/include/net/nfc/nfc.h
+++ b/include/net/nfc/nfc.h
@@ -224,7 +224,8 @@ int nfc_set_remote_general_bytes(struct nfc_dev *dev,
 				 u8 *gt, u8 gt_len);
 u8 *nfc_get_local_general_bytes(struct nfc_dev *dev, size_t *gb_len);
 
-int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name);
+int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name,
+			 u32 result);
 
 int nfc_targets_found(struct nfc_dev *dev,
 		      struct nfc_target *targets, int ntargets);
diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h
index 029921b067fd..29bed72a4ac4 100644
--- a/include/uapi/linux/nfc.h
+++ b/include/uapi/linux/nfc.h
@@ -146,6 +146,7 @@ enum nfc_commands {
  * @NFC_ATTR_FIRMWARE_NAME: Free format firmware version
  * @NFC_ATTR_SE_INDEX: Secure element index
  * @NFC_ATTR_SE_TYPE: Secure element type (UICC or EMBEDDED)
+ * @NFC_ATTR_FIRMWARE_DOWNLOAD_STATUS: Firmware download operation status
  */
 enum nfc_attrs {
 	NFC_ATTR_UNSPEC,
@@ -172,6 +173,7 @@ enum nfc_attrs {
 	NFC_ATTR_SE_INDEX,
 	NFC_ATTR_SE_TYPE,
 	NFC_ATTR_SE_AID,
+	NFC_ATTR_FIRMWARE_DOWNLOAD_STATUS,
 /* private: internal use only */
 	__NFC_ATTR_AFTER_LAST
 };
diff --git a/net/nfc/core.c b/net/nfc/core.c
index aad7f8f59784..d252912b8deb 100644
--- a/net/nfc/core.c
+++ b/net/nfc/core.c
@@ -77,11 +77,19 @@ error:
 	return rc;
 }
 
-int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name)
+/**
+ * nfc_fw_download_done - inform that a firmware download was completed
+ *
+ * @dev: The nfc device to which firmware was downloaded
+ * @firmware_name: The firmware filename
+ * @result: The positive value of a standard errno value
+ */
+int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name,
+			 u32 result)
 {
 	dev->fw_download_in_progress = false;
 
-	return nfc_genl_fw_download_done(dev, firmware_name);
+	return nfc_genl_fw_download_done(dev, firmware_name, result);
 }
 EXPORT_SYMBOL(nfc_fw_download_done);
 
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index 3b08ef90e045..68063b2025da 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -1114,7 +1114,8 @@ static int nfc_genl_fw_download(struct sk_buff *skb, struct genl_info *info)
 	return rc;
 }
 
-int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name)
+int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name,
+			      u32 result)
 {
 	struct sk_buff *msg;
 	void *hdr;
@@ -1129,6 +1130,7 @@ int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name)
 		goto free_msg;
 
 	if (nla_put_string(msg, NFC_ATTR_FIRMWARE_NAME, firmware_name) ||
+	    nla_put_u32(msg, NFC_ATTR_FIRMWARE_DOWNLOAD_STATUS, result) ||
 	    nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx))
 		goto nla_put_failure;
 
diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h
index 4e2e5a787c4a..aaf606fc1faa 100644
--- a/net/nfc/nfc.h
+++ b/net/nfc/nfc.h
@@ -124,7 +124,8 @@ static inline void nfc_device_iter_exit(struct class_dev_iter *iter)
 }
 
 int nfc_fw_download(struct nfc_dev *dev, const char *firmware_name);
-int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name);
+int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name,
+			      u32 result);
 
 int nfc_dev_up(struct nfc_dev *dev);
 
-- 
cgit v1.2.3


From 4eba11e82a0365117be92453c5c91a263500fd1a Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Tue, 18 Jun 2013 01:48:26 +0300
Subject: NFC: hci: Fix enable/disable confusion

There is a cut and paste bug so we enable a second time instead of
disabling.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 net/nfc/hci/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c
index fe66908401f5..d07ca4c5cf8c 100644
--- a/net/nfc/hci/core.c
+++ b/net/nfc/hci/core.c
@@ -717,7 +717,7 @@ static int hci_disable_se(struct nfc_dev *nfc_dev, u32 se_idx)
 	struct nfc_hci_dev *hdev = nfc_get_drvdata(nfc_dev);
 
 	if (hdev->ops->disable_se)
-		return hdev->ops->enable_se(hdev, se_idx);
+		return hdev->ops->disable_se(hdev, se_idx);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 2c3832834b95e0226da1d13229472978f78462c5 Mon Sep 17 00:00:00 2001
From: Arron Wang <arron.wang@intel.com>
Date: Tue, 30 Jul 2013 14:35:35 +0200
Subject: NFC: Fix secure element state check

Another typo from the initial commit where we check for the secure
element type field instead of its state when enabling or disabling it.

Signed-off-by: Arron Wang <arron.wang@intel.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 net/nfc/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/nfc/core.c b/net/nfc/core.c
index d252912b8deb..ee1fe66e2c8a 100644
--- a/net/nfc/core.c
+++ b/net/nfc/core.c
@@ -583,7 +583,7 @@ int nfc_enable_se(struct nfc_dev *dev, u32 se_idx)
 		goto error;
 	}
 
-	if (se->type == NFC_SE_ENABLED) {
+	if (se->state == NFC_SE_ENABLED) {
 		rc = -EALREADY;
 		goto error;
 	}
@@ -626,7 +626,7 @@ int nfc_disable_se(struct nfc_dev *dev, u32 se_idx)
 		goto error;
 	}
 
-	if (se->type == NFC_SE_DISABLED) {
+	if (se->state == NFC_SE_DISABLED) {
 		rc = -EALREADY;
 		goto error;
 	}
-- 
cgit v1.2.3


From 39525ee1dc78ca1f5f2fb1f764f7a141005fe440 Mon Sep 17 00:00:00 2001
From: Arron Wang <arron.wang@intel.com>
Date: Tue, 30 Jul 2013 14:40:05 +0200
Subject: NFC: Update secure element state

The secure element state was not updated from the enable/disable ops,
leaving the SE state to disabled for ever.

Signed-off-by: Arron Wang <arron.wang@intel.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 net/nfc/core.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/nfc/core.c b/net/nfc/core.c
index ee1fe66e2c8a..e92923cf3e03 100644
--- a/net/nfc/core.c
+++ b/net/nfc/core.c
@@ -589,6 +589,8 @@ int nfc_enable_se(struct nfc_dev *dev, u32 se_idx)
 	}
 
 	rc = dev->ops->enable_se(dev, se_idx);
+	if (rc >= 0)
+		se->state = NFC_SE_ENABLED;
 
 error:
 	device_unlock(&dev->dev);
@@ -632,6 +634,8 @@ int nfc_disable_se(struct nfc_dev *dev, u32 se_idx)
 	}
 
 	rc = dev->ops->disable_se(dev, se_idx);
+	if (rc >= 0)
+		se->state = NFC_SE_DISABLED;
 
 error:
 	device_unlock(&dev->dev);
-- 
cgit v1.2.3


From fc4eba58b4c1462ff3d6247b66fb47d6928db6d2 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Wed, 14 Aug 2013 01:03:46 +0200
Subject: ipv6: make unsolicited report intervals configurable for mld

Commit cab70040dfd95ee32144f02fade64f0cb94f31a0 ("net: igmp:
Reduce Unsolicited report interval to 1s when using IGMPv3") and
2690048c01f32bf45d1c1e1ab3079bc10ad2aea7 ("net: igmp: Allow user-space
configuration of igmp unsolicited report interval") by William Manley made
igmp unsolicited report intervals configurable per interface and corrected
the interval of unsolicited igmpv3 report messages resendings to 1s.

Same needs to be done for IPv6:

MLDv1 (RFC2710 7.10.): 10 seconds
MLDv2 (RFC3810 9.11.): 1 second

Both intervals are configurable via new procfs knobs
mldv1_unsolicited_report_interval and mldv2_unsolicited_report_interval.

(also added .force_mld_version to ipv6_devconf_dflt to bring structs in
line without semantic changes)

v2:
a) Joined documentation update for IPv4 and IPv6 MLD/IGMP
   unsolicited_report_interval procfs knobs.
b) incorporate stylistic feedback from William Manley

v3:
a) add new DEVCONF_* values to the end of the enum (thanks to David
   Miller)

Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: William Manley <william.manley@youview.com>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 18 ++++++++++++++++++
 include/linux/ipv6.h                   |  2 ++
 include/uapi/linux/ipv6.h              |  2 ++
 net/ipv6/addrconf.c                    | 25 +++++++++++++++++++++++++
 net/ipv6/mcast.c                       | 17 ++++++++++++++---
 5 files changed, 61 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 36be26b2ef7a..debfe857d8f9 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1039,7 +1039,15 @@ disable_policy - BOOLEAN
 disable_xfrm - BOOLEAN
 	Disable IPSEC encryption on this interface, whatever the policy
 
+igmpv2_unsolicited_report_interval - INTEGER
+	The interval in milliseconds in which the next unsolicited
+	IGMPv1 or IGMPv2 report retransmit will take place.
+	Default: 10000 (10 seconds)
 
+igmpv3_unsolicited_report_interval - INTEGER
+	The interval in milliseconds in which the next unsolicited
+	IGMPv3 report retransmit will take place.
+	Default: 1000 (1 seconds)
 
 tag - INTEGER
 	Allows you to write a number, which can be used as required.
@@ -1331,6 +1339,16 @@ ndisc_notify - BOOLEAN
 	1 - Generate unsolicited neighbour advertisements when device is brought
 	    up or hardware address changes.
 
+mldv1_unsolicited_report_interval - INTEGER
+	The interval in milliseconds in which the next unsolicited
+	MLDv1 report retransmit will take place.
+	Default: 10000 (10 seconds)
+
+mldv2_unsolicited_report_interval - INTEGER
+	The interval in milliseconds in which the next unsolicited
+	MLDv2 report retransmit will take place.
+	Default: 1000 (1 second)
+
 icmp/*:
 ratelimit - INTEGER
 	Limit the maximal rates for sending ICMPv6 packets.
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 850e95bc766c..77a478462d8e 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -19,6 +19,8 @@ struct ipv6_devconf {
 	__s32		rtr_solicit_interval;
 	__s32		rtr_solicit_delay;
 	__s32		force_mld_version;
+	__s32		mldv1_unsolicited_report_interval;
+	__s32		mldv2_unsolicited_report_interval;
 #ifdef CONFIG_IPV6_PRIVACY
 	__s32		use_tempaddr;
 	__s32		temp_valid_lft;
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 4bda4cf5b0f5..d07ac6903e59 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -160,6 +160,8 @@ enum {
 	DEVCONF_ACCEPT_DAD,
 	DEVCONF_FORCE_TLLAO,
 	DEVCONF_NDISC_NOTIFY,
+	DEVCONF_MLDV1_UNSOLICITED_REPORT_INTERVAL,
+	DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 7fd8572bac80..ad12f7c43f25 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -177,6 +177,8 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 	.accept_redirects	= 1,
 	.autoconf		= 1,
 	.force_mld_version	= 0,
+	.mldv1_unsolicited_report_interval = 10 * HZ,
+	.mldv2_unsolicited_report_interval = HZ,
 	.dad_transmits		= 1,
 	.rtr_solicits		= MAX_RTR_SOLICITATIONS,
 	.rtr_solicit_interval	= RTR_SOLICITATION_INTERVAL,
@@ -211,6 +213,9 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	.accept_ra		= 1,
 	.accept_redirects	= 1,
 	.autoconf		= 1,
+	.force_mld_version	= 0,
+	.mldv1_unsolicited_report_interval = 10 * HZ,
+	.mldv2_unsolicited_report_interval = HZ,
 	.dad_transmits		= 1,
 	.rtr_solicits		= MAX_RTR_SOLICITATIONS,
 	.rtr_solicit_interval	= RTR_SOLICITATION_INTERVAL,
@@ -4179,6 +4184,10 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_RTR_SOLICIT_DELAY] =
 		jiffies_to_msecs(cnf->rtr_solicit_delay);
 	array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version;
+	array[DEVCONF_MLDV1_UNSOLICITED_REPORT_INTERVAL] =
+		jiffies_to_msecs(cnf->mldv1_unsolicited_report_interval);
+	array[DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL] =
+		jiffies_to_msecs(cnf->mldv2_unsolicited_report_interval);
 #ifdef CONFIG_IPV6_PRIVACY
 	array[DEVCONF_USE_TEMPADDR] = cnf->use_tempaddr;
 	array[DEVCONF_TEMP_VALID_LFT] = cnf->temp_valid_lft;
@@ -4862,6 +4871,22 @@ static struct addrconf_sysctl_table
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
+		{
+			.procname	= "mldv1_unsolicited_report_interval",
+			.data		=
+				&ipv6_devconf.mldv1_unsolicited_report_interval,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec_ms_jiffies,
+		},
+		{
+			.procname	= "mldv2_unsolicited_report_interval",
+			.data		=
+				&ipv6_devconf.mldv2_unsolicited_report_interval,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec_ms_jiffies,
+		},
 #ifdef CONFIG_IPV6_PRIVACY
 		{
 			.procname	= "use_tempaddr",
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index db25b8eb62bd..6c76df9909bf 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -108,7 +108,6 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml,
 			    struct inet6_dev *idev);
 
 
-#define IGMP6_UNSOLICITED_IVAL	(10*HZ)
 #define MLD_QRV_DEFAULT		2
 
 #define MLD_V1_SEEN(idev) (dev_net((idev)->dev)->ipv6.devconf_all->force_mld_version == 1 || \
@@ -129,6 +128,18 @@ int sysctl_mld_max_msf __read_mostly = IPV6_MLD_MAX_MSF;
 	     pmc != NULL;					\
 	     pmc = rcu_dereference(pmc->next))
 
+static int unsolicited_report_interval(struct inet6_dev *idev)
+{
+	int iv;
+
+	if (MLD_V1_SEEN(idev))
+		iv = idev->cnf.mldv1_unsolicited_report_interval;
+	else
+		iv = idev->cnf.mldv2_unsolicited_report_interval;
+
+	return iv > 0 ? iv : 1;
+}
+
 int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 {
 	struct net_device *dev = NULL;
@@ -2158,7 +2169,7 @@ static void igmp6_join_group(struct ifmcaddr6 *ma)
 
 	igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT);
 
-	delay = net_random() % IGMP6_UNSOLICITED_IVAL;
+	delay = net_random() % unsolicited_report_interval(ma->idev);
 
 	spin_lock_bh(&ma->mca_lock);
 	if (del_timer(&ma->mca_timer)) {
@@ -2325,7 +2336,7 @@ void ipv6_mc_init_dev(struct inet6_dev *idev)
 	setup_timer(&idev->mc_dad_timer, mld_dad_timer_expire,
 		    (unsigned long)idev);
 	idev->mc_qrv = MLD_QRV_DEFAULT;
-	idev->mc_maxdelay = IGMP6_UNSOLICITED_IVAL;
+	idev->mc_maxdelay = unsolicited_report_interval(idev);
 	idev->mc_v1_seen = 0;
 	write_unlock_bh(&idev->lock);
 }
-- 
cgit v1.2.3


From 64261f230a9157f5f520ce30ec6827d679375e2f Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Tue, 13 Aug 2013 17:51:09 +0200
Subject: dev: move skb_scrub_packet() after eth_type_trans()

skb_scrub_packet() was called before eth_type_trans() to let eth_type_trans()
set pkt_type.

In fact, we should force pkt_type to PACKET_HOST, so move the call after
eth_type_trans().

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c       | 6 +++---
 net/ipv4/ip_tunnel.c | 7 ++++---
 2 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 58eb802584b9..1ed2b66a10a6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1691,13 +1691,13 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 		kfree_skb(skb);
 		return NET_RX_DROP;
 	}
-	skb_scrub_packet(skb);
 	skb->protocol = eth_type_trans(skb, dev);
 
 	/* eth_type_trans() can set pkt_type.
-	 * clear pkt_type _after_ calling eth_type_trans()
+	 * call skb_scrub_packet() after it to clear pkt_type _after_ calling
+	 * eth_type_trans().
 	 */
-	skb->pkt_type = PACKET_HOST;
+	skb_scrub_packet(skb);
 
 	return netif_rx(skb);
 }
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 9fdf8a6d95f3..fbc1094964bf 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -454,15 +454,16 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
 	tstats->rx_bytes += skb->len;
 	u64_stats_update_end(&tstats->syncp);
 
-	if (tunnel->net != dev_net(tunnel->dev))
-		skb_scrub_packet(skb);
-
 	if (tunnel->dev->type == ARPHRD_ETHER) {
 		skb->protocol = eth_type_trans(skb, tunnel->dev);
 		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 	} else {
 		skb->dev = tunnel->dev;
 	}
+
+	if (tunnel->net != dev_net(tunnel->dev))
+		skb_scrub_packet(skb);
+
 	gro_cells_receive(&tunnel->gro_cells, skb);
 	return 0;
 
-- 
cgit v1.2.3


From fc8f999daa10a5c5c19562b88be96a9b2bacd9b7 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Tue, 13 Aug 2013 17:51:10 +0200
Subject: ipv4 tunnels: use net_eq() helper to check netns

It's better to use available helpers for these tests.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_tunnel.c | 4 ++--
 net/ipv6/sit.c       | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index fbc1094964bf..a351a003ee6b 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -461,7 +461,7 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
 		skb->dev = tunnel->dev;
 	}
 
-	if (tunnel->net != dev_net(tunnel->dev))
+	if (!net_eq(tunnel->net, dev_net(tunnel->dev)))
 		skb_scrub_packet(skb);
 
 	gro_cells_receive(&tunnel->gro_cells, skb);
@@ -614,7 +614,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 		goto tx_error;
 	}
 
-	if (tunnel->net != dev_net(dev))
+	if (!net_eq(tunnel->net, dev_net(dev)))
 		skb_scrub_packet(skb);
 
 	if (tunnel->err_count > 0) {
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index a3437a4cd07e..f18f842ac893 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -621,7 +621,7 @@ static int ipip6_rcv(struct sk_buff *skb)
 		tstats->rx_packets++;
 		tstats->rx_bytes += skb->len;
 
-		if (tunnel->net != dev_net(tunnel->dev))
+		if (!net_eq(tunnel->net, dev_net(tunnel->dev)))
 			skb_scrub_packet(skb);
 		netif_rx(skb);
 
@@ -860,7 +860,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 			tunnel->err_count = 0;
 	}
 
-	if (tunnel->net != dev_net(dev))
+	if (!net_eq(tunnel->net, dev_net(dev)))
 		skb_scrub_packet(skb);
 
 	/*
@@ -1589,7 +1589,7 @@ static void __net_exit sit_destroy_tunnels(struct sit_net *sitn, struct list_hea
 				/* If dev is in the same netns, it has already
 				 * been added to the list by the previous loop.
 				 */
-				if (dev_net(t->dev) != net)
+				if (!net_eq(dev_net(t->dev), net))
 					unregister_netdevice_queue(t->dev,
 								   head);
 				t = rtnl_dereference(t->next);
-- 
cgit v1.2.3


From 6c742e714d8c282fd8f8b22d3e20b5141738c1ee Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Tue, 13 Aug 2013 17:51:11 +0200
Subject: ipip: add x-netns support

This patch allows to switch the netns when packet is encapsulated or
decapsulated. In other word, the encapsulated packet is received in a netns,
where the lookup is done to find the tunnel. Once the tunnel is found, the
packet is decapsulated and injecting into the corresponding interface which
stands to another netns.

When one of the two netns is removed, the tunnel is destroyed.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_tunnels.h |  2 +-
 net/ipv4/ip_gre.c        |  4 ++--
 net/ipv4/ip_tunnel.c     | 43 ++++++++++++++++++++++++++++---------------
 net/ipv4/ip_vti.c        |  2 +-
 net/ipv4/ipip.c          |  3 +--
 5 files changed, 33 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index c6acd9f8f877..5a76f2bef822 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -102,7 +102,7 @@ void  ip_tunnel_dellink(struct net_device *dev, struct list_head *head);
 int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
 		       struct rtnl_link_ops *ops, char *devname);
 
-void ip_tunnel_delete_net(struct ip_tunnel_net *itn);
+void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops);
 
 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 		    const struct iphdr *tnl_params, const u8 protocol);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 1f6eab66f7ce..bc3a76521deb 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -534,7 +534,7 @@ static int __net_init ipgre_init_net(struct net *net)
 static void __net_exit ipgre_exit_net(struct net *net)
 {
 	struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id);
-	ip_tunnel_delete_net(itn);
+	ip_tunnel_delete_net(itn, &ipgre_link_ops);
 }
 
 static struct pernet_operations ipgre_net_ops = {
@@ -767,7 +767,7 @@ static int __net_init ipgre_tap_init_net(struct net *net)
 static void __net_exit ipgre_tap_exit_net(struct net *net)
 {
 	struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id);
-	ip_tunnel_delete_net(itn);
+	ip_tunnel_delete_net(itn, &ipgre_tap_ops);
 }
 
 static struct pernet_operations ipgre_tap_net_ops = {
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index a351a003ee6b..a4d9126c7b51 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -350,7 +350,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
 		struct flowi4 fl4;
 		struct rtable *rt;
 
-		rt = ip_route_output_tunnel(dev_net(dev), &fl4,
+		rt = ip_route_output_tunnel(tunnel->net, &fl4,
 					    tunnel->parms.iph.protocol,
 					    iph->daddr, iph->saddr,
 					    tunnel->parms.o_key,
@@ -365,7 +365,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
 	}
 
 	if (!tdev && tunnel->parms.link)
-		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
+		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
 
 	if (tdev) {
 		hlen = tdev->hard_header_len + tdev->needed_headroom;
@@ -654,7 +654,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 		}
 	}
 
-	err = iptunnel_xmit(dev_net(dev), rt, skb,
+	err = iptunnel_xmit(tunnel->net, rt, skb,
 			    fl4.saddr, fl4.daddr, protocol,
 			    ip_tunnel_ecn_encap(tos, inner_iph, skb), ttl, df);
 	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
@@ -821,11 +821,10 @@ static void ip_tunnel_dev_free(struct net_device *dev)
 
 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
 {
-	struct net *net = dev_net(dev);
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 	struct ip_tunnel_net *itn;
 
-	itn = net_generic(net, tunnel->ip_tnl_net_id);
+	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
 
 	if (itn->fb_tunnel_dev != dev) {
 		ip_tunnel_del(netdev_priv(dev));
@@ -855,6 +854,10 @@ int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
 
 	rtnl_lock();
 	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
+	/* FB netdevice is special: we have one, and only one per netns.
+	 * Allowing to move it to another netns is clearly unsafe.
+	 */
+	itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
 	rtnl_unlock();
 
 	if (IS_ERR(itn->fb_tunnel_dev))
@@ -864,28 +867,39 @@ int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
 
-static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head)
+static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
+			      struct rtnl_link_ops *ops)
 {
+	struct net *net = dev_net(itn->fb_tunnel_dev);
+	struct net_device *dev, *aux;
 	int h;
 
+	for_each_netdev_safe(net, dev, aux)
+		if (dev->rtnl_link_ops == ops)
+			unregister_netdevice_queue(dev, head);
+
 	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
 		struct ip_tunnel *t;
 		struct hlist_node *n;
 		struct hlist_head *thead = &itn->tunnels[h];
 
 		hlist_for_each_entry_safe(t, n, thead, hash_node)
-			unregister_netdevice_queue(t->dev, head);
+			/* If dev is in the same netns, it has already
+			 * been added to the list by the previous loop.
+			 */
+			if (!net_eq(dev_net(t->dev), net))
+				unregister_netdevice_queue(t->dev, head);
 	}
 	if (itn->fb_tunnel_dev)
 		unregister_netdevice_queue(itn->fb_tunnel_dev, head);
 }
 
-void ip_tunnel_delete_net(struct ip_tunnel_net *itn)
+void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
 {
 	LIST_HEAD(list);
 
 	rtnl_lock();
-	ip_tunnel_destroy(itn, &list);
+	ip_tunnel_destroy(itn, &list, ops);
 	unregister_netdevice_many(&list);
 	rtnl_unlock();
 }
@@ -929,23 +943,21 @@ EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
 			 struct ip_tunnel_parm *p)
 {
-	struct ip_tunnel *t, *nt;
-	struct net *net = dev_net(dev);
+	struct ip_tunnel *t;
 	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct net *net = tunnel->net;
 	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
 
 	if (dev == itn->fb_tunnel_dev)
 		return -EINVAL;
 
-	nt = netdev_priv(dev);
-
 	t = ip_tunnel_find(itn, p, dev->type);
 
 	if (t) {
 		if (t->dev != dev)
 			return -EEXIST;
 	} else {
-		t = nt;
+		t = tunnel;
 
 		if (dev->type != ARPHRD_ETHER) {
 			unsigned int nflags = 0;
@@ -984,6 +996,7 @@ int ip_tunnel_init(struct net_device *dev)
 	}
 
 	tunnel->dev = dev;
+	tunnel->net = dev_net(dev);
 	strcpy(tunnel->parms.name, dev->name);
 	iph->version		= 4;
 	iph->ihl		= 5;
@@ -994,8 +1007,8 @@ EXPORT_SYMBOL_GPL(ip_tunnel_init);
 
 void ip_tunnel_uninit(struct net_device *dev)
 {
-	struct net *net = dev_net(dev);
 	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct net *net = tunnel->net;
 	struct ip_tunnel_net *itn;
 
 	itn = net_generic(net, tunnel->ip_tnl_net_id);
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 79b263da4168..e805e7b3030e 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -318,7 +318,7 @@ static int __net_init vti_init_net(struct net *net)
 static void __net_exit vti_exit_net(struct net *net)
 {
 	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
-	ip_tunnel_delete_net(itn);
+	ip_tunnel_delete_net(itn, &vti_link_ops);
 }
 
 static struct pernet_operations vti_net_ops = {
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 51fc2a1dcdd3..87bd2952c733 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -286,7 +286,6 @@ static void ipip_tunnel_setup(struct net_device *dev)
 	dev->flags		= IFF_NOARP;
 	dev->iflink		= 0;
 	dev->addr_len		= 4;
-	dev->features		|= NETIF_F_NETNS_LOCAL;
 	dev->features		|= NETIF_F_LLTX;
 	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
 
@@ -437,7 +436,7 @@ static int __net_init ipip_init_net(struct net *net)
 static void __net_exit ipip_exit_net(struct net *net)
 {
 	struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
-	ip_tunnel_delete_net(itn);
+	ip_tunnel_delete_net(itn, &ipip_link_ops);
 }
 
 static struct pernet_operations ipip_net_ops = {
-- 
cgit v1.2.3


From 0bd8762824e73a3cce7b7560a97463301764b616 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Tue, 13 Aug 2013 17:51:12 +0200
Subject: ip6tnl: add x-netns support

This patch allows to switch the netns when packet is encapsulated or
decapsulated. In other word, the encapsulated packet is received in a netns,
where the lookup is done to find the tunnel. Once the tunnel is found, the
packet is decapsulated and injecting into the corresponding interface which
stands to another netns.

When one of the two netns is removed, the tunnel is destroyed.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_tunnel.h |  1 +
 net/ipv6/ip6_gre.c       |  5 +++++
 net/ipv6/ip6_tunnel.c    | 41 +++++++++++++++++++++++++++++++----------
 3 files changed, 37 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
index 4da5de10d1d4..2265b0bf97e5 100644
--- a/include/net/ip6_tunnel.h
+++ b/include/net/ip6_tunnel.h
@@ -36,6 +36,7 @@ struct __ip6_tnl_parm {
 struct ip6_tnl {
 	struct ip6_tnl __rcu *next;	/* next tunnel in list */
 	struct net_device *dev;	/* virtual device associated with tunnel */
+	struct net *net;	/* netns for packet i/o */
 	struct __ip6_tnl_parm parms;	/* tunnel configuration parameters */
 	struct flowi fl;	/* flowi template for xmit */
 	struct dst_entry *dst_cache;    /* cached dst */
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index ecd60733e5e2..f2d0a42f8057 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -335,6 +335,7 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net,
 	dev->rtnl_link_ops = &ip6gre_link_ops;
 
 	nt->dev = dev;
+	nt->net = dev_net(dev);
 	ip6gre_tnl_link_config(nt, 1);
 
 	if (register_netdevice(dev) < 0)
@@ -1255,6 +1256,7 @@ static int ip6gre_tunnel_init(struct net_device *dev)
 	tunnel = netdev_priv(dev);
 
 	tunnel->dev = dev;
+	tunnel->net = dev_net(dev);
 	strcpy(tunnel->parms.name, dev->name);
 
 	memcpy(dev->dev_addr, &tunnel->parms.laddr, sizeof(struct in6_addr));
@@ -1275,6 +1277,7 @@ static void ip6gre_fb_tunnel_init(struct net_device *dev)
 	struct ip6_tnl *tunnel = netdev_priv(dev);
 
 	tunnel->dev = dev;
+	tunnel->net = dev_net(dev);
 	strcpy(tunnel->parms.name, dev->name);
 
 	tunnel->hlen		= sizeof(struct ipv6hdr) + 4;
@@ -1450,6 +1453,7 @@ static int ip6gre_tap_init(struct net_device *dev)
 	tunnel = netdev_priv(dev);
 
 	tunnel->dev = dev;
+	tunnel->net = dev_net(dev);
 	strcpy(tunnel->parms.name, dev->name);
 
 	ip6gre_tnl_link_config(tunnel, 1);
@@ -1501,6 +1505,7 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
 		eth_hw_addr_random(dev);
 
 	nt->dev = dev;
+	nt->net = dev_net(dev);
 	ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]);
 
 	/* Can use a lockless transmit, unless we generate output sequences */
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 1e55866cead7..cc3bb201b8b0 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -315,6 +315,7 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p)
 
 	t = netdev_priv(dev);
 	t->parms = *p;
+	t->net = dev_net(dev);
 	err = ip6_tnl_create2(dev);
 	if (err < 0)
 		goto failed_free;
@@ -374,7 +375,7 @@ static void
 ip6_tnl_dev_uninit(struct net_device *dev)
 {
 	struct ip6_tnl *t = netdev_priv(dev);
-	struct net *net = dev_net(dev);
+	struct net *net = t->net;
 	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
 
 	if (dev == ip6n->fb_tnl_dev)
@@ -741,7 +742,7 @@ int ip6_tnl_rcv_ctl(struct ip6_tnl *t,
 {
 	struct __ip6_tnl_parm *p = &t->parms;
 	int ret = 0;
-	struct net *net = dev_net(t->dev);
+	struct net *net = t->net;
 
 	if ((p->flags & IP6_TNL_F_CAP_RCV) ||
 	    ((p->flags & IP6_TNL_F_CAP_PER_PACKET) &&
@@ -827,6 +828,9 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol,
 		tstats->rx_packets++;
 		tstats->rx_bytes += skb->len;
 
+		if (!net_eq(t->net, dev_net(t->dev)))
+			skb_scrub_packet(skb);
+
 		netif_rx(skb);
 
 		rcu_read_unlock();
@@ -895,7 +899,7 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t)
 {
 	struct __ip6_tnl_parm *p = &t->parms;
 	int ret = 0;
-	struct net *net = dev_net(t->dev);
+	struct net *net = t->net;
 
 	if (p->flags & IP6_TNL_F_CAP_XMIT) {
 		struct net_device *ldev = NULL;
@@ -945,8 +949,8 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
 			 int encap_limit,
 			 __u32 *pmtu)
 {
-	struct net *net = dev_net(dev);
 	struct ip6_tnl *t = netdev_priv(dev);
+	struct net *net = t->net;
 	struct net_device_stats *stats = &t->dev->stats;
 	struct ipv6hdr *ipv6h = ipv6_hdr(skb);
 	struct ipv6_tel_txoption opt;
@@ -996,6 +1000,9 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
 		goto tx_err_dst_release;
 	}
 
+	if (!net_eq(t->net, dev_net(dev)))
+		skb_scrub_packet(skb);
+
 	/*
 	 * Okay, now see if we can stuff it in the buffer as-is.
 	 */
@@ -1202,7 +1209,7 @@ static void ip6_tnl_link_config(struct ip6_tnl *t)
 		int strict = (ipv6_addr_type(&p->raddr) &
 			      (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL));
 
-		struct rt6_info *rt = rt6_lookup(dev_net(dev),
+		struct rt6_info *rt = rt6_lookup(t->net,
 						 &p->raddr, &p->laddr,
 						 p->link, strict);
 
@@ -1251,7 +1258,7 @@ ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p)
 
 static int ip6_tnl_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p)
 {
-	struct net *net = dev_net(t->dev);
+	struct net *net = t->net;
 	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
 	int err;
 
@@ -1463,7 +1470,6 @@ static void ip6_tnl_dev_setup(struct net_device *dev)
 		dev->mtu-=8;
 	dev->flags |= IFF_NOARP;
 	dev->addr_len = sizeof(struct in6_addr);
-	dev->features |= NETIF_F_NETNS_LOCAL;
 	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
 }
 
@@ -1479,6 +1485,7 @@ ip6_tnl_dev_init_gen(struct net_device *dev)
 	struct ip6_tnl *t = netdev_priv(dev);
 
 	t->dev = dev;
+	t->net = dev_net(dev);
 	dev->tstats = alloc_percpu(struct pcpu_tstats);
 	if (!dev->tstats)
 		return -ENOMEM;
@@ -1596,9 +1603,9 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev,
 static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[],
 			      struct nlattr *data[])
 {
-	struct ip6_tnl *t;
+	struct ip6_tnl *t = netdev_priv(dev);
 	struct __ip6_tnl_parm p;
-	struct net *net = dev_net(dev);
+	struct net *net = t->net;
 	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
 
 	if (dev == ip6n->fb_tnl_dev)
@@ -1699,14 +1706,24 @@ static struct xfrm6_tunnel ip6ip6_handler __read_mostly = {
 
 static void __net_exit ip6_tnl_destroy_tunnels(struct ip6_tnl_net *ip6n)
 {
+	struct net *net = dev_net(ip6n->fb_tnl_dev);
+	struct net_device *dev, *aux;
 	int h;
 	struct ip6_tnl *t;
 	LIST_HEAD(list);
 
+	for_each_netdev_safe(net, dev, aux)
+		if (dev->rtnl_link_ops == &ip6_link_ops)
+			unregister_netdevice_queue(dev, &list);
+
 	for (h = 0; h < HASH_SIZE; h++) {
 		t = rtnl_dereference(ip6n->tnls_r_l[h]);
 		while (t != NULL) {
-			unregister_netdevice_queue(t->dev, &list);
+			/* If dev is in the same netns, it has already
+			 * been added to the list by the previous loop.
+			 */
+			if (!net_eq(dev_net(t->dev), net))
+				unregister_netdevice_queue(t->dev, &list);
 			t = rtnl_dereference(t->next);
 		}
 	}
@@ -1732,6 +1749,10 @@ static int __net_init ip6_tnl_init_net(struct net *net)
 	if (!ip6n->fb_tnl_dev)
 		goto err_alloc_dev;
 	dev_net_set(ip6n->fb_tnl_dev, net);
+	/* FB netdevice is special: we have one, and only one per netns.
+	 * Allowing to move it to another netns is clearly unsafe.
+	 */
+	ip6n->fb_tnl_dev->features |= NETIF_F_NETNS_LOCAL;
 
 	err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev);
 	if (err < 0)
-- 
cgit v1.2.3


From fce9b9be89cece975675142a3953bfb5299d195d Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 14 Aug 2013 12:35:42 +0300
Subject: rtnetlink: remove an unneeded test

We know that "dev" is a valid pointer at this point, so we can remove
the test and clean up a little.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 0b2972c59b97..242084edb658 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1867,10 +1867,10 @@ replay:
 		else
 			err = register_netdevice(dev);
 
-		if (err < 0 && !IS_ERR(dev))
+		if (err < 0) {
 			free_netdev(dev);
-		if (err < 0)
 			goto out;
+		}
 
 		err = rtnl_configure_link(dev, ifm);
 		if (err < 0)
-- 
cgit v1.2.3


From dee08ab83d0378d922b67e7cf10bbec3e4ea343b Mon Sep 17 00:00:00 2001
From: Luis Henriques <luis.henriques@canonical.com>
Date: Wed, 14 Aug 2013 23:10:06 +0100
Subject: net: rfkill: Do not ignore errors from regulator_enable()

Function regulator_enable() may return an error that has to be checked.
This patch changes function rfkill_regulator_set_block() so that it checks
for the return code.  Also, rfkill_data->reg_enabled is set to 'true' only
if there is no error.

This fixes the following compilation warning:

net/rfkill/rfkill-regulator.c:43:20: warning: ignoring return value of 'regulator_enable', declared with attribute warn_unused_result [-Wunused-result]

Signed-off-by: Luis Henriques <luis.henriques@canonical.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/rfkill/rfkill-regulator.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/rfkill/rfkill-regulator.c b/net/rfkill/rfkill-regulator.c
index d11ac79246e4..cf5b145902e5 100644
--- a/net/rfkill/rfkill-regulator.c
+++ b/net/rfkill/rfkill-regulator.c
@@ -30,6 +30,7 @@ struct rfkill_regulator_data {
 static int rfkill_regulator_set_block(void *data, bool blocked)
 {
 	struct rfkill_regulator_data *rfkill_data = data;
+	int ret = 0;
 
 	pr_debug("%s: blocked: %d\n", __func__, blocked);
 
@@ -40,15 +41,16 @@ static int rfkill_regulator_set_block(void *data, bool blocked)
 		}
 	} else {
 		if (!rfkill_data->reg_enabled) {
-			regulator_enable(rfkill_data->vcc);
-			rfkill_data->reg_enabled = true;
+			ret = regulator_enable(rfkill_data->vcc);
+			if (!ret)
+				rfkill_data->reg_enabled = true;
 		}
 	}
 
 	pr_debug("%s: regulator_is_enabled after set_block: %d\n", __func__,
 		regulator_is_enabled(rfkill_data->vcc));
 
-	return 0;
+	return ret;
 }
 
 static struct rfkill_ops rfkill_regulator_ops = {
-- 
cgit v1.2.3


From d14c5ab6bef6a46170b84c3589b27768e979f93d Mon Sep 17 00:00:00 2001
From: Francesco Fusco <ffusco@redhat.com>
Date: Thu, 15 Aug 2013 13:42:14 +0200
Subject: net: proc_fs: trivial: print UIDs as unsigned int

UIDs are printed in the proc_fs as signed int, whereas
they are unsigned int.

Signed-off-by: Francesco Fusco <ffusco@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/appletalk/atalk_proc.c | 2 +-
 net/ipv4/ping.c            | 2 +-
 net/ipv4/raw.c             | 2 +-
 net/ipv4/tcp_ipv4.c        | 4 ++--
 net/ipv4/udp.c             | 2 +-
 net/ipv6/datagram.c        | 2 +-
 net/ipv6/tcp_ipv6.c        | 4 ++--
 net/ipx/ipx_proc.c         | 2 +-
 net/llc/llc_proc.c         | 2 +-
 net/phonet/socket.c        | 2 +-
 net/sctp/proc.c            | 4 ++--
 11 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/appletalk/atalk_proc.c b/net/appletalk/atalk_proc.c
index c30f3a0717fb..af46bc49e1e9 100644
--- a/net/appletalk/atalk_proc.c
+++ b/net/appletalk/atalk_proc.c
@@ -178,7 +178,7 @@ static int atalk_seq_socket_show(struct seq_file *seq, void *v)
 	at = at_sk(s);
 
 	seq_printf(seq, "%02X   %04X:%02X:%02X  %04X:%02X:%02X  %08X:%08X "
-			"%02X %d\n",
+			"%02X %u\n",
 		   s->sk_type, ntohs(at->src_net), at->src_node, at->src_port,
 		   ntohs(at->dest_net), at->dest_node, at->dest_port,
 		   sk_wmem_alloc_get(s),
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 746427c9e719..d7d9882d4cae 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -1082,7 +1082,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
 	__u16 srcp = ntohs(inet->inet_sport);
 
 	seq_printf(f, "%5d: %08X:%04X %08X:%04X"
-		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n",
+		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n",
 		bucket, src, srcp, dest, destp, sp->sk_state,
 		sk_wmem_alloc_get(sp),
 		sk_rmem_alloc_get(sp),
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index dd44e0ab600c..41d84505a922 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -987,7 +987,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
 	      srcp  = inet->inet_num;
 
 	seq_printf(seq, "%4d: %08X:%04X %08X:%04X"
-		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d\n",
+		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d\n",
 		i, src, srcp, dest, destp, sp->sk_state,
 		sk_wmem_alloc_get(sp),
 		sk_rmem_alloc_get(sp),
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ec2702882d8d..05a3d45d3102 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2608,7 +2608,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req,
 	long delta = req->expires - jiffies;
 
 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
-		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
+		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK%n",
 		i,
 		ireq->loc_addr,
 		ntohs(inet_sk(sk)->inet_sport),
@@ -2666,7 +2666,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
 
 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
-			"%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
+			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d%n",
 		i, src, srcp, dest, destp, sk->sk_state,
 		tp->write_seq - tp->snd_una,
 		rx_queue,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 9e88af0e8ab0..0b24508bcdc4 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2159,7 +2159,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
 	__u16 srcp	  = ntohs(inet->inet_sport);
 
 	seq_printf(f, "%5d: %08X:%04X %08X:%04X"
-		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n",
+		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n",
 		bucket, src, srcp, dest, destp, sp->sk_state,
 		sk_wmem_alloc_get(sp),
 		sk_rmem_alloc_get(sp),
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 197e6f4a2b74..48b6bd2a9a14 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -890,7 +890,7 @@ void ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp,
 	src   = &np->rcv_saddr;
 	seq_printf(seq,
 		   "%5d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
-		   "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d\n",
+		   "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d\n",
 		   bucket,
 		   src->s6_addr32[0], src->s6_addr32[1],
 		   src->s6_addr32[2], src->s6_addr32[3], srcp,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 38c196ca6011..5bcfadf09e95 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1731,7 +1731,7 @@ static void get_openreq6(struct seq_file *seq,
 
 	seq_printf(seq,
 		   "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
-		   "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK\n",
+		   "%02X %08X:%08X %02X:%08lX %08X %5u %8d %d %d %pK\n",
 		   i,
 		   src->s6_addr32[0], src->s6_addr32[1],
 		   src->s6_addr32[2], src->s6_addr32[3],
@@ -1782,7 +1782,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
 
 	seq_printf(seq,
 		   "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
-		   "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %lu %lu %u %u %d\n",
+		   "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %lu %lu %u %u %d\n",
 		   i,
 		   src->s6_addr32[0], src->s6_addr32[1],
 		   src->s6_addr32[2], src->s6_addr32[3], srcp,
diff --git a/net/ipx/ipx_proc.c b/net/ipx/ipx_proc.c
index 65e8833a2510..e15c16a517e7 100644
--- a/net/ipx/ipx_proc.c
+++ b/net/ipx/ipx_proc.c
@@ -213,7 +213,7 @@ static int ipx_seq_socket_show(struct seq_file *seq, void *v)
 			   ntohs(ipxs->dest_addr.sock));
 	}
 
-	seq_printf(seq, "%08X  %08X  %02X     %03d\n",
+	seq_printf(seq, "%08X  %08X  %02X     %03u\n",
 		   sk_wmem_alloc_get(s),
 		   sk_rmem_alloc_get(s),
 		   s->sk_state,
diff --git a/net/llc/llc_proc.c b/net/llc/llc_proc.c
index 7b4799cfbf8d..1a3c7e0f5d0d 100644
--- a/net/llc/llc_proc.c
+++ b/net/llc/llc_proc.c
@@ -147,7 +147,7 @@ static int llc_seq_socket_show(struct seq_file *seq, void *v)
 	}
 	seq_printf(seq, "@%02X ", llc->sap->laddr.lsap);
 	llc_ui_format_mac(seq, llc->daddr.mac);
-	seq_printf(seq, "@%02X %8d %8d %2d %3d %4d\n", llc->daddr.lsap,
+	seq_printf(seq, "@%02X %8d %8d %2d %3u %4d\n", llc->daddr.lsap,
 		   sk_wmem_alloc_get(sk),
 		   sk_rmem_alloc_get(sk) - llc->copied_seq,
 		   sk->sk_state,
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index 1afd1381cdc7..77e38f733496 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -793,7 +793,7 @@ static int pn_res_seq_show(struct seq_file *seq, void *v)
 		struct sock **psk = v;
 		struct sock *sk = *psk;
 
-		seq_printf(seq, "%02X %5d %lu%n",
+		seq_printf(seq, "%02X %5u %lu%n",
 			   (int) (psk - pnres.sk),
 			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)),
 			   sock_i_ino(sk), &len);
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 82432bfb742f..0c0642156842 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -226,7 +226,7 @@ static int sctp_eps_seq_show(struct seq_file *seq, void *v)
 		sk = epb->sk;
 		if (!net_eq(sock_net(sk), seq_file_net(seq)))
 			continue;
-		seq_printf(seq, "%8pK %8pK %-3d %-3d %-4d %-5d %5d %5lu ", ep, sk,
+		seq_printf(seq, "%8pK %8pK %-3d %-3d %-4d %-5d %5u %5lu ", ep, sk,
 			   sctp_sk(sk)->type, sk->sk_state, hash,
 			   epb->bind_addr.port,
 			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)),
@@ -336,7 +336,7 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v)
 			continue;
 		seq_printf(seq,
 			   "%8pK %8pK %-3d %-3d %-2d %-4d "
-			   "%4d %8d %8d %7d %5lu %-5d %5d ",
+			   "%4d %8d %8d %7u %5lu %-5d %5d ",
 			   assoc, sk, sctp_sk(sk)->type, sk->sk_state,
 			   assoc->state, hash,
 			   assoc->assoc_id,
-- 
cgit v1.2.3


From 16b304f3404f8e0243d5ee2b70b68767b7b59b2b Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@nicira.com>
Date: Thu, 15 Aug 2013 15:31:06 -0700
Subject: netlink: Eliminate kmalloc in netlink dump operation.

Following patch stores struct netlink_callback in netlink_sock
to avoid allocating and freeing it on every netlink dump msg.
Only one dump operation is allowed for a given socket at a time
therefore we can safely convert cb pointer to cb struct inside
netlink_sock.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlink/af_netlink.c | 97 +++++++++++++++++++++---------------------------
 net/netlink/af_netlink.h |  3 +-
 2 files changed, 44 insertions(+), 56 deletions(-)

(limited to 'net')

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 6273772aa30c..a17dda1bbee0 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -595,7 +595,7 @@ static unsigned int netlink_poll(struct file *file, struct socket *sock,
 		 * for dumps is performed here. A dump is allowed to continue
 		 * if at least half the ring is unused.
 		 */
-		while (nlk->cb != NULL && netlink_dump_space(nlk)) {
+		while (nlk->cb_running && netlink_dump_space(nlk)) {
 			err = netlink_dump(sk);
 			if (err < 0) {
 				sk->sk_err = err;
@@ -802,18 +802,6 @@ static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb)
 #define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, siocb)	0
 #endif /* CONFIG_NETLINK_MMAP */
 
-static void netlink_destroy_callback(struct netlink_callback *cb)
-{
-	kfree_skb(cb->skb);
-	kfree(cb);
-}
-
-static void netlink_consume_callback(struct netlink_callback *cb)
-{
-	consume_skb(cb->skb);
-	kfree(cb);
-}
-
 static void netlink_skb_destructor(struct sk_buff *skb)
 {
 #ifdef CONFIG_NETLINK_MMAP
@@ -872,12 +860,12 @@ static void netlink_sock_destruct(struct sock *sk)
 {
 	struct netlink_sock *nlk = nlk_sk(sk);
 
-	if (nlk->cb) {
-		if (nlk->cb->done)
-			nlk->cb->done(nlk->cb);
+	if (nlk->cb_running) {
+		if (nlk->cb.done)
+			nlk->cb.done(&nlk->cb);
 
-		module_put(nlk->cb->module);
-		netlink_destroy_callback(nlk->cb);
+		module_put(nlk->cb.module);
+		kfree_skb(nlk->cb.skb);
 	}
 
 	skb_queue_purge(&sk->sk_receive_queue);
@@ -2350,7 +2338,8 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
 
 	skb_free_datagram(sk, skb);
 
-	if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
+	if (nlk->cb_running &&
+	    atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
 		ret = netlink_dump(sk);
 		if (ret) {
 			sk->sk_err = ret;
@@ -2566,13 +2555,12 @@ static int netlink_dump(struct sock *sk)
 	int alloc_size;
 
 	mutex_lock(nlk->cb_mutex);
-
-	cb = nlk->cb;
-	if (cb == NULL) {
+	if (!nlk->cb_running) {
 		err = -EINVAL;
 		goto errout_skb;
 	}
 
+	cb = &nlk->cb;
 	alloc_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE);
 
 	if (!netlink_rx_is_mmaped(sk) &&
@@ -2610,11 +2598,11 @@ static int netlink_dump(struct sock *sk)
 
 	if (cb->done)
 		cb->done(cb);
-	nlk->cb = NULL;
-	mutex_unlock(nlk->cb_mutex);
 
+	nlk->cb_running = false;
+	mutex_unlock(nlk->cb_mutex);
 	module_put(cb->module);
-	netlink_consume_callback(cb);
+	consume_skb(cb->skb);
 	return 0;
 
 errout_skb:
@@ -2632,59 +2620,51 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
 	struct netlink_sock *nlk;
 	int ret;
 
-	cb = kzalloc(sizeof(*cb), GFP_KERNEL);
-	if (cb == NULL)
-		return -ENOBUFS;
-
 	/* Memory mapped dump requests need to be copied to avoid looping
 	 * on the pending state in netlink_mmap_sendmsg() while the CB hold
 	 * a reference to the skb.
 	 */
 	if (netlink_skb_is_mmaped(skb)) {
 		skb = skb_copy(skb, GFP_KERNEL);
-		if (skb == NULL) {
-			kfree(cb);
+		if (skb == NULL)
 			return -ENOBUFS;
-		}
 	} else
 		atomic_inc(&skb->users);
 
-	cb->dump = control->dump;
-	cb->done = control->done;
-	cb->nlh = nlh;
-	cb->data = control->data;
-	cb->module = control->module;
-	cb->min_dump_alloc = control->min_dump_alloc;
-	cb->skb = skb;
-
 	sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid);
 	if (sk == NULL) {
-		netlink_destroy_callback(cb);
-		return -ECONNREFUSED;
+		ret = -ECONNREFUSED;
+		goto error_free;
 	}
-	nlk = nlk_sk(sk);
 
+	nlk = nlk_sk(sk);
 	mutex_lock(nlk->cb_mutex);
 	/* A dump is in progress... */
-	if (nlk->cb) {
-		mutex_unlock(nlk->cb_mutex);
-		netlink_destroy_callback(cb);
+	if (nlk->cb_running) {
 		ret = -EBUSY;
-		goto out;
+		goto error_unlock;
 	}
 	/* add reference of module which cb->dump belongs to */
-	if (!try_module_get(cb->module)) {
-		mutex_unlock(nlk->cb_mutex);
-		netlink_destroy_callback(cb);
+	if (!try_module_get(control->module)) {
 		ret = -EPROTONOSUPPORT;
-		goto out;
+		goto error_unlock;
 	}
 
-	nlk->cb = cb;
+	cb = &nlk->cb;
+	memset(cb, 0, sizeof(*cb));
+	cb->dump = control->dump;
+	cb->done = control->done;
+	cb->nlh = nlh;
+	cb->data = control->data;
+	cb->module = control->module;
+	cb->min_dump_alloc = control->min_dump_alloc;
+	cb->skb = skb;
+
+	nlk->cb_running = true;
+
 	mutex_unlock(nlk->cb_mutex);
 
 	ret = netlink_dump(sk);
-out:
 	sock_put(sk);
 
 	if (ret)
@@ -2694,6 +2674,13 @@ out:
 	 * signal not to send ACK even if it was requested.
 	 */
 	return -EINTR;
+
+error_unlock:
+	sock_put(sk);
+	mutex_unlock(nlk->cb_mutex);
+error_free:
+	kfree_skb(skb);
+	return ret;
 }
 EXPORT_SYMBOL(__netlink_dump_start);
 
@@ -2916,14 +2903,14 @@ static int netlink_seq_show(struct seq_file *seq, void *v)
 		struct sock *s = v;
 		struct netlink_sock *nlk = nlk_sk(s);
 
-		seq_printf(seq, "%pK %-3d %-6u %08x %-8d %-8d %pK %-8d %-8d %-8lu\n",
+		seq_printf(seq, "%pK %-3d %-6u %08x %-8d %-8d %d %-8d %-8d %-8lu\n",
 			   s,
 			   s->sk_protocol,
 			   nlk->portid,
 			   nlk->groups ? (u32)nlk->groups[0] : 0,
 			   sk_rmem_alloc_get(s),
 			   sk_wmem_alloc_get(s),
-			   nlk->cb,
+			   nlk->cb_running,
 			   atomic_read(&s->sk_refcnt),
 			   atomic_read(&s->sk_drops),
 			   sock_i_ino(s)
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
index eaa88d187cdc..acbd774eeb7c 100644
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -32,7 +32,8 @@ struct netlink_sock {
 	unsigned long		*groups;
 	unsigned long		state;
 	wait_queue_head_t	wait;
-	struct netlink_callback	*cb;
+	bool			cb_running;
+	struct netlink_callback	cb;
 	struct mutex		*cb_mutex;
 	struct mutex		cb_def_mutex;
 	void			(*netlink_rcv)(struct sk_buff *skb);
-- 
cgit v1.2.3


From 99565a6c471cbb66caa68347c195133017559943 Mon Sep 17 00:00:00 2001
From: Fan Du <fan.du@windriver.com>
Date: Thu, 15 Aug 2013 15:49:06 +0800
Subject: xfrm: Make xfrm_state timer monotonic

xfrm_state timer should be independent of system clock change,
so switch to CLOCK_BOOTTIME base which is not only monotonic but
also counting suspend time.

Thus issue reported in commit: 9e0d57fd6dad37d72a3ca6db00ca8c76f2215454
("xfrm: SAD entries do not expire correctly after suspend-resume")
could ALSO be avoided.

v2: Use CLOCK_BOOTTIME to count suspend time, but still monotonic.

Signed-off-by: Fan Du <fan.du@windriver.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_state.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index b2cd806c0be3..4f8ace855864 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -499,7 +499,8 @@ struct xfrm_state *xfrm_state_alloc(struct net *net)
 		INIT_HLIST_NODE(&x->bydst);
 		INIT_HLIST_NODE(&x->bysrc);
 		INIT_HLIST_NODE(&x->byspi);
-		tasklet_hrtimer_init(&x->mtimer, xfrm_timer_handler, CLOCK_REALTIME, HRTIMER_MODE_ABS);
+		tasklet_hrtimer_init(&x->mtimer, xfrm_timer_handler,
+					CLOCK_BOOTTIME, HRTIMER_MODE_ABS);
 		setup_timer(&x->rtimer, xfrm_replay_timer_handler,
 				(unsigned long)x);
 		x->curlft.add_time = get_seconds();
-- 
cgit v1.2.3


From 86c228a7627f3f2776893da47592234140fbfba8 Mon Sep 17 00:00:00 2001
From: Johan Almbladh <ja@anyfi.net>
Date: Wed, 14 Aug 2013 15:29:46 +0200
Subject: mac80211: perform power save processing before decryption

This patch decouples the power save processing from the frame decryption
by running the decrypt rx handler after sta_process. In the case where
the decryption failed for some reason, the stack used to not process
the PM and MOREDATA bits for that frame. The stack now always performs
power save processing regardless of the decryption result. That means that
encrypted data frames and NULLFUNC frames are now handled in the same way
regarding power save processing, making the stack more robust.

Signed-off-by: Johan Almbladh <ja@anyfi.net>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rx.c | 404 +++++++++++++++++++++++++++---------------------------
 1 file changed, 202 insertions(+), 202 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 0ac75127b7d2..ffad155316a9 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1048,207 +1048,6 @@ ieee80211_rx_h_check(struct ieee80211_rx_data *rx)
 }
 
 
-static ieee80211_rx_result debug_noinline
-ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
-{
-	struct sk_buff *skb = rx->skb;
-	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
-	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
-	int keyidx;
-	int hdrlen;
-	ieee80211_rx_result result = RX_DROP_UNUSABLE;
-	struct ieee80211_key *sta_ptk = NULL;
-	int mmie_keyidx = -1;
-	__le16 fc;
-
-	/*
-	 * Key selection 101
-	 *
-	 * There are four types of keys:
-	 *  - GTK (group keys)
-	 *  - IGTK (group keys for management frames)
-	 *  - PTK (pairwise keys)
-	 *  - STK (station-to-station pairwise keys)
-	 *
-	 * When selecting a key, we have to distinguish between multicast
-	 * (including broadcast) and unicast frames, the latter can only
-	 * use PTKs and STKs while the former always use GTKs and IGTKs.
-	 * Unless, of course, actual WEP keys ("pre-RSNA") are used, then
-	 * unicast frames can also use key indices like GTKs. Hence, if we
-	 * don't have a PTK/STK we check the key index for a WEP key.
-	 *
-	 * Note that in a regular BSS, multicast frames are sent by the
-	 * AP only, associated stations unicast the frame to the AP first
-	 * which then multicasts it on their behalf.
-	 *
-	 * There is also a slight problem in IBSS mode: GTKs are negotiated
-	 * with each station, that is something we don't currently handle.
-	 * The spec seems to expect that one negotiates the same key with
-	 * every station but there's no such requirement; VLANs could be
-	 * possible.
-	 */
-
-	/*
-	 * No point in finding a key and decrypting if the frame is neither
-	 * addressed to us nor a multicast frame.
-	 */
-	if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
-		return RX_CONTINUE;
-
-	/* start without a key */
-	rx->key = NULL;
-
-	if (rx->sta)
-		sta_ptk = rcu_dereference(rx->sta->ptk);
-
-	fc = hdr->frame_control;
-
-	if (!ieee80211_has_protected(fc))
-		mmie_keyidx = ieee80211_get_mmie_keyidx(rx->skb);
-
-	if (!is_multicast_ether_addr(hdr->addr1) && sta_ptk) {
-		rx->key = sta_ptk;
-		if ((status->flag & RX_FLAG_DECRYPTED) &&
-		    (status->flag & RX_FLAG_IV_STRIPPED))
-			return RX_CONTINUE;
-		/* Skip decryption if the frame is not protected. */
-		if (!ieee80211_has_protected(fc))
-			return RX_CONTINUE;
-	} else if (mmie_keyidx >= 0) {
-		/* Broadcast/multicast robust management frame / BIP */
-		if ((status->flag & RX_FLAG_DECRYPTED) &&
-		    (status->flag & RX_FLAG_IV_STRIPPED))
-			return RX_CONTINUE;
-
-		if (mmie_keyidx < NUM_DEFAULT_KEYS ||
-		    mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS)
-			return RX_DROP_MONITOR; /* unexpected BIP keyidx */
-		if (rx->sta)
-			rx->key = rcu_dereference(rx->sta->gtk[mmie_keyidx]);
-		if (!rx->key)
-			rx->key = rcu_dereference(rx->sdata->keys[mmie_keyidx]);
-	} else if (!ieee80211_has_protected(fc)) {
-		/*
-		 * The frame was not protected, so skip decryption. However, we
-		 * need to set rx->key if there is a key that could have been
-		 * used so that the frame may be dropped if encryption would
-		 * have been expected.
-		 */
-		struct ieee80211_key *key = NULL;
-		struct ieee80211_sub_if_data *sdata = rx->sdata;
-		int i;
-
-		if (ieee80211_is_mgmt(fc) &&
-		    is_multicast_ether_addr(hdr->addr1) &&
-		    (key = rcu_dereference(rx->sdata->default_mgmt_key)))
-			rx->key = key;
-		else {
-			if (rx->sta) {
-				for (i = 0; i < NUM_DEFAULT_KEYS; i++) {
-					key = rcu_dereference(rx->sta->gtk[i]);
-					if (key)
-						break;
-				}
-			}
-			if (!key) {
-				for (i = 0; i < NUM_DEFAULT_KEYS; i++) {
-					key = rcu_dereference(sdata->keys[i]);
-					if (key)
-						break;
-				}
-			}
-			if (key)
-				rx->key = key;
-		}
-		return RX_CONTINUE;
-	} else {
-		u8 keyid;
-		/*
-		 * The device doesn't give us the IV so we won't be
-		 * able to look up the key. That's ok though, we
-		 * don't need to decrypt the frame, we just won't
-		 * be able to keep statistics accurate.
-		 * Except for key threshold notifications, should
-		 * we somehow allow the driver to tell us which key
-		 * the hardware used if this flag is set?
-		 */
-		if ((status->flag & RX_FLAG_DECRYPTED) &&
-		    (status->flag & RX_FLAG_IV_STRIPPED))
-			return RX_CONTINUE;
-
-		hdrlen = ieee80211_hdrlen(fc);
-
-		if (rx->skb->len < 8 + hdrlen)
-			return RX_DROP_UNUSABLE; /* TODO: count this? */
-
-		/*
-		 * no need to call ieee80211_wep_get_keyidx,
-		 * it verifies a bunch of things we've done already
-		 */
-		skb_copy_bits(rx->skb, hdrlen + 3, &keyid, 1);
-		keyidx = keyid >> 6;
-
-		/* check per-station GTK first, if multicast packet */
-		if (is_multicast_ether_addr(hdr->addr1) && rx->sta)
-			rx->key = rcu_dereference(rx->sta->gtk[keyidx]);
-
-		/* if not found, try default key */
-		if (!rx->key) {
-			rx->key = rcu_dereference(rx->sdata->keys[keyidx]);
-
-			/*
-			 * RSNA-protected unicast frames should always be
-			 * sent with pairwise or station-to-station keys,
-			 * but for WEP we allow using a key index as well.
-			 */
-			if (rx->key &&
-			    rx->key->conf.cipher != WLAN_CIPHER_SUITE_WEP40 &&
-			    rx->key->conf.cipher != WLAN_CIPHER_SUITE_WEP104 &&
-			    !is_multicast_ether_addr(hdr->addr1))
-				rx->key = NULL;
-		}
-	}
-
-	if (rx->key) {
-		if (unlikely(rx->key->flags & KEY_FLAG_TAINTED))
-			return RX_DROP_MONITOR;
-
-		rx->key->tx_rx_count++;
-		/* TODO: add threshold stuff again */
-	} else {
-		return RX_DROP_MONITOR;
-	}
-
-	switch (rx->key->conf.cipher) {
-	case WLAN_CIPHER_SUITE_WEP40:
-	case WLAN_CIPHER_SUITE_WEP104:
-		result = ieee80211_crypto_wep_decrypt(rx);
-		break;
-	case WLAN_CIPHER_SUITE_TKIP:
-		result = ieee80211_crypto_tkip_decrypt(rx);
-		break;
-	case WLAN_CIPHER_SUITE_CCMP:
-		result = ieee80211_crypto_ccmp_decrypt(rx);
-		break;
-	case WLAN_CIPHER_SUITE_AES_CMAC:
-		result = ieee80211_crypto_aes_cmac_decrypt(rx);
-		break;
-	default:
-		/*
-		 * We can reach here only with HW-only algorithms
-		 * but why didn't it decrypt the frame?!
-		 */
-		return RX_DROP_UNUSABLE;
-	}
-
-	/* the hdr variable is invalid after the decrypt handlers */
-
-	/* either the frame has been decrypted or will be dropped */
-	status->flag |= RX_FLAG_DECRYPTED;
-
-	return result;
-}
-
 static ieee80211_rx_result debug_noinline
 ieee80211_rx_h_check_more_data(struct ieee80211_rx_data *rx)
 {
@@ -1550,6 +1349,207 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
 	return RX_CONTINUE;
 } /* ieee80211_rx_h_sta_process */
 
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
+{
+	struct sk_buff *skb = rx->skb;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+	int keyidx;
+	int hdrlen;
+	ieee80211_rx_result result = RX_DROP_UNUSABLE;
+	struct ieee80211_key *sta_ptk = NULL;
+	int mmie_keyidx = -1;
+	__le16 fc;
+
+	/*
+	 * Key selection 101
+	 *
+	 * There are four types of keys:
+	 *  - GTK (group keys)
+	 *  - IGTK (group keys for management frames)
+	 *  - PTK (pairwise keys)
+	 *  - STK (station-to-station pairwise keys)
+	 *
+	 * When selecting a key, we have to distinguish between multicast
+	 * (including broadcast) and unicast frames, the latter can only
+	 * use PTKs and STKs while the former always use GTKs and IGTKs.
+	 * Unless, of course, actual WEP keys ("pre-RSNA") are used, then
+	 * unicast frames can also use key indices like GTKs. Hence, if we
+	 * don't have a PTK/STK we check the key index for a WEP key.
+	 *
+	 * Note that in a regular BSS, multicast frames are sent by the
+	 * AP only, associated stations unicast the frame to the AP first
+	 * which then multicasts it on their behalf.
+	 *
+	 * There is also a slight problem in IBSS mode: GTKs are negotiated
+	 * with each station, that is something we don't currently handle.
+	 * The spec seems to expect that one negotiates the same key with
+	 * every station but there's no such requirement; VLANs could be
+	 * possible.
+	 */
+
+	/*
+	 * No point in finding a key and decrypting if the frame is neither
+	 * addressed to us nor a multicast frame.
+	 */
+	if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
+		return RX_CONTINUE;
+
+	/* start without a key */
+	rx->key = NULL;
+
+	if (rx->sta)
+		sta_ptk = rcu_dereference(rx->sta->ptk);
+
+	fc = hdr->frame_control;
+
+	if (!ieee80211_has_protected(fc))
+		mmie_keyidx = ieee80211_get_mmie_keyidx(rx->skb);
+
+	if (!is_multicast_ether_addr(hdr->addr1) && sta_ptk) {
+		rx->key = sta_ptk;
+		if ((status->flag & RX_FLAG_DECRYPTED) &&
+		    (status->flag & RX_FLAG_IV_STRIPPED))
+			return RX_CONTINUE;
+		/* Skip decryption if the frame is not protected. */
+		if (!ieee80211_has_protected(fc))
+			return RX_CONTINUE;
+	} else if (mmie_keyidx >= 0) {
+		/* Broadcast/multicast robust management frame / BIP */
+		if ((status->flag & RX_FLAG_DECRYPTED) &&
+		    (status->flag & RX_FLAG_IV_STRIPPED))
+			return RX_CONTINUE;
+
+		if (mmie_keyidx < NUM_DEFAULT_KEYS ||
+		    mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS)
+			return RX_DROP_MONITOR; /* unexpected BIP keyidx */
+		if (rx->sta)
+			rx->key = rcu_dereference(rx->sta->gtk[mmie_keyidx]);
+		if (!rx->key)
+			rx->key = rcu_dereference(rx->sdata->keys[mmie_keyidx]);
+	} else if (!ieee80211_has_protected(fc)) {
+		/*
+		 * The frame was not protected, so skip decryption. However, we
+		 * need to set rx->key if there is a key that could have been
+		 * used so that the frame may be dropped if encryption would
+		 * have been expected.
+		 */
+		struct ieee80211_key *key = NULL;
+		struct ieee80211_sub_if_data *sdata = rx->sdata;
+		int i;
+
+		if (ieee80211_is_mgmt(fc) &&
+		    is_multicast_ether_addr(hdr->addr1) &&
+		    (key = rcu_dereference(rx->sdata->default_mgmt_key)))
+			rx->key = key;
+		else {
+			if (rx->sta) {
+				for (i = 0; i < NUM_DEFAULT_KEYS; i++) {
+					key = rcu_dereference(rx->sta->gtk[i]);
+					if (key)
+						break;
+				}
+			}
+			if (!key) {
+				for (i = 0; i < NUM_DEFAULT_KEYS; i++) {
+					key = rcu_dereference(sdata->keys[i]);
+					if (key)
+						break;
+				}
+			}
+			if (key)
+				rx->key = key;
+		}
+		return RX_CONTINUE;
+	} else {
+		u8 keyid;
+		/*
+		 * The device doesn't give us the IV so we won't be
+		 * able to look up the key. That's ok though, we
+		 * don't need to decrypt the frame, we just won't
+		 * be able to keep statistics accurate.
+		 * Except for key threshold notifications, should
+		 * we somehow allow the driver to tell us which key
+		 * the hardware used if this flag is set?
+		 */
+		if ((status->flag & RX_FLAG_DECRYPTED) &&
+		    (status->flag & RX_FLAG_IV_STRIPPED))
+			return RX_CONTINUE;
+
+		hdrlen = ieee80211_hdrlen(fc);
+
+		if (rx->skb->len < 8 + hdrlen)
+			return RX_DROP_UNUSABLE; /* TODO: count this? */
+
+		/*
+		 * no need to call ieee80211_wep_get_keyidx,
+		 * it verifies a bunch of things we've done already
+		 */
+		skb_copy_bits(rx->skb, hdrlen + 3, &keyid, 1);
+		keyidx = keyid >> 6;
+
+		/* check per-station GTK first, if multicast packet */
+		if (is_multicast_ether_addr(hdr->addr1) && rx->sta)
+			rx->key = rcu_dereference(rx->sta->gtk[keyidx]);
+
+		/* if not found, try default key */
+		if (!rx->key) {
+			rx->key = rcu_dereference(rx->sdata->keys[keyidx]);
+
+			/*
+			 * RSNA-protected unicast frames should always be
+			 * sent with pairwise or station-to-station keys,
+			 * but for WEP we allow using a key index as well.
+			 */
+			if (rx->key &&
+			    rx->key->conf.cipher != WLAN_CIPHER_SUITE_WEP40 &&
+			    rx->key->conf.cipher != WLAN_CIPHER_SUITE_WEP104 &&
+			    !is_multicast_ether_addr(hdr->addr1))
+				rx->key = NULL;
+		}
+	}
+
+	if (rx->key) {
+		if (unlikely(rx->key->flags & KEY_FLAG_TAINTED))
+			return RX_DROP_MONITOR;
+
+		rx->key->tx_rx_count++;
+		/* TODO: add threshold stuff again */
+	} else {
+		return RX_DROP_MONITOR;
+	}
+
+	switch (rx->key->conf.cipher) {
+	case WLAN_CIPHER_SUITE_WEP40:
+	case WLAN_CIPHER_SUITE_WEP104:
+		result = ieee80211_crypto_wep_decrypt(rx);
+		break;
+	case WLAN_CIPHER_SUITE_TKIP:
+		result = ieee80211_crypto_tkip_decrypt(rx);
+		break;
+	case WLAN_CIPHER_SUITE_CCMP:
+		result = ieee80211_crypto_ccmp_decrypt(rx);
+		break;
+	case WLAN_CIPHER_SUITE_AES_CMAC:
+		result = ieee80211_crypto_aes_cmac_decrypt(rx);
+		break;
+	default:
+		/*
+		 * We can reach here only with HW-only algorithms
+		 * but why didn't it decrypt the frame?!
+		 */
+		return RX_DROP_UNUSABLE;
+	}
+
+	/* the hdr variable is invalid after the decrypt handlers */
+
+	/* either the frame has been decrypted or will be dropped */
+	status->flag |= RX_FLAG_DECRYPTED;
+
+	return result;
+}
+
 static inline struct ieee80211_fragment_entry *
 ieee80211_reassemble_add(struct ieee80211_sub_if_data *sdata,
 			 unsigned int frag, unsigned int seq, int rx_queue,
@@ -2933,10 +2933,10 @@ static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx,
 		 */
 		rx->skb = skb;
 
-		CALL_RXH(ieee80211_rx_h_decrypt)
 		CALL_RXH(ieee80211_rx_h_check_more_data)
 		CALL_RXH(ieee80211_rx_h_uapsd_and_pspoll)
 		CALL_RXH(ieee80211_rx_h_sta_process)
+		CALL_RXH(ieee80211_rx_h_decrypt)
 		CALL_RXH(ieee80211_rx_h_defragment)
 		CALL_RXH(ieee80211_rx_h_michael_mic_verify)
 		/* must be after MMIC verify so header is counted in MPDU mic */
-- 
cgit v1.2.3


From d51b70ff5122d31e27733ba03c3afd62bb86bd63 Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <simon.wunderlich@s2003.tu-chemnitz.de>
Date: Fri, 9 Aug 2013 16:35:17 +0200
Subject: mac80211: move ibss presp generation in own function

Channel Switch will later require to generate beacons without setting
them immediately. Therefore split the presp generation in an own
function. Splitting the original very long function might be a good idea
anyway.

Signed-off-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: Mathias Kretschmer <mathias.kretschmer@fokus.fraunhofer.de>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/ibss.c | 185 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 109 insertions(+), 76 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index 79e294e9b5cc..74de0f10558a 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -34,13 +34,12 @@
 
 #define IEEE80211_IBSS_MAX_STA_ENTRIES 128
 
-
-static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
-				      const u8 *bssid, const int beacon_int,
-				      struct ieee80211_channel *chan,
-				      const u32 basic_rates,
-				      const u16 capability, u64 tsf,
-				      bool creator)
+static struct beacon_data *
+ieee80211_ibss_build_presp(struct ieee80211_sub_if_data *sdata,
+			   const int beacon_int, const u32 basic_rates,
+			   const u16 capability, u64 tsf,
+			   struct cfg80211_chan_def *chandef,
+			   bool *have_higher_than_11mbit)
 {
 	struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
 	struct ieee80211_local *local = sdata->local;
@@ -48,70 +47,11 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_mgmt *mgmt;
 	u8 *pos;
 	struct ieee80211_supported_band *sband;
-	struct cfg80211_bss *bss;
-	u32 bss_change, rate_flags, rates = 0, rates_added = 0;
-	struct cfg80211_chan_def chandef;
-	enum nl80211_bss_scan_width scan_width;
-	bool have_higher_than_11mbit = false;
+	u32 rate_flags, rates = 0, rates_added = 0;
 	struct beacon_data *presp;
 	int frame_len;
 	int shift;
 
-	sdata_assert_lock(sdata);
-
-	/* Reset own TSF to allow time synchronization work. */
-	drv_reset_tsf(local, sdata);
-
-	if (!ether_addr_equal(ifibss->bssid, bssid))
-		sta_info_flush(sdata);
-
-	/* if merging, indicate to driver that we leave the old IBSS */
-	if (sdata->vif.bss_conf.ibss_joined) {
-		sdata->vif.bss_conf.ibss_joined = false;
-		sdata->vif.bss_conf.ibss_creator = false;
-		sdata->vif.bss_conf.enable_beacon = false;
-		netif_carrier_off(sdata->dev);
-		ieee80211_bss_info_change_notify(sdata,
-						 BSS_CHANGED_IBSS |
-						 BSS_CHANGED_BEACON_ENABLED);
-	}
-
-	presp = rcu_dereference_protected(ifibss->presp,
-					  lockdep_is_held(&sdata->wdev.mtx));
-	rcu_assign_pointer(ifibss->presp, NULL);
-	if (presp)
-		kfree_rcu(presp, rcu_head);
-
-	sdata->drop_unencrypted = capability & WLAN_CAPABILITY_PRIVACY ? 1 : 0;
-
-	chandef = ifibss->chandef;
-	if (!cfg80211_reg_can_beacon(local->hw.wiphy, &chandef)) {
-		if (chandef.width == NL80211_CHAN_WIDTH_5 ||
-		    chandef.width == NL80211_CHAN_WIDTH_10 ||
-		    chandef.width == NL80211_CHAN_WIDTH_20_NOHT ||
-		    chandef.width == NL80211_CHAN_WIDTH_20) {
-			sdata_info(sdata,
-				   "Failed to join IBSS, beacons forbidden\n");
-			return;
-		}
-		chandef.width = NL80211_CHAN_WIDTH_20;
-		chandef.center_freq1 = chan->center_freq;
-	}
-
-	ieee80211_vif_release_channel(sdata);
-	if (ieee80211_vif_use_channel(sdata, &chandef,
-				      ifibss->fixed_channel ?
-					IEEE80211_CHANCTX_SHARED :
-					IEEE80211_CHANCTX_EXCLUSIVE)) {
-		sdata_info(sdata, "Failed to join IBSS, no channel context\n");
-		return;
-	}
-
-	memcpy(ifibss->bssid, bssid, ETH_ALEN);
-
-	sband = local->hw.wiphy->bands[chan->band];
-	shift = ieee80211_vif_get_shift(&sdata->vif);
-
 	/* Build IBSS probe response */
 	frame_len = sizeof(struct ieee80211_hdr_3addr) +
 		    12 /* struct ieee80211_mgmt.u.beacon */ +
@@ -125,7 +65,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 		    ifibss->ie_len;
 	presp = kzalloc(sizeof(*presp) + frame_len, GFP_KERNEL);
 	if (!presp)
-		return;
+		return NULL;
 
 	presp->head = (void *)(presp + 1);
 
@@ -146,12 +86,19 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	memcpy(pos, ifibss->ssid, ifibss->ssid_len);
 	pos += ifibss->ssid_len;
 
-	rate_flags = ieee80211_chandef_rate_flags(&chandef);
+	sband = local->hw.wiphy->bands[chandef->chan->band];
+	rate_flags = ieee80211_chandef_rate_flags(chandef);
+	shift = ieee80211_chandef_get_shift(chandef);
+	rates_n = 0;
+	if (have_higher_than_11mbit)
+		*have_higher_than_11mbit = false;
+
 	for (i = 0; i < sband->n_bitrates; i++) {
 		if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
 			continue;
-		if (sband->bitrates[i].bitrate > 110)
-			have_higher_than_11mbit = true;
+		if (sband->bitrates[i].bitrate > 110 &&
+		    have_higher_than_11mbit)
+			*have_higher_than_11mbit = true;
 
 		rates |= BIT(i);
 		rates_n++;
@@ -178,7 +125,8 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	if (sband->band == IEEE80211_BAND_2GHZ) {
 		*pos++ = WLAN_EID_DS_PARAMS;
 		*pos++ = 1;
-		*pos++ = ieee80211_frequency_to_channel(chan->center_freq);
+		*pos++ = ieee80211_frequency_to_channel(
+				chandef->chan->center_freq);
 	}
 
 	*pos++ = WLAN_EID_IBSS_PARAMS;
@@ -210,9 +158,9 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	}
 
 	/* add HT capability and information IEs */
-	if (chandef.width != NL80211_CHAN_WIDTH_20_NOHT &&
-	    chandef.width != NL80211_CHAN_WIDTH_5 &&
-	    chandef.width != NL80211_CHAN_WIDTH_10 &&
+	if (chandef->width != NL80211_CHAN_WIDTH_20_NOHT &&
+	    chandef->width != NL80211_CHAN_WIDTH_5 &&
+	    chandef->width != NL80211_CHAN_WIDTH_10 &&
 	    sband->ht_cap.ht_supported) {
 		struct ieee80211_sta_ht_cap ht_cap;
 
@@ -226,7 +174,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 		 * keep them at 0
 		 */
 		pos = ieee80211_ie_build_ht_oper(pos, &sband->ht_cap,
-						 &chandef, 0);
+						 chandef, 0);
 	}
 
 	if (local->hw.queues >= IEEE80211_NUM_ACS) {
@@ -243,9 +191,94 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 
 	presp->head_len = pos - presp->head;
 	if (WARN_ON(presp->head_len > frame_len))
+		goto error;
+
+	return presp;
+error:
+	kfree(presp);
+	return NULL;
+}
+
+static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
+				      const u8 *bssid, const int beacon_int,
+				      struct ieee80211_channel *chan,
+				      const u32 basic_rates,
+				      const u16 capability, u64 tsf,
+				      bool creator)
+{
+	struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_supported_band *sband;
+	struct ieee80211_mgmt *mgmt;
+	struct cfg80211_bss *bss;
+	u32 bss_change;
+	struct cfg80211_chan_def chandef;
+	struct beacon_data *presp;
+	enum nl80211_bss_scan_width scan_width;
+	bool have_higher_than_11mbit;
+
+	sdata_assert_lock(sdata);
+
+	/* Reset own TSF to allow time synchronization work. */
+	drv_reset_tsf(local, sdata);
+
+	if (!ether_addr_equal(ifibss->bssid, bssid))
+		sta_info_flush(sdata);
+
+	/* if merging, indicate to driver that we leave the old IBSS */
+	if (sdata->vif.bss_conf.ibss_joined) {
+		sdata->vif.bss_conf.ibss_joined = false;
+		sdata->vif.bss_conf.ibss_creator = false;
+		sdata->vif.bss_conf.enable_beacon = false;
+		netif_carrier_off(sdata->dev);
+		ieee80211_bss_info_change_notify(sdata,
+						 BSS_CHANGED_IBSS |
+						 BSS_CHANGED_BEACON_ENABLED);
+	}
+
+	presp = rcu_dereference_protected(ifibss->presp,
+					  lockdep_is_held(&sdata->wdev.mtx));
+	rcu_assign_pointer(ifibss->presp, NULL);
+	if (presp)
+		kfree_rcu(presp, rcu_head);
+
+	sdata->drop_unencrypted = capability & WLAN_CAPABILITY_PRIVACY ? 1 : 0;
+
+	chandef = ifibss->chandef;
+	if (!cfg80211_reg_can_beacon(local->hw.wiphy, &chandef)) {
+		if (chandef.width == NL80211_CHAN_WIDTH_5 ||
+		    chandef.width == NL80211_CHAN_WIDTH_10 ||
+		    chandef.width == NL80211_CHAN_WIDTH_20_NOHT ||
+		    chandef.width == NL80211_CHAN_WIDTH_20) {
+			sdata_info(sdata,
+				   "Failed to join IBSS, beacons forbidden\n");
+			return;
+		}
+		chandef.width = NL80211_CHAN_WIDTH_20;
+		chandef.center_freq1 = chan->center_freq;
+	}
+
+	ieee80211_vif_release_channel(sdata);
+	if (ieee80211_vif_use_channel(sdata, &chandef,
+				      ifibss->fixed_channel ?
+					IEEE80211_CHANCTX_SHARED :
+					IEEE80211_CHANCTX_EXCLUSIVE)) {
+		sdata_info(sdata, "Failed to join IBSS, no channel context\n");
+		return;
+	}
+
+	memcpy(ifibss->bssid, bssid, ETH_ALEN);
+
+	sband = local->hw.wiphy->bands[chan->band];
+
+	presp = ieee80211_ibss_build_presp(sdata, beacon_int, basic_rates,
+					   capability, tsf, &chandef,
+					   &have_higher_than_11mbit);
+	if (!presp)
 		return;
 
 	rcu_assign_pointer(ifibss->presp, presp);
+	mgmt = (void *)presp->head;
 
 	sdata->vif.bss_conf.enable_beacon = true;
 	sdata->vif.bss_conf.beacon_int = beacon_int;
-- 
cgit v1.2.3


From 27b3eb9c06a7193bdc9800cd00764a130343bc8a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 7 Aug 2013 20:11:55 +0200
Subject: mac80211: add APIs to allow keeping connections after WoWLAN

In order to be able to (securely) keep connections alive after
the system was suspended for WoWLAN, we need some additional
APIs. We already have API (ieee80211_gtk_rekey_notify) to tell
wpa_supplicant about the new replay counter if GTK rekeying
was done by the device while the host was asleep, but that's
not sufficient.

If GTK rekeying wasn't done, we need to tell the host about
sequence counters for the GTK (and PTK regardless of rekeying)
that was used while asleep, add ieee80211_set_key_rx_seq() for
that.

If GTK rekeying was done, then we need to be able to disable
the old keys (with ieee80211_remove_key()) and allocate the
new GTK key(s) in mac80211 (with ieee80211_gtk_rekey_add()).

If protocol offload (e.g. ARP) is implemented, then also the
TX sequence counter for the PTK must be updated, using the new
ieee80211_set_key_tx_seq() function.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h |  83 ++++++++++++++++++++++++++
 net/mac80211/key.c     | 154 ++++++++++++++++++++++++++++++++++++++++++++++---
 net/mac80211/util.c    |   2 +-
 3 files changed, 231 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index df93c77c97ab..e3e303778936 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -3687,6 +3687,89 @@ void ieee80211_get_key_tx_seq(struct ieee80211_key_conf *keyconf,
 void ieee80211_get_key_rx_seq(struct ieee80211_key_conf *keyconf,
 			      int tid, struct ieee80211_key_seq *seq);
 
+/**
+ * ieee80211_set_key_tx_seq - set key TX sequence counter
+ *
+ * @keyconf: the parameter passed with the set key
+ * @seq: new sequence data
+ *
+ * This function allows a driver to set the current TX IV/PNs for the
+ * given key. This is useful when resuming from WoWLAN sleep and the
+ * device may have transmitted frames using the PTK, e.g. replies to
+ * ARP requests.
+ *
+ * Note that this function may only be called when no TX processing
+ * can be done concurrently.
+ */
+void ieee80211_set_key_tx_seq(struct ieee80211_key_conf *keyconf,
+			      struct ieee80211_key_seq *seq);
+
+/**
+ * ieee80211_set_key_rx_seq - set key RX sequence counter
+ *
+ * @keyconf: the parameter passed with the set key
+ * @tid: The TID, or -1 for the management frame value (CCMP only);
+ *	the value on TID 0 is also used for non-QoS frames. For
+ *	CMAC, only TID 0 is valid.
+ * @seq: new sequence data
+ *
+ * This function allows a driver to set the current RX IV/PNs for the
+ * given key. This is useful when resuming from WoWLAN sleep and GTK
+ * rekey may have been done while suspended. It should not be called
+ * if IV checking is done by the device and not by mac80211.
+ *
+ * Note that this function may only be called when no RX processing
+ * can be done concurrently.
+ */
+void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf,
+			      int tid, struct ieee80211_key_seq *seq);
+
+/**
+ * ieee80211_remove_key - remove the given key
+ * @keyconf: the parameter passed with the set key
+ *
+ * Remove the given key. If the key was uploaded to the hardware at the
+ * time this function is called, it is not deleted in the hardware but
+ * instead assumed to have been removed already.
+ *
+ * Note that due to locking considerations this function can (currently)
+ * only be called during key iteration (ieee80211_iter_keys().)
+ */
+void ieee80211_remove_key(struct ieee80211_key_conf *keyconf);
+
+/**
+ * ieee80211_gtk_rekey_add - add a GTK key from rekeying during WoWLAN
+ * @vif: the virtual interface to add the key on
+ * @keyconf: new key data
+ *
+ * When GTK rekeying was done while the system was suspended, (a) new
+ * key(s) will be available. These will be needed by mac80211 for proper
+ * RX processing, so this function allows setting them.
+ *
+ * The function returns the newly allocated key structure, which will
+ * have similar contents to the passed key configuration but point to
+ * mac80211-owned memory. In case of errors, the function returns an
+ * ERR_PTR(), use IS_ERR() etc.
+ *
+ * Note that this function assumes the key isn't added to hardware
+ * acceleration, so no TX will be done with the key. Since it's a GTK
+ * on managed (station) networks, this is true anyway. If the driver
+ * calls this function from the resume callback and subsequently uses
+ * the return code 1 to reconfigure the device, this key will be part
+ * of the reconfiguration.
+ *
+ * Note that the driver should also call ieee80211_set_key_rx_seq()
+ * for the new key for each TID to set up sequence counters properly.
+ *
+ * IMPORTANT: If this replaces a key that is present in the hardware,
+ * then it will attempt to remove it during this call. In many cases
+ * this isn't what you want, so call ieee80211_remove_key() first for
+ * the key that's being replaced.
+ */
+struct ieee80211_key_conf *
+ieee80211_gtk_rekey_add(struct ieee80211_vif *vif,
+			struct ieee80211_key_conf *keyconf);
+
 /**
  * ieee80211_gtk_rekey_notify - notify userspace supplicant of rekeying
  * @vif: virtual interface the rekeying was done on
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index e39cc91d0cf1..620677e897bd 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -93,6 +93,9 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key)
 
 	might_sleep();
 
+	if (key->flags & KEY_FLAG_TAINTED)
+		return -EINVAL;
+
 	if (!key->local->ops->set_key)
 		goto out_unsupported;
 
@@ -455,6 +458,7 @@ int ieee80211_key_link(struct ieee80211_key *key,
 		       struct ieee80211_sub_if_data *sdata,
 		       struct sta_info *sta)
 {
+	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_key *old_key;
 	int idx, ret;
 	bool pairwise;
@@ -484,10 +488,13 @@ int ieee80211_key_link(struct ieee80211_key *key,
 
 	ieee80211_debugfs_key_add(key);
 
-	ret = ieee80211_key_enable_hw_accel(key);
-
-	if (ret)
-		ieee80211_key_free(key, true);
+	if (!local->wowlan) {
+		ret = ieee80211_key_enable_hw_accel(key);
+		if (ret)
+			ieee80211_key_free(key, true);
+	} else {
+		ret = 0;
+	}
 
 	mutex_unlock(&sdata->local->key_mtx);
 
@@ -540,7 +547,7 @@ void ieee80211_iter_keys(struct ieee80211_hw *hw,
 			 void *iter_data)
 {
 	struct ieee80211_local *local = hw_to_local(hw);
-	struct ieee80211_key *key;
+	struct ieee80211_key *key, *tmp;
 	struct ieee80211_sub_if_data *sdata;
 
 	ASSERT_RTNL();
@@ -548,13 +555,14 @@ void ieee80211_iter_keys(struct ieee80211_hw *hw,
 	mutex_lock(&local->key_mtx);
 	if (vif) {
 		sdata = vif_to_sdata(vif);
-		list_for_each_entry(key, &sdata->key_list, list)
+		list_for_each_entry_safe(key, tmp, &sdata->key_list, list)
 			iter(hw, &sdata->vif,
 			     key->sta ? &key->sta->sta : NULL,
 			     &key->conf, iter_data);
 	} else {
 		list_for_each_entry(sdata, &local->interfaces, list)
-			list_for_each_entry(key, &sdata->key_list, list)
+			list_for_each_entry_safe(key, tmp,
+						 &sdata->key_list, list)
 				iter(hw, &sdata->vif,
 				     key->sta ? &key->sta->sta : NULL,
 				     &key->conf, iter_data);
@@ -751,3 +759,135 @@ void ieee80211_get_key_rx_seq(struct ieee80211_key_conf *keyconf,
 	}
 }
 EXPORT_SYMBOL(ieee80211_get_key_rx_seq);
+
+void ieee80211_set_key_tx_seq(struct ieee80211_key_conf *keyconf,
+			      struct ieee80211_key_seq *seq)
+{
+	struct ieee80211_key *key;
+	u64 pn64;
+
+	key = container_of(keyconf, struct ieee80211_key, conf);
+
+	switch (key->conf.cipher) {
+	case WLAN_CIPHER_SUITE_TKIP:
+		key->u.tkip.tx.iv32 = seq->tkip.iv32;
+		key->u.tkip.tx.iv16 = seq->tkip.iv16;
+		break;
+	case WLAN_CIPHER_SUITE_CCMP:
+		pn64 = (u64)seq->ccmp.pn[5] |
+		       ((u64)seq->ccmp.pn[4] << 8) |
+		       ((u64)seq->ccmp.pn[3] << 16) |
+		       ((u64)seq->ccmp.pn[2] << 24) |
+		       ((u64)seq->ccmp.pn[1] << 32) |
+		       ((u64)seq->ccmp.pn[0] << 40);
+		atomic64_set(&key->u.ccmp.tx_pn, pn64);
+		break;
+	case WLAN_CIPHER_SUITE_AES_CMAC:
+		pn64 = (u64)seq->aes_cmac.pn[5] |
+		       ((u64)seq->aes_cmac.pn[4] << 8) |
+		       ((u64)seq->aes_cmac.pn[3] << 16) |
+		       ((u64)seq->aes_cmac.pn[2] << 24) |
+		       ((u64)seq->aes_cmac.pn[1] << 32) |
+		       ((u64)seq->aes_cmac.pn[0] << 40);
+		atomic64_set(&key->u.aes_cmac.tx_pn, pn64);
+		break;
+	default:
+		WARN_ON(1);
+		break;
+	}
+}
+EXPORT_SYMBOL_GPL(ieee80211_set_key_tx_seq);
+
+void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf,
+			      int tid, struct ieee80211_key_seq *seq)
+{
+	struct ieee80211_key *key;
+	u8 *pn;
+
+	key = container_of(keyconf, struct ieee80211_key, conf);
+
+	switch (key->conf.cipher) {
+	case WLAN_CIPHER_SUITE_TKIP:
+		if (WARN_ON(tid < 0 || tid >= IEEE80211_NUM_TIDS))
+			return;
+		key->u.tkip.rx[tid].iv32 = seq->tkip.iv32;
+		key->u.tkip.rx[tid].iv16 = seq->tkip.iv16;
+		break;
+	case WLAN_CIPHER_SUITE_CCMP:
+		if (WARN_ON(tid < -1 || tid >= IEEE80211_NUM_TIDS))
+			return;
+		if (tid < 0)
+			pn = key->u.ccmp.rx_pn[IEEE80211_NUM_TIDS];
+		else
+			pn = key->u.ccmp.rx_pn[tid];
+		memcpy(pn, seq->ccmp.pn, IEEE80211_CCMP_PN_LEN);
+		break;
+	case WLAN_CIPHER_SUITE_AES_CMAC:
+		if (WARN_ON(tid != 0))
+			return;
+		pn = key->u.aes_cmac.rx_pn;
+		memcpy(pn, seq->aes_cmac.pn, IEEE80211_CMAC_PN_LEN);
+		break;
+	default:
+		WARN_ON(1);
+		break;
+	}
+}
+EXPORT_SYMBOL_GPL(ieee80211_set_key_rx_seq);
+
+void ieee80211_remove_key(struct ieee80211_key_conf *keyconf)
+{
+	struct ieee80211_key *key;
+
+	key = container_of(keyconf, struct ieee80211_key, conf);
+
+	assert_key_lock(key->local);
+
+	/*
+	 * if key was uploaded, we assume the driver will/has remove(d)
+	 * it, so adjust bookkeeping accordingly
+	 */
+	if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) {
+		key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE;
+
+		if (!((key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC) ||
+		      (key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV) ||
+		      (key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE)))
+			increment_tailroom_need_count(key->sdata);
+	}
+
+	ieee80211_key_free(key, false);
+}
+EXPORT_SYMBOL_GPL(ieee80211_remove_key);
+
+struct ieee80211_key_conf *
+ieee80211_gtk_rekey_add(struct ieee80211_vif *vif,
+			struct ieee80211_key_conf *keyconf)
+{
+	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_key *key;
+	int err;
+
+	if (WARN_ON(!local->wowlan))
+		return ERR_PTR(-EINVAL);
+
+	if (WARN_ON(vif->type != NL80211_IFTYPE_STATION))
+		return ERR_PTR(-EINVAL);
+
+	key = ieee80211_key_alloc(keyconf->cipher, keyconf->keyidx,
+				  keyconf->keylen, keyconf->key,
+				  0, NULL);
+	if (IS_ERR(key))
+		return ERR_PTR(PTR_ERR(key));
+
+	if (sdata->u.mgd.mfp != IEEE80211_MFP_DISABLED)
+		key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT;
+
+	err = ieee80211_key_link(key, sdata, NULL);
+	if (err)
+		return ERR_PTR(err);
+
+	return &key->conf;
+}
+EXPORT_SYMBOL_GPL(ieee80211_gtk_rekey_add);
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index d23c5a705a68..e1b34a18b243 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -1453,8 +1453,8 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 		local->resuming = true;
 
 	if (local->wowlan) {
-		local->wowlan = false;
 		res = drv_resume(local);
+		local->wowlan = false;
 		if (res < 0) {
 			local->resuming = false;
 			return res;
-- 
cgit v1.2.3


From e3fec5a1c5a1ab4a85ca3f4e41c626fb953ce162 Mon Sep 17 00:00:00 2001
From: Rami Rosen <ramirose@gmail.com>
Date: Mon, 19 Aug 2013 15:47:11 +0300
Subject: xfrm: remove irrelevant comment in xfrm_input().

This patch removes a comment in xfrm_input() which became irrelevant
due to commit 2774c13, "xfrm: Handle blackhole route creation via afinfo".
That commit removed returning -EREMOTE in the xfrm_lookup() method when the
packet should be discarded  and also removed the correspoinding -EREMOTE
handlers. This was replaced by calling the make_blackhole() method. Therefore
the comment about -EREMOTE is not relevant anymore.

Signed-off-by: Rami Rosen <ramirose@gmail.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_policy.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index f7078eb5b64e..ad8cc7bcf065 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2132,8 +2132,6 @@ restart:
 		 * have the xfrm_state's. We need to wait for KM to
 		 * negotiate new SA's or bail out with error.*/
 		if (net->xfrm.sysctl_larval_drop) {
-			/* EREMOTE tells the caller to generate
-			 * a one-shot blackhole route. */
 			dst_release(dst);
 			xfrm_pols_put(pols, drop_pols);
 			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
-- 
cgit v1.2.3


From 58264848a5a7b91195f43c4729072e8cc980288d Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@nicira.com>
Date: Mon, 19 Aug 2013 11:23:34 -0700
Subject: openvswitch: Add vxlan tunneling support.

Following patch adds vxlan vport type for openvswitch using
vxlan api. So now there is vxlan dependency for openvswitch.

CC: Jesse Gross <jesse@nicira.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h |  11 +++
 net/openvswitch/Kconfig          |  13 +++
 net/openvswitch/Makefile         |   4 +
 net/openvswitch/vport-vxlan.c    | 204 +++++++++++++++++++++++++++++++++++++++
 net/openvswitch/vport.c          |   3 +
 net/openvswitch/vport.h          |   1 +
 6 files changed, 236 insertions(+)
 create mode 100644 net/openvswitch/vport-vxlan.c

(limited to 'net')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index c55efaaa9bb4..52490b0e62b5 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -165,6 +165,7 @@ enum ovs_vport_type {
 	OVS_VPORT_TYPE_NETDEV,   /* network device */
 	OVS_VPORT_TYPE_INTERNAL, /* network device implemented by datapath */
 	OVS_VPORT_TYPE_GRE,      /* GRE tunnel. */
+	OVS_VPORT_TYPE_VXLAN,	 /* VXLAN tunnel. */
 	__OVS_VPORT_TYPE_MAX
 };
 
@@ -211,6 +212,16 @@ enum ovs_vport_attr {
 
 #define OVS_VPORT_ATTR_MAX (__OVS_VPORT_ATTR_MAX - 1)
 
+/* OVS_VPORT_ATTR_OPTIONS attributes for tunnels.
+ */
+enum {
+	OVS_TUNNEL_ATTR_UNSPEC,
+	OVS_TUNNEL_ATTR_DST_PORT, /* 16-bit UDP port, used by L4 tunnels. */
+	__OVS_TUNNEL_ATTR_MAX
+};
+
+#define OVS_TUNNEL_ATTR_MAX (__OVS_TUNNEL_ATTR_MAX - 1)
+
 /* Flows. */
 
 #define OVS_FLOW_FAMILY  "ovs_flow"
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index 27ee56b688a3..bed30e69baa7 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -40,3 +40,16 @@ config OPENVSWITCH_GRE
 	  Say N to exclude this support and reduce the binary size.
 
 	  If unsure, say Y.
+
+config OPENVSWITCH_VXLAN
+	bool "Open vSwitch VXLAN tunneling support"
+	depends on INET
+	depends on OPENVSWITCH
+	depends on VXLAN && !(OPENVSWITCH=y && VXLAN=m)
+	default y
+	---help---
+	  If you say Y here, then the Open vSwitch will be able create vxlan vport.
+
+	  Say N to exclude this support and reduce the binary size.
+
+	  If unsure, say Y.
diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile
index 01bddb2991e3..82e4ee54a44b 100644
--- a/net/openvswitch/Makefile
+++ b/net/openvswitch/Makefile
@@ -13,3 +13,7 @@ openvswitch-y := \
 	vport-gre.o \
 	vport-internal_dev.o \
 	vport-netdev.o
+
+ifneq ($(CONFIG_OPENVSWITCH_VXLAN),)
+openvswitch-y += vport-vxlan.o
+endif
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
new file mode 100644
index 000000000000..36848bd54a77
--- /dev/null
+++ b/net/openvswitch/vport-vxlan.c
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2013 Nicira, Inc.
+ * Copyright (c) 2013 Cisco Systems, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/net.h>
+#include <linux/rculist.h>
+#include <linux/udp.h>
+
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/udp.h>
+#include <net/ip_tunnels.h>
+#include <net/udp.h>
+#include <net/rtnetlink.h>
+#include <net/route.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/vxlan.h>
+
+#include "datapath.h"
+#include "vport.h"
+
+/**
+ * struct vxlan_port - Keeps track of open UDP ports
+ * @vs: vxlan_sock created for the port.
+ * @name: vport name.
+ */
+struct vxlan_port {
+	struct vxlan_sock *vs;
+	char name[IFNAMSIZ];
+};
+
+static inline struct vxlan_port *vxlan_vport(const struct vport *vport)
+{
+	return vport_priv(vport);
+}
+
+/* Called with rcu_read_lock and BH disabled. */
+static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni)
+{
+	struct ovs_key_ipv4_tunnel tun_key;
+	struct vport *vport = vs->data;
+	struct iphdr *iph;
+	__be64 key;
+
+	/* Save outer tunnel values */
+	iph = ip_hdr(skb);
+	key = cpu_to_be64(ntohl(vx_vni) >> 8);
+	ovs_flow_tun_key_init(&tun_key, iph, key, TUNNEL_KEY);
+
+	ovs_vport_receive(vport, skb, &tun_key);
+}
+
+static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb)
+{
+	struct vxlan_port *vxlan_port = vxlan_vport(vport);
+	__be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport;
+
+	if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port)))
+		return -EMSGSIZE;
+	return 0;
+}
+
+static void vxlan_tnl_destroy(struct vport *vport)
+{
+	struct vxlan_port *vxlan_port = vxlan_vport(vport);
+
+	vxlan_sock_release(vxlan_port->vs);
+
+	ovs_vport_deferred_free(vport);
+}
+
+static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
+{
+	struct net *net = ovs_dp_get_net(parms->dp);
+	struct nlattr *options = parms->options;
+	struct vxlan_port *vxlan_port;
+	struct vxlan_sock *vs;
+	struct vport *vport;
+	struct nlattr *a;
+	u16 dst_port;
+	int err;
+
+	if (!options) {
+		err = -EINVAL;
+		goto error;
+	}
+	a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT);
+	if (a && nla_len(a) == sizeof(u16)) {
+		dst_port = nla_get_u16(a);
+	} else {
+		/* Require destination port from userspace. */
+		err = -EINVAL;
+		goto error;
+	}
+
+	vport = ovs_vport_alloc(sizeof(struct vxlan_port),
+				&ovs_vxlan_vport_ops, parms);
+	if (IS_ERR(vport))
+		return vport;
+
+	vxlan_port = vxlan_vport(vport);
+	strncpy(vxlan_port->name, parms->name, IFNAMSIZ);
+
+	vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true);
+	if (IS_ERR(vs)) {
+		ovs_vport_free(vport);
+		return (void *)vs;
+	}
+	vxlan_port->vs = vs;
+
+	return vport;
+
+error:
+	return ERR_PTR(err);
+}
+
+static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
+{
+	struct net *net = ovs_dp_get_net(vport->dp);
+	struct vxlan_port *vxlan_port = vxlan_vport(vport);
+	__be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport;
+	struct rtable *rt;
+	struct flowi4 fl;
+	__be16 src_port;
+	int port_min;
+	int port_max;
+	__be16 df;
+	int err;
+
+	if (unlikely(!OVS_CB(skb)->tun_key)) {
+		err = -EINVAL;
+		goto error;
+	}
+
+	/* Route lookup */
+	memset(&fl, 0, sizeof(fl));
+	fl.daddr = OVS_CB(skb)->tun_key->ipv4_dst;
+	fl.saddr = OVS_CB(skb)->tun_key->ipv4_src;
+	fl.flowi4_tos = RT_TOS(OVS_CB(skb)->tun_key->ipv4_tos);
+	fl.flowi4_mark = skb->mark;
+	fl.flowi4_proto = IPPROTO_UDP;
+
+	rt = ip_route_output_key(net, &fl);
+	if (IS_ERR(rt)) {
+		err = PTR_ERR(rt);
+		goto error;
+	}
+
+	df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ?
+		htons(IP_DF) : 0;
+
+	skb->local_df = 1;
+
+	inet_get_local_port_range(&port_min, &port_max);
+	src_port = vxlan_src_port(port_min, port_max, skb);
+
+	err = vxlan_xmit_skb(net, vxlan_port->vs, rt, skb,
+			     fl.saddr, OVS_CB(skb)->tun_key->ipv4_dst,
+			     OVS_CB(skb)->tun_key->ipv4_tos,
+			     OVS_CB(skb)->tun_key->ipv4_ttl, df,
+			     src_port, dst_port,
+			     htonl(be64_to_cpu(OVS_CB(skb)->tun_key->tun_id) << 8));
+	if (err < 0)
+		ip_rt_put(rt);
+error:
+	return err;
+}
+
+static const char *vxlan_get_name(const struct vport *vport)
+{
+	struct vxlan_port *vxlan_port = vxlan_vport(vport);
+	return vxlan_port->name;
+}
+
+const struct vport_ops ovs_vxlan_vport_ops = {
+	.type		= OVS_VPORT_TYPE_VXLAN,
+	.create		= vxlan_tnl_create,
+	.destroy	= vxlan_tnl_destroy,
+	.get_name	= vxlan_get_name,
+	.get_options	= vxlan_get_options,
+	.send		= vxlan_tnl_send,
+};
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index d4c7fa04ce08..d69e0c06dfde 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -42,6 +42,9 @@ static const struct vport_ops *vport_ops_list[] = {
 #ifdef CONFIG_OPENVSWITCH_GRE
 	&ovs_gre_vport_ops,
 #endif
+#ifdef CONFIG_OPENVSWITCH_VXLAN
+	&ovs_vxlan_vport_ops,
+#endif
 };
 
 /* Protected by RCU read lock for reading, ovs_mutex for writing. */
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index 376045c42f8b..1a9fbcec6e1b 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -199,6 +199,7 @@ void ovs_vport_record_error(struct vport *, enum vport_err_type err_type);
 extern const struct vport_ops ovs_netdev_vport_ops;
 extern const struct vport_ops ovs_internal_vport_ops;
 extern const struct vport_ops ovs_gre_vport_ops;
+extern const struct vport_ops ovs_vxlan_vport_ops;
 
 static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb,
 				      const void *start, unsigned int len)
-- 
cgit v1.2.3


From 84ce1ddfefc3d5a8af5ede6fe16546c143117616 Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Fri, 16 Aug 2013 21:59:54 +0200
Subject: 6lowpan: init ipv6hdr buffer to zero

This patch simplify the handling to set fields inside of struct ipv6hdr
to zero. Instead of setting some memory regions with memset to zero we
initialize the whole ipv6hdr to zero.

This is a simplification for parsing the 6lowpan header for the upcomming
patches.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Reviewed-by: Werner Almesberger <werner@almesberger.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ieee802154/6lowpan.c | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c
index 3b9d5f20bd1c..92429db0b237 100644
--- a/net/ieee802154/6lowpan.c
+++ b/net/ieee802154/6lowpan.c
@@ -223,10 +223,6 @@ lowpan_uncompress_addr(struct sk_buff *skb, struct in6_addr *ipaddr,
 	if (prefcount > 0)
 		memcpy(ipaddr, prefix, prefcount);
 
-	if (prefcount + postcount < 16)
-		memset(&ipaddr->s6_addr[prefcount], 0,
-					16 - (prefcount + postcount));
-
 	if (postcount > 0) {
 		memcpy(&ipaddr->s6_addr[16 - postcount], skb->data, postcount);
 		skb_pull(skb, postcount);
@@ -723,7 +719,7 @@ frame_err:
 static int
 lowpan_process_data(struct sk_buff *skb)
 {
-	struct ipv6hdr hdr;
+	struct ipv6hdr hdr = {};
 	u8 tmp, iphc0, iphc1, num_context = 0;
 	u8 *_saddr, *_daddr;
 	int err;
@@ -868,8 +864,6 @@ lowpan_process_data(struct sk_buff *skb)
 
 		hdr.priority = ((tmp >> 2) & 0x0f);
 		hdr.flow_lbl[0] = ((tmp << 6) & 0xC0) | ((tmp >> 2) & 0x30);
-		hdr.flow_lbl[1] = 0;
-		hdr.flow_lbl[2] = 0;
 		break;
 	/*
 	 * Flow Label carried in-line
@@ -885,10 +879,6 @@ lowpan_process_data(struct sk_buff *skb)
 		break;
 	/* Traffic Class and Flow Label are elided */
 	case 3: /* 11b */
-		hdr.priority = 0;
-		hdr.flow_lbl[0] = 0;
-		hdr.flow_lbl[1] = 0;
-		hdr.flow_lbl[2] = 0;
 		break;
 	default:
 		break;
-- 
cgit v1.2.3


From 31afe1f73e46221650acfcb411e6949f4a8f7571 Mon Sep 17 00:00:00 2001
From: David Hauweele <david@hauweele.net>
Date: Fri, 16 Aug 2013 21:59:55 +0200
Subject: 6lowpan: Fix fragmentation with link-local compressed addresses

When a new 6lowpan fragment is received, a skbuff is allocated for
the reassembled packet. However when a 6lowpan packet compresses
link-local addresses based on link-layer addresses, the processing
function relies on the skb mac control block to find the related
link-layer address.

This patch copies the control block from the first fragment into
the newly allocated skb to keep a trace of the link-layer addresses
in case of a link-local compressed address.

Edit: small changes on comment issue

Signed-off-by: David Hauweele <david@hauweele.net>
Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Reviewed-by: Werner Almesberger <werner@almesberger.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ieee802154/6lowpan.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c
index 92429db0b237..632b3fdf46a4 100644
--- a/net/ieee802154/6lowpan.c
+++ b/net/ieee802154/6lowpan.c
@@ -698,6 +698,12 @@ lowpan_alloc_new_frame(struct sk_buff *skb, u16 len, u16 tag)
 	skb_reserve(frame->skb, sizeof(struct ipv6hdr));
 	skb_put(frame->skb, frame->length);
 
+	/* copy the first control block to keep a
+	 * trace of the link-layer addresses in case
+	 * of a link-local compressed address
+	 */
+	memcpy(frame->skb->cb, skb->cb, sizeof(skb->cb));
+
 	init_timer(&frame->timer);
 	/* time out is the same as for ipv6 - 60 sec */
 	frame->timer.expires = jiffies + LOWPAN_FRAG_TIMEOUT;
-- 
cgit v1.2.3


From 4666669fc3b83f0d01b08c6b17432563f2eedcba Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Fri, 16 Aug 2013 21:59:56 +0200
Subject: 6lowpan: introduce lowpan_fetch_skb function

This patch adds a helper function to parse the ipv6 header to a
6lowpan header in stream.

This function checks first if we can pull data with a specific
length from a skb. If this seems to be okay, we copy skb data to
a destination pointer and run skb_pull.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Reviewed-by: Werner Almesberger <werner@almesberger.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ieee802154/6lowpan.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'net')

diff --git a/net/ieee802154/6lowpan.h b/net/ieee802154/6lowpan.h
index 4b8f917658b5..a636545d8dcf 100644
--- a/net/ieee802154/6lowpan.h
+++ b/net/ieee802154/6lowpan.h
@@ -230,4 +230,16 @@
 					dest = 16 bit inline */
 #define LOWPAN_NHC_UDP_CS_P_11	0xF3 /* source & dest = 0xF0B + 4bit inline */
 
+static inline bool lowpan_fetch_skb(struct sk_buff *skb,
+		void *data, const unsigned int len)
+{
+	if (unlikely(!pskb_may_pull(skb, len)))
+		return true;
+
+	skb_copy_from_linear_data(skb, data, len);
+	skb_pull(skb, len);
+
+	return false;
+}
+
 #endif /* __6LOWPAN_H__ */
-- 
cgit v1.2.3


From 84c2e7bcf531a3881d602035243eda80e62d700a Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Fri, 16 Aug 2013 21:59:57 +0200
Subject: 6lowpan: add function to uncompress multicast addr

Add function to uncompress multicast address.
This function split the uncompress function for a multicast address
in a seperate function.

To uncompress a multicast address is different than a other
non-multicasts addresses according to rfc6282.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Reviewed-by: Werner Almesberger <werner@almesberger.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ieee802154/6lowpan.c | 78 ++++++++++++++++++++++++++++++++++++------------
 1 file changed, 59 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c
index 632b3fdf46a4..9aea7ce8b83d 100644
--- a/net/ieee802154/6lowpan.c
+++ b/net/ieee802154/6lowpan.c
@@ -88,15 +88,6 @@ static const u8 lowpan_unc_llconf[] = {0x0f, 0x28, 0x22, 0x20};
  */
 static const u8 lowpan_unc_ctxconf[] = {0x00, 0x88, 0x82, 0x80};
 
-/*
- * Uncompression of ctx-base
- *   0 -> 0 bits from packet
- *   1 -> 2 bytes from prefix - bunch of zeroes 5 from packet
- *   2 -> 2 bytes from prefix - zeroes + 3 from packet
- *   3 -> 2 bytes from prefix - infer 1 bytes from lladdr
- */
-static const u8 lowpan_unc_mxconf[] = {0x0f, 0x25, 0x23, 0x21};
-
 /* Link local prefix */
 static const u8 lowpan_llprefix[] = {0xfe, 0x80};
 
@@ -240,6 +231,63 @@ lowpan_uncompress_addr(struct sk_buff *skb, struct in6_addr *ipaddr,
 	return 0;
 }
 
+/* Uncompress function for multicast destination address,
+ * when M bit is set.
+ */
+static int
+lowpan_uncompress_multicast_daddr(struct sk_buff *skb,
+		struct in6_addr *ipaddr,
+		const u8 dam)
+{
+	bool fail;
+
+	switch (dam) {
+	case LOWPAN_IPHC_DAM_00:
+		/* 00:  128 bits.  The full address
+		 * is carried in-line.
+		 */
+		fail = lowpan_fetch_skb(skb, ipaddr->s6_addr, 16);
+		break;
+	case LOWPAN_IPHC_DAM_01:
+		/* 01:  48 bits.  The address takes
+		 * the form ffXX::00XX:XXXX:XXXX.
+		 */
+		ipaddr->s6_addr[0] = 0xFF;
+		fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[1], 1);
+		fail |= lowpan_fetch_skb(skb, &ipaddr->s6_addr[11], 5);
+		break;
+	case LOWPAN_IPHC_DAM_10:
+		/* 10:  32 bits.  The address takes
+		 * the form ffXX::00XX:XXXX.
+		 */
+		ipaddr->s6_addr[0] = 0xFF;
+		fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[1], 1);
+		fail |= lowpan_fetch_skb(skb, &ipaddr->s6_addr[13], 3);
+		break;
+	case LOWPAN_IPHC_DAM_11:
+		/* 11:  8 bits.  The address takes
+		 * the form ff02::00XX.
+		 */
+		ipaddr->s6_addr[0] = 0xFF;
+		ipaddr->s6_addr[1] = 0x02;
+		fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[15], 1);
+		break;
+	default:
+		pr_debug("DAM value has a wrong value: 0x%x\n", dam);
+		return -EINVAL;
+	}
+
+	if (fail) {
+		pr_debug("Failed to fetch skb data\n");
+		return -EIO;
+	}
+
+	lowpan_raw_dump_inline(NULL, "Reconstructed ipv6 multicast addr is:\n",
+			ipaddr->s6_addr, 16);
+
+	return 0;
+}
+
 static void
 lowpan_compress_udp_header(u8 **hc06_ptr, struct sk_buff *skb)
 {
@@ -927,16 +975,8 @@ lowpan_process_data(struct sk_buff *skb)
 			pr_debug("dest: context-based mcast compression\n");
 			/* TODO: implement this */
 		} else {
-			u8 prefix[] = {0xff, 0x02};
-
-			pr_debug("dest: non context-based mcast compression\n");
-			if (0 < tmp && tmp < 3) {
-				if (lowpan_fetch_skb_u8(skb, &prefix[1]))
-					goto drop;
-			}
-
-			err = lowpan_uncompress_addr(skb, &hdr.daddr, prefix,
-					lowpan_unc_mxconf[tmp], NULL);
+			err = lowpan_uncompress_multicast_daddr(
+					skb, &hdr.daddr, tmp);
 			if (err)
 				goto drop;
 		}
-- 
cgit v1.2.3


From ce2463b283a0cb63e0e4de5e7d971b4c92be542a Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Fri, 16 Aug 2013 21:59:58 +0200
Subject: 6lowpan: lowpan_uncompress_addr with address_mode

This patch drops the pre and postcount calculation from the
lowpan_uncompress_addr function.We use instead a switch/case
over address_mode value. The original implementation has several
bugs in this function and it was hard to decrypt how it works.
To make it maintainable and fix these bugs this patch basically
reimplements lowpan_uncompress_addr from scratch.

A list of bugs we found in the current implementation:

1) Properly support uncompression of short-address based IPv6 addresses
   (instead of basically copying garbage)

2) Fix use and uncompression of long-addresses based IPv6 addresses

3) Add missing ff:fe00 in the case of SAM/DAM = 2 and M = 0

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Reviewed-by: Werner Almesberger <werner@almesberger.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ieee802154/6lowpan.c | 145 +++++++++++++++++++++++++----------------------
 net/ieee802154/6lowpan.h |   8 ++-
 2 files changed, 82 insertions(+), 71 deletions(-)

(limited to 'net')

diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c
index 9aea7ce8b83d..5ef9157c5f1b 100644
--- a/net/ieee802154/6lowpan.c
+++ b/net/ieee802154/6lowpan.c
@@ -67,30 +67,6 @@ static const u8 lowpan_ttl_values[] = {0, 1, 64, 255};
 
 static LIST_HEAD(lowpan_devices);
 
-/*
- * Uncompression of linklocal:
- *   0 -> 16 bytes from packet
- *   1 -> 2  bytes from prefix - bunch of zeroes and 8 from packet
- *   2 -> 2  bytes from prefix - zeroes + 2 from packet
- *   3 -> 2  bytes from prefix - infer 8 bytes from lladdr
- *
- *  NOTE: => the uncompress function does change 0xf to 0x10
- *  NOTE: 0x00 => no-autoconfig => unspecified
- */
-static const u8 lowpan_unc_llconf[] = {0x0f, 0x28, 0x22, 0x20};
-
-/*
- * Uncompression of ctx-based:
- *   0 -> 0 bits  from packet [unspecified / reserved]
- *   1 -> 8 bytes from prefix - bunch of zeroes and 8 from packet
- *   2 -> 8 bytes from prefix - zeroes + 2 from packet
- *   3 -> 8 bytes from prefix - infer 8 bytes from lladdr
- */
-static const u8 lowpan_unc_ctxconf[] = {0x00, 0x88, 0x82, 0x80};
-
-/* Link local prefix */
-static const u8 lowpan_llprefix[] = {0xfe, 0x80};
-
 /* private device info */
 struct lowpan_dev_info {
 	struct net_device	*real_dev; /* real WPAN device ptr */
@@ -182,51 +158,86 @@ lowpan_compress_addr_64(u8 **hc06_ptr, u8 shift, const struct in6_addr *ipaddr,
 	return rol8(val, shift);
 }
 
-static void
-lowpan_uip_ds6_set_addr_iid(struct in6_addr *ipaddr, unsigned char *lladdr)
-{
-	memcpy(&ipaddr->s6_addr[8], lladdr, IEEE802154_ADDR_LEN);
-	/* second bit-flip (Universe/Local) is done according RFC2464 */
-	ipaddr->s6_addr[8] ^= 0x02;
-}
-
 /*
- * Uncompress addresses based on a prefix and a postfix with zeroes in
- * between. If the postfix is zero in length it will use the link address
- * to configure the IP address (autoconf style).
- * pref_post_count takes a byte where the first nibble specify prefix count
- * and the second postfix count (NOTE: 15/0xf => 16 bytes copy).
+ * Uncompress address function for source and
+ * destination address(non-multicast).
+ *
+ * address_mode is sam value or dam value.
  */
 static int
-lowpan_uncompress_addr(struct sk_buff *skb, struct in6_addr *ipaddr,
-	u8 const *prefix, u8 pref_post_count, unsigned char *lladdr)
+lowpan_uncompress_addr(struct sk_buff *skb,
+		struct in6_addr *ipaddr,
+		const u8 address_mode,
+		const struct ieee802154_addr *lladdr)
 {
-	u8 prefcount = pref_post_count >> 4;
-	u8 postcount = pref_post_count & 0x0f;
-
-	/* full nibble 15 => 16 */
-	prefcount = (prefcount == 15 ? 16 : prefcount);
-	postcount = (postcount == 15 ? 16 : postcount);
-
-	if (lladdr)
-		lowpan_raw_dump_inline(__func__, "linklocal address",
-						lladdr,	IEEE802154_ADDR_LEN);
-	if (prefcount > 0)
-		memcpy(ipaddr, prefix, prefcount);
-
-	if (postcount > 0) {
-		memcpy(&ipaddr->s6_addr[16 - postcount], skb->data, postcount);
-		skb_pull(skb, postcount);
-	} else if (prefcount > 0) {
-		if (lladdr == NULL)
+	bool fail;
+
+	switch (address_mode) {
+	case LOWPAN_IPHC_ADDR_00:
+		/* for global link addresses */
+		fail = lowpan_fetch_skb(skb, ipaddr->s6_addr, 16);
+		break;
+	case LOWPAN_IPHC_ADDR_01:
+		/* fe:80::XXXX:XXXX:XXXX:XXXX */
+		ipaddr->s6_addr[0] = 0xFE;
+		ipaddr->s6_addr[1] = 0x80;
+		fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[8], 8);
+		break;
+	case LOWPAN_IPHC_ADDR_02:
+		/* fe:80::ff:fe00:XXXX */
+		ipaddr->s6_addr[0] = 0xFE;
+		ipaddr->s6_addr[1] = 0x80;
+		ipaddr->s6_addr[11] = 0xFF;
+		ipaddr->s6_addr[12] = 0xFE;
+		fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[14], 2);
+		break;
+	case LOWPAN_IPHC_ADDR_03:
+		fail = false;
+		switch (lladdr->addr_type) {
+		case IEEE802154_ADDR_LONG:
+			/* fe:80::XXXX:XXXX:XXXX:XXXX
+			 *        \_________________/
+			 *              hwaddr
+			 */
+			ipaddr->s6_addr[0] = 0xFE;
+			ipaddr->s6_addr[1] = 0x80;
+			memcpy(&ipaddr->s6_addr[8], lladdr->hwaddr,
+					IEEE802154_ADDR_LEN);
+			/* second bit-flip (Universe/Local)
+			 * is done according RFC2464
+			 */
+			ipaddr->s6_addr[8] ^= 0x02;
+			break;
+		case IEEE802154_ADDR_SHORT:
+			/* fe:80::ff:fe00:XXXX
+			 *		  \__/
+			 *	       short_addr
+			 *
+			 * Universe/Local bit is zero.
+			 */
+			ipaddr->s6_addr[0] = 0xFE;
+			ipaddr->s6_addr[1] = 0x80;
+			ipaddr->s6_addr[11] = 0xFF;
+			ipaddr->s6_addr[12] = 0xFE;
+			ipaddr->s6_addr16[7] = htons(lladdr->short_addr);
+			break;
+		default:
+			pr_debug("Invalid addr_type set\n");
 			return -EINVAL;
+		}
+		break;
+	default:
+		pr_debug("Invalid address mode value: 0x%x\n", address_mode);
+		return -EINVAL;
+	}
 
-		/* no IID based configuration if no prefix and no data */
-		lowpan_uip_ds6_set_addr_iid(ipaddr, lladdr);
+	if (fail) {
+		pr_debug("Failed to fetch skb data\n");
+		return -EIO;
 	}
 
-	pr_debug("uncompressing %d + %d => ", prefcount, postcount);
-	lowpan_raw_dump_inline(NULL, NULL, ipaddr->s6_addr, 16);
+	lowpan_raw_dump_inline(NULL, "Reconstructed ipv6 addr is:\n",
+			ipaddr->s6_addr, 16);
 
 	return 0;
 }
@@ -775,7 +786,7 @@ lowpan_process_data(struct sk_buff *skb)
 {
 	struct ipv6hdr hdr = {};
 	u8 tmp, iphc0, iphc1, num_context = 0;
-	u8 *_saddr, *_daddr;
+	const struct ieee802154_addr *_saddr, *_daddr;
 	int err;
 
 	lowpan_raw_dump_table(__func__, "raw skb data dump", skb->data,
@@ -878,8 +889,8 @@ lowpan_process_data(struct sk_buff *skb)
 	if (lowpan_fetch_skb_u8(skb, &iphc1))
 		goto drop;
 
-	_saddr = mac_cb(skb)->sa.hwaddr;
-	_daddr = mac_cb(skb)->da.hwaddr;
+	_saddr = &mac_cb(skb)->sa;
+	_daddr = &mac_cb(skb)->da;
 
 	pr_debug("iphc0 = %02x, iphc1 = %02x\n", iphc0, iphc1);
 
@@ -961,8 +972,7 @@ lowpan_process_data(struct sk_buff *skb)
 
 	/* Source address uncompression */
 	pr_debug("source address stateless compression\n");
-	err = lowpan_uncompress_addr(skb, &hdr.saddr, lowpan_llprefix,
-				lowpan_unc_llconf[tmp], skb->data);
+	err = lowpan_uncompress_addr(skb, &hdr.saddr, tmp, _saddr);
 	if (err)
 		goto drop;
 
@@ -982,8 +992,7 @@ lowpan_process_data(struct sk_buff *skb)
 		}
 	} else {
 		pr_debug("dest: stateless compression\n");
-		err = lowpan_uncompress_addr(skb, &hdr.daddr, lowpan_llprefix,
-				lowpan_unc_llconf[tmp], skb->data);
+		err = lowpan_uncompress_addr(skb, &hdr.daddr, tmp, _daddr);
 		if (err)
 			goto drop;
 	}
diff --git a/net/ieee802154/6lowpan.h b/net/ieee802154/6lowpan.h
index a636545d8dcf..2869c0526dad 100644
--- a/net/ieee802154/6lowpan.h
+++ b/net/ieee802154/6lowpan.h
@@ -193,10 +193,12 @@
 /* Values of fields within the IPHC encoding second byte */
 #define LOWPAN_IPHC_CID		0x80
 
+#define LOWPAN_IPHC_ADDR_00	0x00
+#define LOWPAN_IPHC_ADDR_01	0x01
+#define LOWPAN_IPHC_ADDR_02	0x02
+#define LOWPAN_IPHC_ADDR_03	0x03
+
 #define LOWPAN_IPHC_SAC		0x40
-#define LOWPAN_IPHC_SAM_00	0x00
-#define LOWPAN_IPHC_SAM_01	0x10
-#define LOWPAN_IPHC_SAM_10	0x20
 #define LOWPAN_IPHC_SAM		0x30
 
 #define LOWPAN_IPHC_SAM_BIT	4
-- 
cgit v1.2.3


From 65d892c8ac8f131f668edd55db291a7c68f8338f Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Fri, 16 Aug 2013 21:59:59 +0200
Subject: 6lowpan: handle context based source address

Handle context based address when an unspecified address is given.
For other context based address we print a warning and drop the packet
because we don't support it right now.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Reviewed-by: Werner Almesberger <werner@almesberger.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ieee802154/6lowpan.c | 49 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 46 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c
index 5ef9157c5f1b..c85e71e0c7ff 100644
--- a/net/ieee802154/6lowpan.c
+++ b/net/ieee802154/6lowpan.c
@@ -242,6 +242,40 @@ lowpan_uncompress_addr(struct sk_buff *skb,
 	return 0;
 }
 
+/* Uncompress address function for source context
+ * based address(non-multicast).
+ */
+static int
+lowpan_uncompress_context_based_src_addr(struct sk_buff *skb,
+		struct in6_addr *ipaddr,
+		const u8 sam)
+{
+	switch (sam) {
+	case LOWPAN_IPHC_ADDR_00:
+		/* unspec address ::
+		 * Do nothing, address is already ::
+		 */
+		break;
+	case LOWPAN_IPHC_ADDR_01:
+		/* TODO */
+	case LOWPAN_IPHC_ADDR_02:
+		/* TODO */
+	case LOWPAN_IPHC_ADDR_03:
+		/* TODO */
+		netdev_warn(skb->dev, "SAM value 0x%x not supported\n", sam);
+		return -EINVAL;
+	default:
+		pr_debug("Invalid sam value: 0x%x\n", sam);
+		return -EINVAL;
+	}
+
+	lowpan_raw_dump_inline(NULL,
+			"Reconstructed context based ipv6 src addr is:\n",
+			ipaddr->s6_addr, 16);
+
+	return 0;
+}
+
 /* Uncompress function for multicast destination address,
  * when M bit is set.
  */
@@ -970,9 +1004,18 @@ lowpan_process_data(struct sk_buff *skb)
 	/* Extract SAM to the tmp variable */
 	tmp = ((iphc1 & LOWPAN_IPHC_SAM) >> LOWPAN_IPHC_SAM_BIT) & 0x03;
 
-	/* Source address uncompression */
-	pr_debug("source address stateless compression\n");
-	err = lowpan_uncompress_addr(skb, &hdr.saddr, tmp, _saddr);
+	if (iphc1 & LOWPAN_IPHC_SAC) {
+		/* Source address context based uncompression */
+		pr_debug("SAC bit is set. Handle context based source address.\n");
+		err = lowpan_uncompress_context_based_src_addr(
+				skb, &hdr.saddr, tmp);
+	} else {
+		/* Source address uncompression */
+		pr_debug("source address stateless compression\n");
+		err = lowpan_uncompress_addr(skb, &hdr.saddr, tmp, _saddr);
+	}
+
+	/* Check on error of previous branch */
 	if (err)
 		goto drop;
 
-- 
cgit v1.2.3


From ba3542e15cf8b9f3dee4c64a383c7102c0d8fab3 Mon Sep 17 00:00:00 2001
From: dingtianhong <dingtianhong@huawei.com>
Date: Sat, 17 Aug 2013 10:27:04 +0800
Subject: ipv6: convert the uses of ADBG and remove the superfluous parentheses

Just follow the Joe Perches's opinion, it is a better way to fix the
style errors.

Suggested-by: Joe Perches <joe@perches.com>
Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
Cc: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index ad12f7c43f25..3ca23e89f517 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -99,9 +99,9 @@
 #define ACONF_DEBUG 2
 
 #if ACONF_DEBUG >= 3
-#define ADBG(x) printk x
+#define ADBG(fmt, ...) printk(fmt, ##__VA_ARGS__)
 #else
-#define ADBG(x)
+#define ADBG(fmt, ...) do { if (0) printk(fmt, ##__VA_ARGS__); } while (0)
 #endif
 
 #define	INFINITY_LIFE_TIME	0xFFFFFFFF
@@ -374,9 +374,9 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
 	dev_hold(dev);
 
 	if (snmp6_alloc_dev(ndev) < 0) {
-		ADBG((KERN_WARNING
+		ADBG(KERN_WARNING
 			"%s: cannot allocate memory for statistics; dev=%s.\n",
-			__func__, dev->name));
+			__func__, dev->name);
 		neigh_parms_release(&nd_tbl, ndev->nd_parms);
 		dev_put(dev);
 		kfree(ndev);
@@ -384,9 +384,9 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
 	}
 
 	if (snmp6_register_dev(ndev) < 0) {
-		ADBG((KERN_WARNING
+		ADBG(KERN_WARNING
 			"%s: cannot create /proc/net/dev_snmp6/%s\n",
-			__func__, dev->name));
+			__func__, dev->name);
 		neigh_parms_release(&nd_tbl, ndev->nd_parms);
 		ndev->dead = 1;
 		in6_dev_finish_destroy(ndev);
@@ -849,7 +849,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
 
 	/* Ignore adding duplicate addresses on an interface */
 	if (ipv6_chk_same_addr(dev_net(idev->dev), addr, idev->dev)) {
-		ADBG(("ipv6_add_addr: already assigned\n"));
+		ADBG("ipv6_add_addr: already assigned\n");
 		err = -EEXIST;
 		goto out;
 	}
@@ -857,7 +857,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
 	ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC);
 
 	if (ifa == NULL) {
-		ADBG(("ipv6_add_addr: malloc failed\n"));
+		ADBG("ipv6_add_addr: malloc failed\n");
 		err = -ENOBUFS;
 		goto out;
 	}
@@ -2057,7 +2057,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
 	pinfo = (struct prefix_info *) opt;
 
 	if (len < sizeof(struct prefix_info)) {
-		ADBG(("addrconf: prefix option too short\n"));
+		ADBG("addrconf: prefix option too short\n");
 		return;
 	}
 
@@ -3637,8 +3637,8 @@ restart:
 	if (time_before(next_sched, jiffies + ADDRCONF_TIMER_FUZZ_MAX))
 		next_sched = jiffies + ADDRCONF_TIMER_FUZZ_MAX;
 
-	ADBG((KERN_DEBUG "now = %lu, schedule = %lu, rounded schedule = %lu => %lu\n",
-	      now, next, next_sec, next_sched));
+	ADBG(KERN_DEBUG "now = %lu, schedule = %lu, rounded schedule = %lu => %lu\n",
+	      now, next, next_sec, next_sched);
 
 	addr_chk_timer.expires = next_sched;
 	add_timer(&addr_chk_timer);
-- 
cgit v1.2.3


From df8372ca747f6da9e8590775721d9363c1dfc87e Mon Sep 17 00:00:00 2001
From: dingtianhong <dingtianhong@huawei.com>
Date: Sat, 17 Aug 2013 10:27:15 +0800
Subject: ipv6: fix checkpatch errors in net/ipv6/addrconf.c

ERROR: code indent should use tabs where possible: fix 2.
ERROR: do not use assignment in if condition: fix 5.

Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 3ca23e89f517..21638d415016 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1110,8 +1110,8 @@ retry:
 	spin_unlock_bh(&ifp->lock);
 
 	regen_advance = idev->cnf.regen_max_retry *
-	                idev->cnf.dad_transmits *
-	                idev->nd_parms->retrans_time / HZ;
+			idev->cnf.dad_transmits *
+			idev->nd_parms->retrans_time / HZ;
 	write_unlock(&idev->lock);
 
 	/* A temporary address is created only if this calculated Preferred
@@ -2501,7 +2501,8 @@ static int inet6_addr_del(struct net *net, int ifindex, const struct in6_addr *p
 	if (!dev)
 		return -ENODEV;
 
-	if ((idev = __in6_dev_get(dev)) == NULL)
+	idev = __in6_dev_get(dev);
+	if (!idev)
 		return -ENXIO;
 
 	read_lock_bh(&idev->lock);
@@ -2640,7 +2641,8 @@ static void init_loopback(struct net_device *dev)
 
 	ASSERT_RTNL();
 
-	if ((idev = ipv6_find_idev(dev)) == NULL) {
+	idev = ipv6_find_idev(dev);
+	if (!idev) {
 		pr_debug("%s: add_dev failed\n", __func__);
 		return;
 	}
@@ -2738,7 +2740,8 @@ static void addrconf_sit_config(struct net_device *dev)
 	 * our v4 addrs in the tunnel
 	 */
 
-	if ((idev = ipv6_find_idev(dev)) == NULL) {
+	idev = ipv6_find_idev(dev);
+	if (!idev) {
 		pr_debug("%s: add_dev failed\n", __func__);
 		return;
 	}
@@ -2770,7 +2773,8 @@ static void addrconf_gre_config(struct net_device *dev)
 
 	ASSERT_RTNL();
 
-	if ((idev = ipv6_find_idev(dev)) == NULL) {
+	idev = ipv6_find_idev(dev);
+	if (!idev) {
 		pr_debug("%s: add_dev failed\n", __func__);
 		return;
 	}
@@ -2801,11 +2805,11 @@ static void ip6_tnl_add_linklocal(struct inet6_dev *idev)
 	struct net *net = dev_net(idev->dev);
 
 	/* first try to inherit the link-local address from the link device */
-	if (idev->dev->iflink &&
-	    (link_dev = __dev_get_by_index(net, idev->dev->iflink))) {
-		if (!ipv6_inherit_linklocal(idev, link_dev))
+	if (idev->dev->iflink)
+		link_dev = __dev_get_by_index(net, idev->dev->iflink);
+		if (link_dev && !ipv6_inherit_linklocal(idev, link_dev))
 			return;
-	}
+
 	/* then try to inherit it from any device */
 	for_each_netdev(net, link_dev) {
 		if (!ipv6_inherit_linklocal(idev, link_dev))
-- 
cgit v1.2.3


From 397b41746333ad386d91d23ea0f79481320dcdcc Mon Sep 17 00:00:00 2001
From: Christoph Paasch <christoph.paasch@uclouvain.be>
Date: Sun, 18 Aug 2013 17:34:38 +0200
Subject: tcp: trivial: Remove nocache argument from tcp_v4_send_synack

The nocache-argument was used in tcp_v4_send_synack as an argument to
inet_csk_route_req. However, since ba3f7f04ef2b (ipv4: Kill
FLOWI_FLAG_RT_NOCACHE and associated code.) this is no more used.

This patch removes the unsued argument from tcp_v4_send_synack.

Signed-off-by: Christoph Paasch <christoph.paasch@uclouvain.be>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 05a3d45d3102..09d45d718973 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -821,8 +821,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
  */
 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 			      struct request_sock *req,
-			      u16 queue_mapping,
-			      bool nocache)
+			      u16 queue_mapping)
 {
 	const struct inet_request_sock *ireq = inet_rsk(req);
 	struct flowi4 fl4;
@@ -852,7 +851,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 
 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
 {
-	int res = tcp_v4_send_synack(sk, NULL, req, 0, false);
+	int res = tcp_v4_send_synack(sk, NULL, req, 0);
 
 	if (!res)
 		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
-- 
cgit v1.2.3


From 734d2725db879f3f6fcdc2b1d2a5deae105f5e95 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 18 Aug 2013 19:08:07 -0700
Subject: ipv4: raise IP_MAX_MTU to theoretical limit

As discussed last year [1], there is no compelling reason
to limit IPv4 MTU to 0xFFF0, while real limit is 0xFFFF

[1] : http://marc.info/?l=linux-netdev&m=135607247609434&w=2

Willem raised this issue again because some of our internal
regression tests broke after lo mtu being set to 65536.

IP_MTU reports 0xFFF0, and the test attempts to send a RAW datagram of
mtu + 1 bytes, expecting the send() to fail, but it does not.

Alexey raised interesting points about TCP MSS, that should be addressed
in follow-up patches in TCP stack if needed, as someone could also set
an odd mtu anyway.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index e805481eff72..727f4365bcdf 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -112,7 +112,8 @@
 #define RT_FL_TOS(oldflp4) \
 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 
-#define IP_MAX_MTU	0xFFF0
+/* IPv4 datagram length is stored into 16bit field (tot_len) */
+#define IP_MAX_MTU	0xFFFF
 
 #define RT_GC_TIMEOUT (300*HZ)
 
@@ -1227,10 +1228,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
 			mtu = 576;
 	}
 
-	if (mtu > IP_MAX_MTU)
-		mtu = IP_MAX_MTU;
-
-	return mtu;
+	return min_t(unsigned int, mtu, IP_MAX_MTU);
 }
 
 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
-- 
cgit v1.2.3


From ea857f28ab5ca82508e70ffbc8cb296705133f01 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Mon, 19 Aug 2013 10:05:10 +0300
Subject: ipip: dereferencing an ERR_PTR in ip_tunnel_init_net()

We need to move the derefernce after the IS_ERR() check.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_tunnel.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index a4d9126c7b51..24549b4aeae2 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -854,14 +854,14 @@ int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
 
 	rtnl_lock();
 	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
-	/* FB netdevice is special: we have one, and only one per netns.
-	 * Allowing to move it to another netns is clearly unsafe.
-	 */
-	itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
 	rtnl_unlock();
 
 	if (IS_ERR(itn->fb_tunnel_dev))
 		return PTR_ERR(itn->fb_tunnel_dev);
+	/* FB netdevice is special: we have one, and only one per netns.
+	 * Allowing to move it to another netns is clearly unsafe.
+	 */
+	itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
 
 	return 0;
 }
-- 
cgit v1.2.3


From 7eaa48a45ca74d8dbc65e7dedaa8c0736154fd5f Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 20 Aug 2013 23:44:39 -0700
Subject: Revert "ipv6: fix checkpatch errors in net/ipv6/addrconf.c"

This reverts commit df8372ca747f6da9e8590775721d9363c1dfc87e.

These changes are buggy and make unintended semantic changes
to ip6_tnl_add_linklocal().

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 21638d415016..3ca23e89f517 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1110,8 +1110,8 @@ retry:
 	spin_unlock_bh(&ifp->lock);
 
 	regen_advance = idev->cnf.regen_max_retry *
-			idev->cnf.dad_transmits *
-			idev->nd_parms->retrans_time / HZ;
+	                idev->cnf.dad_transmits *
+	                idev->nd_parms->retrans_time / HZ;
 	write_unlock(&idev->lock);
 
 	/* A temporary address is created only if this calculated Preferred
@@ -2501,8 +2501,7 @@ static int inet6_addr_del(struct net *net, int ifindex, const struct in6_addr *p
 	if (!dev)
 		return -ENODEV;
 
-	idev = __in6_dev_get(dev);
-	if (!idev)
+	if ((idev = __in6_dev_get(dev)) == NULL)
 		return -ENXIO;
 
 	read_lock_bh(&idev->lock);
@@ -2641,8 +2640,7 @@ static void init_loopback(struct net_device *dev)
 
 	ASSERT_RTNL();
 
-	idev = ipv6_find_idev(dev);
-	if (!idev) {
+	if ((idev = ipv6_find_idev(dev)) == NULL) {
 		pr_debug("%s: add_dev failed\n", __func__);
 		return;
 	}
@@ -2740,8 +2738,7 @@ static void addrconf_sit_config(struct net_device *dev)
 	 * our v4 addrs in the tunnel
 	 */
 
-	idev = ipv6_find_idev(dev);
-	if (!idev) {
+	if ((idev = ipv6_find_idev(dev)) == NULL) {
 		pr_debug("%s: add_dev failed\n", __func__);
 		return;
 	}
@@ -2773,8 +2770,7 @@ static void addrconf_gre_config(struct net_device *dev)
 
 	ASSERT_RTNL();
 
-	idev = ipv6_find_idev(dev);
-	if (!idev) {
+	if ((idev = ipv6_find_idev(dev)) == NULL) {
 		pr_debug("%s: add_dev failed\n", __func__);
 		return;
 	}
@@ -2805,11 +2801,11 @@ static void ip6_tnl_add_linklocal(struct inet6_dev *idev)
 	struct net *net = dev_net(idev->dev);
 
 	/* first try to inherit the link-local address from the link device */
-	if (idev->dev->iflink)
-		link_dev = __dev_get_by_index(net, idev->dev->iflink);
-		if (link_dev && !ipv6_inherit_linklocal(idev, link_dev))
+	if (idev->dev->iflink &&
+	    (link_dev = __dev_get_by_index(net, idev->dev->iflink))) {
+		if (!ipv6_inherit_linklocal(idev, link_dev))
 			return;
-
+	}
 	/* then try to inherit it from any device */
 	for_each_netdev(net, link_dev) {
 		if (!ipv6_inherit_linklocal(idev, link_dev))
-- 
cgit v1.2.3


From e837735ec406a347756e0ef3aea124a6d51ea1ba Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Tue, 20 Aug 2013 12:16:06 +0200
Subject: ip6_tunnel: ensure to always have a link local address

When an Xin6 tunnel is set up, we check other netdevices to inherit the link-
local address. If none is available, the interface will not have any link-local
address. RFC4862 expects that each interface has a link local address.

Now than this kind of tunnels supports x-netns, it's easy to fall in this case
(by creating the tunnel in a netns where ethernet interfaces stand and then
moving it to a other netns where no ethernet interface is available).

RFC4291, Appendix A suggests two methods: the first is the one currently
implemented, the second is to generate a unique identifier, so that we can
always generate the link-local address. Let's use eth_random_addr() to generate
this interface indentifier.

I remove completly the previous method, hence for the whole life of the
interface, the link-local address remains the same (previously, it depends on
which ethernet interfaces were up when the tunnel interface was set up).

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c   | 56 +++++++++++++--------------------------------------
 net/ipv6/ip6_tunnel.c |  4 ++++
 2 files changed, 18 insertions(+), 42 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 3ca23e89f517..8549a5d0d167 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1814,6 +1814,16 @@ static int addrconf_ifid_gre(u8 *eui, struct net_device *dev)
 	return __ipv6_isatap_ifid(eui, *(__be32 *)dev->dev_addr);
 }
 
+static int addrconf_ifid_ip6tnl(u8 *eui, struct net_device *dev)
+{
+	memcpy(eui, dev->perm_addr, 3);
+	memcpy(eui + 5, dev->perm_addr + 3, 3);
+	eui[3] = 0xFF;
+	eui[4] = 0xFE;
+	eui[0] ^= 2;
+	return 0;
+}
+
 static int ipv6_generate_eui64(u8 *eui, struct net_device *dev)
 {
 	switch (dev->type) {
@@ -1832,6 +1842,8 @@ static int ipv6_generate_eui64(u8 *eui, struct net_device *dev)
 		return addrconf_ifid_eui64(eui, dev);
 	case ARPHRD_IEEE1394:
 		return addrconf_ifid_ieee1394(eui, dev);
+	case ARPHRD_TUNNEL6:
+		return addrconf_ifid_ip6tnl(eui, dev);
 	}
 	return -1;
 }
@@ -2709,7 +2721,8 @@ static void addrconf_dev_config(struct net_device *dev)
 	    (dev->type != ARPHRD_ARCNET) &&
 	    (dev->type != ARPHRD_INFINIBAND) &&
 	    (dev->type != ARPHRD_IEEE802154) &&
-	    (dev->type != ARPHRD_IEEE1394)) {
+	    (dev->type != ARPHRD_IEEE1394) &&
+	    (dev->type != ARPHRD_TUNNEL6)) {
 		/* Alas, we support only Ethernet autoconfiguration. */
 		return;
 	}
@@ -2795,44 +2808,6 @@ ipv6_inherit_linklocal(struct inet6_dev *idev, struct net_device *link_dev)
 	return -1;
 }
 
-static void ip6_tnl_add_linklocal(struct inet6_dev *idev)
-{
-	struct net_device *link_dev;
-	struct net *net = dev_net(idev->dev);
-
-	/* first try to inherit the link-local address from the link device */
-	if (idev->dev->iflink &&
-	    (link_dev = __dev_get_by_index(net, idev->dev->iflink))) {
-		if (!ipv6_inherit_linklocal(idev, link_dev))
-			return;
-	}
-	/* then try to inherit it from any device */
-	for_each_netdev(net, link_dev) {
-		if (!ipv6_inherit_linklocal(idev, link_dev))
-			return;
-	}
-	pr_debug("init ip6-ip6: add_linklocal failed\n");
-}
-
-/*
- * Autoconfigure tunnel with a link-local address so routing protocols,
- * DHCPv6, MLD etc. can be run over the virtual link
- */
-
-static void addrconf_ip6_tnl_config(struct net_device *dev)
-{
-	struct inet6_dev *idev;
-
-	ASSERT_RTNL();
-
-	idev = addrconf_add_dev(dev);
-	if (IS_ERR(idev)) {
-		pr_debug("init ip6-ip6: add_dev failed\n");
-		return;
-	}
-	ip6_tnl_add_linklocal(idev);
-}
-
 static int addrconf_notify(struct notifier_block *this, unsigned long event,
 			   void *ptr)
 {
@@ -2900,9 +2875,6 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
 			addrconf_gre_config(dev);
 			break;
 #endif
-		case ARPHRD_TUNNEL6:
-			addrconf_ip6_tnl_config(dev);
-			break;
 		case ARPHRD_LOOPBACK:
 			init_loopback(dev);
 			break;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index cc3bb201b8b0..d6e00a39274c 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -41,6 +41,7 @@
 #include <linux/netfilter_ipv6.h>
 #include <linux/slab.h>
 #include <linux/hash.h>
+#include <linux/etherdevice.h>
 
 #include <asm/uaccess.h>
 #include <linux/atomic.h>
@@ -1471,6 +1472,9 @@ static void ip6_tnl_dev_setup(struct net_device *dev)
 	dev->flags |= IFF_NOARP;
 	dev->addr_len = sizeof(struct in6_addr);
 	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+	/* This perm addr will be used as interface identifier by IPv6 */
+	dev->addr_assign_type = NET_ADDR_RANDOM;
+	eth_random_addr(dev->perm_addr);
 }
 
 
-- 
cgit v1.2.3


From 846989635b368c41efd9079ec6fe0e8d2845ec11 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Tue, 20 Aug 2013 12:22:00 +0200
Subject: net: ipv6: igmp6_event_query: use msecs_to_jiffies

Use proper API functions to calculate jiffies from milliseconds and
not the crude method of dividing HZ by a value. This ensures more
accurate values even in the case of strange HZ values. While at it,
also simplify code in the mlh2 case by using max().

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/mcast.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 6c76df9909bf..57863e206d56 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1150,9 +1150,7 @@ int igmp6_event_query(struct sk_buff *skb)
 		int switchback;
 		/* MLDv1 router present */
 
-		/* Translate milliseconds to jiffies */
-		max_delay = (ntohs(mld->mld_maxdelay)*HZ)/1000;
-
+		max_delay = msecs_to_jiffies(ntohs(mld->mld_maxdelay));
 		switchback = (idev->mc_qrv + 1) * max_delay;
 		idev->mc_v1_seen = jiffies + switchback;
 
@@ -1169,10 +1167,11 @@ int igmp6_event_query(struct sk_buff *skb)
 			return -EINVAL;
 
 		mlh2 = (struct mld2_query *)skb_transport_header(skb);
-		max_delay = (MLDV2_MRC(ntohs(mlh2->mld2q_mrc))*HZ)/1000;
-		if (!max_delay)
-			max_delay = 1;
+
+		max_delay = max(msecs_to_jiffies(MLDV2_MRC(ntohs(mlh2->mld2q_mrc))), 1UL);
+
 		idev->mc_maxdelay = max_delay;
+
 		if (mlh2->mld2q_qrv)
 			idev->mc_qrv = mlh2->mld2q_qrv;
 		if (group_type == IPV6_ADDR_ANY) { /* general query */
-- 
cgit v1.2.3


From c2cef4e88808cc566956605a39a96080912c0566 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Tue, 20 Aug 2013 12:22:01 +0200
Subject: net: ipv6: minor: *_start_timer: rather use unsigned long

For the functions mld_gq_start_timer(), mld_ifc_start_timer(),
and mld_dad_start_timer(), rather use unsigned long than int
as we operate only on unsigned values anyway. This seems more
appropriate as there is no good reason to do type conversions
to int, that could lead to future errors.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/mcast.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 57863e206d56..33f98d9ac376 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -996,24 +996,24 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group,
 
 static void mld_gq_start_timer(struct inet6_dev *idev)
 {
-	int tv = net_random() % idev->mc_maxdelay;
+	unsigned long tv = net_random() % idev->mc_maxdelay;
 
 	idev->mc_gq_running = 1;
 	if (!mod_timer(&idev->mc_gq_timer, jiffies+tv+2))
 		in6_dev_hold(idev);
 }
 
-static void mld_ifc_start_timer(struct inet6_dev *idev, int delay)
+static void mld_ifc_start_timer(struct inet6_dev *idev, unsigned long delay)
 {
-	int tv = net_random() % delay;
+	unsigned long tv = net_random() % delay;
 
 	if (!mod_timer(&idev->mc_ifc_timer, jiffies+tv+2))
 		in6_dev_hold(idev);
 }
 
-static void mld_dad_start_timer(struct inet6_dev *idev, int delay)
+static void mld_dad_start_timer(struct inet6_dev *idev, unsigned long delay)
 {
-	int tv = net_random() % delay;
+	unsigned long tv = net_random() % delay;
 
 	if (!mod_timer(&idev->mc_dad_timer, jiffies+tv+2))
 		in6_dev_hold(idev);
-- 
cgit v1.2.3


From 9fd0784164047583fa856c9a5aeda1d6c6fb0399 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Tue, 20 Aug 2013 12:22:02 +0200
Subject: net: ipv6: mcast: minor: use defines for rfc3810/8.1 lengths

Instead of hard-coding length values, use a define to make it clear
where those lengths come from.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/mcast.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 33f98d9ac376..98ead2b1a669 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -107,9 +107,12 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca,
 static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml,
 			    struct inet6_dev *idev);
 
-
 #define MLD_QRV_DEFAULT		2
 
+/* RFC3810, 8.1 Query Version Distinctions */
+#define MLD_V1_QUERY_LEN	24
+#define MLD_V2_QUERY_LEN_MIN	28
+
 #define MLD_V1_SEEN(idev) (dev_net((idev)->dev)->ipv6.devconf_all->force_mld_version == 1 || \
 		(idev)->cnf.force_mld_version == 1 || \
 		((idev)->mc_v1_seen && \
@@ -1146,7 +1149,7 @@ int igmp6_event_query(struct sk_buff *skb)
 	    !(group_type&IPV6_ADDR_MULTICAST))
 		return -EINVAL;
 
-	if (len == 24) {
+	if (len == MLD_V1_QUERY_LEN) {
 		int switchback;
 		/* MLDv1 router present */
 
@@ -1160,7 +1163,7 @@ int igmp6_event_query(struct sk_buff *skb)
 			__in6_dev_put(idev);
 		/* clear deleted report items */
 		mld_clear_delrec(idev);
-	} else if (len >= 28) {
+	} else if (len >= MLD_V2_QUERY_LEN_MIN) {
 		int srcs_offset = sizeof(struct mld2_query) -
 				  sizeof(struct icmp6hdr);
 		if (!pskb_may_pull(skb, srcs_offset))
-- 
cgit v1.2.3


From 9d225d2208a6f17da5987ff4e7710b9e805cb5d6 Mon Sep 17 00:00:00 2001
From: Johan Hedberg <johan.hedberg@intel.com>
Date: Thu, 8 Aug 2013 14:53:56 +0300
Subject: Bluetooth: Fix getting SCO socket options in deferred state

When a socket is in deferred state there does actually exist an
underlying connection even though the connection state is not yet
BT_CONNECTED. In the deferred state it should therefore be allowed to
get socket options that usually depend on a connection, such as
SCO_OPTIONS and SCO_CONNINFO.

This patch fixes the behavior of some user space code that behaves as
follows without it:

$ sudo tools/btiotest -i 00:1B:DC:xx:xx:xx -d -s
accept=2 reject=-1 discon=-1 defer=1 sec=0 update_sec=0 prio=0 voice=0x0000
Listening for SCO connections
bt_io_get(OPT_DEST): getsockopt(SCO_OPTIONS): Transport endpoint is not connected (107)
Accepting connection
Successfully connected to 60:D8:19:xx:xx:xx. handle=43, class=000000

The conditions that the patch updates the if-statements to is taken from
similar code in l2cap_sock.c which correctly handles the deferred state.

Signed-off-by: Johan Hedberg <johan.hedberg@intel.com>
Signed-off-by: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
---
 net/bluetooth/sco.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index e7bd4eea575c..2de7150a6304 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -765,7 +765,9 @@ static int sco_sock_getsockopt_old(struct socket *sock, int optname, char __user
 
 	switch (optname) {
 	case SCO_OPTIONS:
-		if (sk->sk_state != BT_CONNECTED) {
+		if (sk->sk_state != BT_CONNECTED &&
+		    !(sk->sk_state == BT_CONNECT2 &&
+		      test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags))) {
 			err = -ENOTCONN;
 			break;
 		}
@@ -781,7 +783,9 @@ static int sco_sock_getsockopt_old(struct socket *sock, int optname, char __user
 		break;
 
 	case SCO_CONNINFO:
-		if (sk->sk_state != BT_CONNECTED) {
+		if (sk->sk_state != BT_CONNECTED &&
+		    !(sk->sk_state == BT_CONNECT2 &&
+		      test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags))) {
 			err = -ENOTCONN;
 			break;
 		}
-- 
cgit v1.2.3


From c7882cbd1151011ca8e6fb13530cd09eae1c39ee Mon Sep 17 00:00:00 2001
From: Marcel Holtmann <marcel@holtmann.org>
Date: Tue, 13 Aug 2013 10:00:54 -0700
Subject: Bluetooth: Set different event mask for LE-only controllers

In case of a Low Energy only controller it makes no sense to configure
the full BR/EDR event mask. It will just enable events that can not be
send anyway and there is no guarantee that such a controller will accept
this value.

Use event mask 0x90 0xe8 0x04 0x02 0x00 0x80 0x00 0x20 for LE-only
controllers which enables the following events:

          Disconnection Complete
          Encryption Change
          Read Remote Version Information Complete
          Command Complete
          Command Status
          Hardware Error
          Number of Completed Packets
          Data Buffer Overflow
          Encryption Key Refresh Complete
          LE Meta

This is according to Core Specification, Part E, Section 3.

Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
---
 net/bluetooth/hci_core.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'net')

diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index b821b199b333..8d9b87df292f 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -454,6 +454,18 @@ static void hci_setup_event_mask(struct hci_request *req)
 		events[4] |= 0x04; /* Read Remote Extended Features Complete */
 		events[5] |= 0x08; /* Synchronous Connection Complete */
 		events[5] |= 0x10; /* Synchronous Connection Changed */
+	} else {
+		/* Use a different default for LE-only devices */
+		memset(events, 0, sizeof(events));
+		events[0] |= 0x10; /* Disconnection Complete */
+		events[0] |= 0x80; /* Encryption Change */
+		events[1] |= 0x08; /* Read Remote Version Information Complete */
+		events[1] |= 0x20; /* Command Complete */
+		events[1] |= 0x40; /* Command Status */
+		events[1] |= 0x80; /* Hardware Error */
+		events[2] |= 0x04; /* Number of Completed Packets */
+		events[3] |= 0x02; /* Data Buffer Overflow */
+		events[5] |= 0x80; /* Encryption Key Refresh Complete */
 	}
 
 	if (lmp_inq_rssi_capable(hdev))
-- 
cgit v1.2.3


From 396dc223dd36edd218650d042a07c5e61f022c5b Mon Sep 17 00:00:00 2001
From: Gianluca Anzolin <gianluca@sottospazio.it>
Date: Mon, 29 Jul 2013 17:08:08 +0200
Subject: Bluetooth: Take proper tty_struct references

In net/bluetooth/rfcomm/tty.c the struct tty_struct is used without
taking references. This may lead to a use-after-free of the rfcomm tty.

Fix this by taking references properly, using the tty_port_* helpers
when possible.

The raw assignments of dev->port.tty in rfcomm_tty_open/close are
addressed in the later commit 'rfcomm: Implement .activate, .shutdown
and .carrier_raised methods'.

Signed-off-by: Gianluca Anzolin <gianluca@sottospazio.it>
Reviewed-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
---
 net/bluetooth/rfcomm/tty.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index b6e44ad6cca6..cd7ff370be38 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -333,10 +333,9 @@ static inline unsigned int rfcomm_room(struct rfcomm_dlc *dlc)
 static void rfcomm_wfree(struct sk_buff *skb)
 {
 	struct rfcomm_dev *dev = (void *) skb->sk;
-	struct tty_struct *tty = dev->port.tty;
 	atomic_sub(skb->truesize, &dev->wmem_alloc);
-	if (test_bit(RFCOMM_TTY_ATTACHED, &dev->flags) && tty)
-		tty_wakeup(tty);
+	if (test_bit(RFCOMM_TTY_ATTACHED, &dev->flags))
+		tty_port_tty_wakeup(&dev->port);
 	tty_port_put(&dev->port);
 }
 
@@ -410,6 +409,7 @@ static int rfcomm_release_dev(void __user *arg)
 {
 	struct rfcomm_dev_req req;
 	struct rfcomm_dev *dev;
+	struct tty_struct *tty;
 
 	if (copy_from_user(&req, arg, sizeof(req)))
 		return -EFAULT;
@@ -429,8 +429,11 @@ static int rfcomm_release_dev(void __user *arg)
 		rfcomm_dlc_close(dev->dlc, 0);
 
 	/* Shut down TTY synchronously before freeing rfcomm_dev */
-	if (dev->port.tty)
-		tty_vhangup(dev->port.tty);
+	tty = tty_port_tty_get(&dev->port);
+	if (tty) {
+		tty_vhangup(tty);
+		tty_kref_put(tty);
+	}
 
 	if (!test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags))
 		rfcomm_dev_del(dev);
@@ -563,6 +566,7 @@ static void rfcomm_dev_data_ready(struct rfcomm_dlc *dlc, struct sk_buff *skb)
 static void rfcomm_dev_state_change(struct rfcomm_dlc *dlc, int err)
 {
 	struct rfcomm_dev *dev = dlc->owner;
+	struct tty_struct *tty;
 	if (!dev)
 		return;
 
@@ -572,7 +576,8 @@ static void rfcomm_dev_state_change(struct rfcomm_dlc *dlc, int err)
 	wake_up_interruptible(&dev->wait);
 
 	if (dlc->state == BT_CLOSED) {
-		if (!dev->port.tty) {
+		tty = tty_port_tty_get(&dev->port);
+		if (!tty) {
 			if (test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags)) {
 				/* Drop DLC lock here to avoid deadlock
 				 * 1. rfcomm_dev_get will take rfcomm_dev_lock
@@ -591,8 +596,10 @@ static void rfcomm_dev_state_change(struct rfcomm_dlc *dlc, int err)
 				tty_port_put(&dev->port);
 				rfcomm_dlc_lock(dlc);
 			}
-		} else
-			tty_hangup(dev->port.tty);
+		} else {
+			tty_hangup(tty);
+			tty_kref_put(tty);
+		}
 	}
 }
 
@@ -604,10 +611,8 @@ static void rfcomm_dev_modem_status(struct rfcomm_dlc *dlc, u8 v24_sig)
 
 	BT_DBG("dlc %p dev %p v24_sig 0x%02x", dlc, dev, v24_sig);
 
-	if ((dev->modem_status & TIOCM_CD) && !(v24_sig & RFCOMM_V24_DV)) {
-		if (dev->port.tty && !C_CLOCAL(dev->port.tty))
-			tty_hangup(dev->port.tty);
-	}
+	if ((dev->modem_status & TIOCM_CD) && !(v24_sig & RFCOMM_V24_DV))
+		tty_port_tty_hangup(&dev->port, true);
 
 	dev->modem_status =
 		((v24_sig & RFCOMM_V24_RTC) ? (TIOCM_DSR | TIOCM_DTR) : 0) |
-- 
cgit v1.2.3


From ebe937f74b8a72cf3ceeae5c2194a160bb092901 Mon Sep 17 00:00:00 2001
From: Gianluca Anzolin <gianluca@sottospazio.it>
Date: Mon, 29 Jul 2013 17:08:09 +0200
Subject: Bluetooth: Remove the device from the list in the destructor

The current code removes the device from the device list in several
places. Do it only in the destructor instead and in the error path of
rfcomm_add_dev() if the device couldn't be initialized.

Signed-off-by: Gianluca Anzolin <gianluca@sottospazio.it>
Reviewed-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
---
 net/bluetooth/rfcomm/tty.c | 27 ++++++---------------------
 1 file changed, 6 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index cd7ff370be38..9c0e142041bd 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -76,13 +76,6 @@ static void rfcomm_dev_modem_status(struct rfcomm_dlc *dlc, u8 v24_sig);
 
 /* ---- Device functions ---- */
 
-/*
- * The reason this isn't actually a race, as you no doubt have a little voice
- * screaming at you in your head, is that the refcount should never actually
- * reach zero unless the device has already been taken off the list, in
- * rfcomm_dev_del(). And if that's not true, we'll hit the BUG() in
- * rfcomm_dev_destruct() anyway.
- */
 static void rfcomm_dev_destruct(struct tty_port *port)
 {
 	struct rfcomm_dev *dev = container_of(port, struct rfcomm_dev, port);
@@ -90,10 +83,9 @@ static void rfcomm_dev_destruct(struct tty_port *port)
 
 	BT_DBG("dev %p dlc %p", dev, dlc);
 
-	/* Refcount should only hit zero when called from rfcomm_dev_del()
-	   which will have taken us off the list. Everything else are
-	   refcounting bugs. */
-	BUG_ON(!list_empty(&dev->list));
+	spin_lock(&rfcomm_dev_lock);
+	list_del(&dev->list);
+	spin_unlock(&rfcomm_dev_lock);
 
 	rfcomm_dlc_lock(dlc);
 	/* Detach DLC if it's owned by this dev */
@@ -282,7 +274,9 @@ out:
 			dev->id, NULL);
 	if (IS_ERR(dev->tty_dev)) {
 		err = PTR_ERR(dev->tty_dev);
+		spin_lock(&rfcomm_dev_lock);
 		list_del(&dev->list);
+		spin_unlock(&rfcomm_dev_lock);
 		goto free;
 	}
 
@@ -315,10 +309,6 @@ static void rfcomm_dev_del(struct rfcomm_dev *dev)
 	}
 	spin_unlock_irqrestore(&dev->port.lock, flags);
 
-	spin_lock(&rfcomm_dev_lock);
-	list_del_init(&dev->list);
-	spin_unlock(&rfcomm_dev_lock);
-
 	tty_port_put(&dev->port);
 }
 
@@ -750,13 +740,8 @@ static void rfcomm_tty_close(struct tty_struct *tty, struct file *filp)
 		dev->port.tty = NULL;
 		rfcomm_dlc_unlock(dev->dlc);
 
-		if (test_bit(RFCOMM_TTY_RELEASED, &dev->flags)) {
-			spin_lock(&rfcomm_dev_lock);
-			list_del_init(&dev->list);
-			spin_unlock(&rfcomm_dev_lock);
-
+		if (test_bit(RFCOMM_TTY_RELEASED, &dev->flags))
 			tty_port_put(&dev->port);
-		}
 	} else
 		spin_unlock_irqrestore(&dev->port.lock, flags);
 
-- 
cgit v1.2.3


From 54b926a1434e817ca84cb090f36b56763e192470 Mon Sep 17 00:00:00 2001
From: Gianluca Anzolin <gianluca@sottospazio.it>
Date: Mon, 29 Jul 2013 17:08:10 +0200
Subject: Bluetooth: Move the tty initialization and cleanup out of open/close

Move the tty_struct initialization from rfcomm_tty_open() to
rfcomm_tty_install() and do the same for the cleanup moving the code from
rfcomm_tty_close() to rfcomm_tty_cleanup().

Add also extra error handling in rfcomm_tty_install() because, unlike
.open()/.close(), .cleanup() is not called if .install() fails.

Signed-off-by: Gianluca Anzolin <gianluca@sottospazio.it>
Reviewed-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
---
 net/bluetooth/rfcomm/tty.c | 114 ++++++++++++++++++++++++++++-----------------
 1 file changed, 72 insertions(+), 42 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 9c0e142041bd..73dd61530130 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -633,49 +633,61 @@ static void rfcomm_tty_copy_pending(struct rfcomm_dev *dev)
 		tty_flip_buffer_push(&dev->port);
 }
 
-static int rfcomm_tty_open(struct tty_struct *tty, struct file *filp)
+/* do the reverse of install, clearing the tty fields and releasing the
+ * reference to tty_port
+ */
+static void rfcomm_tty_cleanup(struct tty_struct *tty)
+{
+	struct rfcomm_dev *dev = tty->driver_data;
+
+	if (dev->tty_dev->parent)
+		device_move(dev->tty_dev, NULL, DPM_ORDER_DEV_LAST);
+
+	/* Close DLC and dettach TTY */
+	rfcomm_dlc_close(dev->dlc, 0);
+
+	clear_bit(RFCOMM_TTY_ATTACHED, &dev->flags);
+
+	rfcomm_dlc_lock(dev->dlc);
+	tty->driver_data = NULL;
+	dev->port.tty = NULL;
+	rfcomm_dlc_unlock(dev->dlc);
+
+	tty_port_put(&dev->port);
+}
+
+/* we acquire the tty_port reference since it's here the tty is first used
+ * by setting the termios. We also populate the driver_data field and install
+ * the tty port
+ */
+static int rfcomm_tty_install(struct tty_driver *driver, struct tty_struct *tty)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	struct rfcomm_dev *dev;
 	struct rfcomm_dlc *dlc;
-	unsigned long flags;
-	int err, id;
-
-	id = tty->index;
+	int err;
 
-	BT_DBG("tty %p id %d", tty, id);
-
-	/* We don't leak this refcount. For reasons which are not entirely
-	   clear, the TTY layer will call our ->close() method even if the
-	   open fails. We decrease the refcount there, and decreasing it
-	   here too would cause breakage. */
-	dev = rfcomm_dev_get(id);
+	dev = rfcomm_dev_get(tty->index);
 	if (!dev)
 		return -ENODEV;
 
-	BT_DBG("dev %p dst %pMR channel %d opened %d", dev, &dev->dst,
-	       dev->channel, dev->port.count);
-
-	spin_lock_irqsave(&dev->port.lock, flags);
-	if (++dev->port.count > 1) {
-		spin_unlock_irqrestore(&dev->port.lock, flags);
-		return 0;
-	}
-	spin_unlock_irqrestore(&dev->port.lock, flags);
-
 	dlc = dev->dlc;
 
 	/* Attach TTY and open DLC */
-
 	rfcomm_dlc_lock(dlc);
 	tty->driver_data = dev;
 	dev->port.tty = tty;
 	rfcomm_dlc_unlock(dlc);
 	set_bit(RFCOMM_TTY_ATTACHED, &dev->flags);
 
+	/* install the tty_port */
+	err = tty_port_install(&dev->port, driver, tty);
+	if (err < 0)
+		goto error_no_dlc;
+
 	err = rfcomm_dlc_open(dlc, &dev->src, &dev->dst, dev->channel);
 	if (err < 0)
-		return err;
+		goto error_no_dlc;
 
 	/* Wait for DLC to connect */
 	add_wait_queue(&dev->wait, &wait);
@@ -702,15 +714,45 @@ static int rfcomm_tty_open(struct tty_struct *tty, struct file *filp)
 	set_current_state(TASK_RUNNING);
 	remove_wait_queue(&dev->wait, &wait);
 
-	if (err == 0)
-		device_move(dev->tty_dev, rfcomm_get_device(dev),
-			    DPM_ORDER_DEV_AFTER_PARENT);
+	if (err < 0)
+		goto error_no_connection;
+
+	device_move(dev->tty_dev, rfcomm_get_device(dev),
+		    DPM_ORDER_DEV_AFTER_PARENT);
+	return 0;
+
+error_no_connection:
+	rfcomm_dlc_close(dlc, err);
+error_no_dlc:
+	clear_bit(RFCOMM_TTY_ATTACHED, &dev->flags);
+	tty_port_put(&dev->port);
+	return err;
+}
+
+static int rfcomm_tty_open(struct tty_struct *tty, struct file *filp)
+{
+	struct rfcomm_dev *dev = tty->driver_data;
+	unsigned long flags;
+
+	BT_DBG("tty %p id %d", tty, tty->index);
 
+	BT_DBG("dev %p dst %pMR channel %d opened %d", dev, &dev->dst,
+	       dev->channel, dev->port.count);
+
+	spin_lock_irqsave(&dev->port.lock, flags);
+	dev->port.count++;
+	spin_unlock_irqrestore(&dev->port.lock, flags);
+
+	/*
+	 * FIXME: rfcomm should use proper flow control for
+	 * received data. This hack will be unnecessary and can
+	 * be removed when that's implemented
+	 */
 	rfcomm_tty_copy_pending(dev);
 
 	rfcomm_dlc_unthrottle(dev->dlc);
 
-	return err;
+	return 0;
 }
 
 static void rfcomm_tty_close(struct tty_struct *tty, struct file *filp)
@@ -727,25 +769,11 @@ static void rfcomm_tty_close(struct tty_struct *tty, struct file *filp)
 	spin_lock_irqsave(&dev->port.lock, flags);
 	if (!--dev->port.count) {
 		spin_unlock_irqrestore(&dev->port.lock, flags);
-		if (dev->tty_dev->parent)
-			device_move(dev->tty_dev, NULL, DPM_ORDER_DEV_LAST);
-
-		/* Close DLC and dettach TTY */
-		rfcomm_dlc_close(dev->dlc, 0);
-
-		clear_bit(RFCOMM_TTY_ATTACHED, &dev->flags);
-
-		rfcomm_dlc_lock(dev->dlc);
-		tty->driver_data = NULL;
-		dev->port.tty = NULL;
-		rfcomm_dlc_unlock(dev->dlc);
 
 		if (test_bit(RFCOMM_TTY_RELEASED, &dev->flags))
 			tty_port_put(&dev->port);
 	} else
 		spin_unlock_irqrestore(&dev->port.lock, flags);
-
-	tty_port_put(&dev->port);
 }
 
 static int rfcomm_tty_write(struct tty_struct *tty, const unsigned char *buf, int count)
@@ -1118,6 +1146,8 @@ static const struct tty_operations rfcomm_ops = {
 	.wait_until_sent	= rfcomm_tty_wait_until_sent,
 	.tiocmget		= rfcomm_tty_tiocmget,
 	.tiocmset		= rfcomm_tty_tiocmset,
+	.install                = rfcomm_tty_install,
+	.cleanup                = rfcomm_tty_cleanup,
 };
 
 int __init rfcomm_init_ttys(void)
-- 
cgit v1.2.3


From cad348a17e170451ea8688b532a6ca3e98c63b60 Mon Sep 17 00:00:00 2001
From: Gianluca Anzolin <gianluca@sottospazio.it>
Date: Mon, 29 Jul 2013 17:08:11 +0200
Subject: Bluetooth: Implement .activate, .shutdown and .carrier_raised methods

Implement .activate, .shutdown and .carrier_raised methods of tty_port
to manage the dlc, moving the code from rfcomm_tty_install() and
rfcomm_tty_cleanup() functions.

At the same time the tty .open()/.close() and .hangup() methods are
changed to use the tty_port helpers that properly call the
aforementioned tty_port methods.

Signed-off-by: Gianluca Anzolin <gianluca@sottospazio.it>
Reviewed-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
---
 net/bluetooth/rfcomm/tty.c | 117 ++++++++++++++++++---------------------------
 1 file changed, 47 insertions(+), 70 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 73dd61530130..583f7135c811 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -58,7 +58,6 @@ struct rfcomm_dev {
 	uint			modem_status;
 
 	struct rfcomm_dlc	*dlc;
-	wait_queue_head_t       wait;
 
 	struct device		*tty_dev;
 
@@ -104,8 +103,39 @@ static void rfcomm_dev_destruct(struct tty_port *port)
 	module_put(THIS_MODULE);
 }
 
+/* device-specific initialization: open the dlc */
+static int rfcomm_dev_activate(struct tty_port *port, struct tty_struct *tty)
+{
+	struct rfcomm_dev *dev = container_of(port, struct rfcomm_dev, port);
+
+	return rfcomm_dlc_open(dev->dlc, &dev->src, &dev->dst, dev->channel);
+}
+
+/* we block the open until the dlc->state becomes BT_CONNECTED */
+static int rfcomm_dev_carrier_raised(struct tty_port *port)
+{
+	struct rfcomm_dev *dev = container_of(port, struct rfcomm_dev, port);
+
+	return (dev->dlc->state == BT_CONNECTED);
+}
+
+/* device-specific cleanup: close the dlc */
+static void rfcomm_dev_shutdown(struct tty_port *port)
+{
+	struct rfcomm_dev *dev = container_of(port, struct rfcomm_dev, port);
+
+	if (dev->tty_dev->parent)
+		device_move(dev->tty_dev, NULL, DPM_ORDER_DEV_LAST);
+
+	/* close the dlc */
+	rfcomm_dlc_close(dev->dlc, 0);
+}
+
 static const struct tty_port_operations rfcomm_port_ops = {
 	.destruct = rfcomm_dev_destruct,
+	.activate = rfcomm_dev_activate,
+	.shutdown = rfcomm_dev_shutdown,
+	.carrier_raised = rfcomm_dev_carrier_raised,
 };
 
 static struct rfcomm_dev *__rfcomm_dev_get(int id)
@@ -228,7 +258,6 @@ static int rfcomm_dev_add(struct rfcomm_dev_req *req, struct rfcomm_dlc *dlc)
 
 	tty_port_init(&dev->port);
 	dev->port.ops = &rfcomm_port_ops;
-	init_waitqueue_head(&dev->wait);
 
 	skb_queue_head_init(&dev->pending);
 
@@ -563,9 +592,12 @@ static void rfcomm_dev_state_change(struct rfcomm_dlc *dlc, int err)
 	BT_DBG("dlc %p dev %p err %d", dlc, dev, err);
 
 	dev->err = err;
-	wake_up_interruptible(&dev->wait);
+	if (dlc->state == BT_CONNECTED) {
+		device_move(dev->tty_dev, rfcomm_get_device(dev),
+			    DPM_ORDER_DEV_AFTER_PARENT);
 
-	if (dlc->state == BT_CLOSED) {
+		wake_up_interruptible(&dev->port.open_wait);
+	} else if (dlc->state == BT_CLOSED) {
 		tty = tty_port_tty_get(&dev->port);
 		if (!tty) {
 			if (test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags)) {
@@ -640,17 +672,10 @@ static void rfcomm_tty_cleanup(struct tty_struct *tty)
 {
 	struct rfcomm_dev *dev = tty->driver_data;
 
-	if (dev->tty_dev->parent)
-		device_move(dev->tty_dev, NULL, DPM_ORDER_DEV_LAST);
-
-	/* Close DLC and dettach TTY */
-	rfcomm_dlc_close(dev->dlc, 0);
-
 	clear_bit(RFCOMM_TTY_ATTACHED, &dev->flags);
 
 	rfcomm_dlc_lock(dev->dlc);
 	tty->driver_data = NULL;
-	dev->port.tty = NULL;
 	rfcomm_dlc_unlock(dev->dlc);
 
 	tty_port_put(&dev->port);
@@ -662,7 +687,6 @@ static void rfcomm_tty_cleanup(struct tty_struct *tty)
  */
 static int rfcomm_tty_install(struct tty_driver *driver, struct tty_struct *tty)
 {
-	DECLARE_WAITQUEUE(wait, current);
 	struct rfcomm_dev *dev;
 	struct rfcomm_dlc *dlc;
 	int err;
@@ -676,72 +700,30 @@ static int rfcomm_tty_install(struct tty_driver *driver, struct tty_struct *tty)
 	/* Attach TTY and open DLC */
 	rfcomm_dlc_lock(dlc);
 	tty->driver_data = dev;
-	dev->port.tty = tty;
 	rfcomm_dlc_unlock(dlc);
 	set_bit(RFCOMM_TTY_ATTACHED, &dev->flags);
 
 	/* install the tty_port */
 	err = tty_port_install(&dev->port, driver, tty);
-	if (err < 0)
-		goto error_no_dlc;
-
-	err = rfcomm_dlc_open(dlc, &dev->src, &dev->dst, dev->channel);
-	if (err < 0)
-		goto error_no_dlc;
+	if (err)
+		rfcomm_tty_cleanup(tty);
 
-	/* Wait for DLC to connect */
-	add_wait_queue(&dev->wait, &wait);
-	while (1) {
-		set_current_state(TASK_INTERRUPTIBLE);
-
-		if (dlc->state == BT_CLOSED) {
-			err = -dev->err;
-			break;
-		}
-
-		if (dlc->state == BT_CONNECTED)
-			break;
-
-		if (signal_pending(current)) {
-			err = -EINTR;
-			break;
-		}
-
-		tty_unlock(tty);
-		schedule();
-		tty_lock(tty);
-	}
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&dev->wait, &wait);
-
-	if (err < 0)
-		goto error_no_connection;
-
-	device_move(dev->tty_dev, rfcomm_get_device(dev),
-		    DPM_ORDER_DEV_AFTER_PARENT);
-	return 0;
-
-error_no_connection:
-	rfcomm_dlc_close(dlc, err);
-error_no_dlc:
-	clear_bit(RFCOMM_TTY_ATTACHED, &dev->flags);
-	tty_port_put(&dev->port);
 	return err;
 }
 
 static int rfcomm_tty_open(struct tty_struct *tty, struct file *filp)
 {
 	struct rfcomm_dev *dev = tty->driver_data;
-	unsigned long flags;
+	int err;
 
 	BT_DBG("tty %p id %d", tty, tty->index);
 
 	BT_DBG("dev %p dst %pMR channel %d opened %d", dev, &dev->dst,
 	       dev->channel, dev->port.count);
 
-	spin_lock_irqsave(&dev->port.lock, flags);
-	dev->port.count++;
-	spin_unlock_irqrestore(&dev->port.lock, flags);
+	err = tty_port_open(&dev->port, tty, filp);
+	if (err)
+		return err;
 
 	/*
 	 * FIXME: rfcomm should use proper flow control for
@@ -758,7 +740,6 @@ static int rfcomm_tty_open(struct tty_struct *tty, struct file *filp)
 static void rfcomm_tty_close(struct tty_struct *tty, struct file *filp)
 {
 	struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
-	unsigned long flags;
 
 	if (!dev)
 		return;
@@ -766,14 +747,10 @@ static void rfcomm_tty_close(struct tty_struct *tty, struct file *filp)
 	BT_DBG("tty %p dev %p dlc %p opened %d", tty, dev, dev->dlc,
 						dev->port.count);
 
-	spin_lock_irqsave(&dev->port.lock, flags);
-	if (!--dev->port.count) {
-		spin_unlock_irqrestore(&dev->port.lock, flags);
+	tty_port_close(&dev->port, tty, filp);
 
-		if (test_bit(RFCOMM_TTY_RELEASED, &dev->flags))
-			tty_port_put(&dev->port);
-	} else
-		spin_unlock_irqrestore(&dev->port.lock, flags);
+	if (test_bit(RFCOMM_TTY_RELEASED, &dev->flags))
+		tty_port_put(&dev->port);
 }
 
 static int rfcomm_tty_write(struct tty_struct *tty, const unsigned char *buf, int count)
@@ -1076,7 +1053,7 @@ static void rfcomm_tty_hangup(struct tty_struct *tty)
 	if (!dev)
 		return;
 
-	rfcomm_tty_flush_buffer(tty);
+	tty_port_hangup(&dev->port);
 
 	if (test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags)) {
 		if (rfcomm_dev_get(dev->id) == NULL)
@@ -1166,7 +1143,7 @@ int __init rfcomm_init_ttys(void)
 	rfcomm_tty_driver->subtype	= SERIAL_TYPE_NORMAL;
 	rfcomm_tty_driver->flags	= TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV;
 	rfcomm_tty_driver->init_termios	= tty_std_termios;
-	rfcomm_tty_driver->init_termios.c_cflag	= B9600 | CS8 | CREAD | HUPCL | CLOCAL;
+	rfcomm_tty_driver->init_termios.c_cflag	= B9600 | CS8 | CREAD | HUPCL;
 	rfcomm_tty_driver->init_termios.c_lflag &= ~ICANON;
 	tty_set_operations(rfcomm_tty_driver, &rfcomm_ops);
 
-- 
cgit v1.2.3


From ece3150dea382c7c961fe2604332ed3474960d25 Mon Sep 17 00:00:00 2001
From: Gianluca Anzolin <gianluca@sottospazio.it>
Date: Mon, 29 Jul 2013 17:08:12 +0200
Subject: Bluetooth: Fix the reference counting of tty_port

The tty_port can be released in two cases: when we get a HUP in the
functions rfcomm_tty_hangup() and rfcomm_dev_state_change(). Or when the
user releases the device in rfcomm_release_dev().

In these cases we set the flag RFCOMM_TTY_RELEASED so that no other
function can get a reference to the tty_port.
The use of !test_and_set_bit(RFCOMM_TTY_RELEASED) ensures that the
'initial' tty_port reference is only dropped once.

The rfcomm_dev_del function is removed becase it isn't used anymore.

Signed-off-by: Gianluca Anzolin <gianluca@sottospazio.it>
Reviewed-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
---
 net/bluetooth/rfcomm/tty.c | 46 ++++++++++++----------------------------------
 1 file changed, 12 insertions(+), 34 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 583f7135c811..3e078b73cc22 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -324,23 +324,6 @@ free:
 	return err;
 }
 
-static void rfcomm_dev_del(struct rfcomm_dev *dev)
-{
-	unsigned long flags;
-	BT_DBG("dev %p", dev);
-
-	BUG_ON(test_and_set_bit(RFCOMM_TTY_RELEASED, &dev->flags));
-
-	spin_lock_irqsave(&dev->port.lock, flags);
-	if (dev->port.count > 0) {
-		spin_unlock_irqrestore(&dev->port.lock, flags);
-		return;
-	}
-	spin_unlock_irqrestore(&dev->port.lock, flags);
-
-	tty_port_put(&dev->port);
-}
-
 /* ---- Send buffer ---- */
 static inline unsigned int rfcomm_room(struct rfcomm_dlc *dlc)
 {
@@ -454,8 +437,9 @@ static int rfcomm_release_dev(void __user *arg)
 		tty_kref_put(tty);
 	}
 
-	if (!test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags))
-		rfcomm_dev_del(dev);
+	if (!test_and_set_bit(RFCOMM_TTY_RELEASED, &dev->flags))
+		tty_port_put(&dev->port);
+
 	tty_port_put(&dev->port);
 	return 0;
 }
@@ -607,6 +591,9 @@ static void rfcomm_dev_state_change(struct rfcomm_dlc *dlc, int err)
 				 *    rfcomm_dev_lock -> dlc lock
 				 * 2. tty_port_put will deadlock if it's
 				 *    the last reference
+				 *
+				 * FIXME: when we release the lock anything
+				 * could happen to dev, even its destruction
 				 */
 				rfcomm_dlc_unlock(dlc);
 				if (rfcomm_dev_get(dev->id) == NULL) {
@@ -614,7 +601,10 @@ static void rfcomm_dev_state_change(struct rfcomm_dlc *dlc, int err)
 					return;
 				}
 
-				rfcomm_dev_del(dev);
+				if (!test_and_set_bit(RFCOMM_TTY_RELEASED,
+						      &dev->flags))
+					tty_port_put(&dev->port);
+
 				tty_port_put(&dev->port);
 				rfcomm_dlc_lock(dlc);
 			}
@@ -741,16 +731,10 @@ static void rfcomm_tty_close(struct tty_struct *tty, struct file *filp)
 {
 	struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
 
-	if (!dev)
-		return;
-
 	BT_DBG("tty %p dev %p dlc %p opened %d", tty, dev, dev->dlc,
 						dev->port.count);
 
 	tty_port_close(&dev->port, tty, filp);
-
-	if (test_bit(RFCOMM_TTY_RELEASED, &dev->flags))
-		tty_port_put(&dev->port);
 }
 
 static int rfcomm_tty_write(struct tty_struct *tty, const unsigned char *buf, int count)
@@ -1050,17 +1034,11 @@ static void rfcomm_tty_hangup(struct tty_struct *tty)
 
 	BT_DBG("tty %p dev %p", tty, dev);
 
-	if (!dev)
-		return;
-
 	tty_port_hangup(&dev->port);
 
-	if (test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags)) {
-		if (rfcomm_dev_get(dev->id) == NULL)
-			return;
-		rfcomm_dev_del(dev);
+	if (test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags) &&
+	    !test_and_set_bit(RFCOMM_TTY_RELEASED, &dev->flags))
 		tty_port_put(&dev->port);
-	}
 }
 
 static int rfcomm_tty_tiocmget(struct tty_struct *tty)
-- 
cgit v1.2.3


From ffe6b68cc5999a3f91a15b6667e69e14186e337d Mon Sep 17 00:00:00 2001
From: Gianluca Anzolin <gianluca@sottospazio.it>
Date: Mon, 29 Jul 2013 17:08:13 +0200
Subject: Bluetooth: Purge the dlc->tx_queue to avoid circular dependency

In rfcomm_tty_cleanup we purge the dlc->tx_queue which may contain
socket buffers referencing the tty_port and thus preventing the tty_port
destruction.

Signed-off-by: Gianluca Anzolin <gianluca@sottospazio.it>
Reviewed-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
---
 net/bluetooth/rfcomm/tty.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 3e078b73cc22..6d126faf145f 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -668,6 +668,12 @@ static void rfcomm_tty_cleanup(struct tty_struct *tty)
 	tty->driver_data = NULL;
 	rfcomm_dlc_unlock(dev->dlc);
 
+	/*
+	 * purge the dlc->tx_queue to avoid circular dependencies
+	 * between dev and dlc
+	 */
+	skb_queue_purge(&dev->dlc->tx_queue);
+
 	tty_port_put(&dev->port);
 }
 
-- 
cgit v1.2.3


From e660ed6c70370dae0887e2b5224d8b6c1e138120 Mon Sep 17 00:00:00 2001
From: Frédéric Dalleau <frederic.dalleau@linux.intel.com>
Date: Mon, 19 Aug 2013 14:23:54 +0200
Subject: Bluetooth: Use hci_connect_sco directly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

hci_connect is a super function for connecting hci protocols. But the
voice_setting parameter (introduced in subsequent patches) is only
needed by SCO and security requirements are not needed for SCO channels.
Thus, it makes sense to have a separate function for SCO.

Signed-off-by: Frédéric Dalleau <frederic.dalleau@linux.intel.com>
Acked-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Johan Hedberg <johan.hedberg@intel.com>
Signed-off-by: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
---
 include/net/bluetooth/hci_core.h | 2 ++
 net/bluetooth/hci_conn.c         | 8 ++------
 net/bluetooth/sco.c              | 3 +--
 3 files changed, 5 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index f77885ea78c2..307a19269020 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -584,6 +584,8 @@ struct hci_chan *hci_chan_lookup_handle(struct hci_dev *hdev, __u16 handle);
 
 struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst,
 			     __u8 dst_type, __u8 sec_level, __u8 auth_type);
+struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type,
+				 bdaddr_t *dst);
 int hci_conn_check_link_mode(struct hci_conn *conn);
 int hci_conn_check_secure(struct hci_conn *conn, __u8 sec_level);
 int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type);
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 6c7f36379722..5f1f448dd0f2 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -560,13 +560,12 @@ static struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst,
 	return acl;
 }
 
-static struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type,
-				bdaddr_t *dst, u8 sec_level, u8 auth_type)
+struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst)
 {
 	struct hci_conn *acl;
 	struct hci_conn *sco;
 
-	acl = hci_connect_acl(hdev, dst, sec_level, auth_type);
+	acl = hci_connect_acl(hdev, dst, BT_SECURITY_LOW, HCI_AT_NO_BONDING);
 	if (IS_ERR(acl))
 		return acl;
 
@@ -612,9 +611,6 @@ struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst,
 		return hci_connect_le(hdev, dst, dst_type, sec_level, auth_type);
 	case ACL_LINK:
 		return hci_connect_acl(hdev, dst, sec_level, auth_type);
-	case SCO_LINK:
-	case ESCO_LINK:
-		return hci_connect_sco(hdev, type, dst, sec_level, auth_type);
 	}
 
 	return ERR_PTR(-EINVAL);
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 2de7150a6304..ab2502cbecfb 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -176,8 +176,7 @@ static int sco_connect(struct sock *sk)
 	else
 		type = SCO_LINK;
 
-	hcon = hci_connect(hdev, type, dst, BDADDR_BREDR, BT_SECURITY_LOW,
-			   HCI_AT_NO_BONDING);
+	hcon = hci_connect_sco(hdev, type, dst);
 	if (IS_ERR(hcon)) {
 		err = PTR_ERR(hcon);
 		goto done;
-- 
cgit v1.2.3


From 33f2404823f000f9b5fc570b0a08b1008e241578 Mon Sep 17 00:00:00 2001
From: Frédéric Dalleau <frederic.dalleau@linux.intel.com>
Date: Mon, 19 Aug 2013 14:23:55 +0200
Subject: Bluetooth: Remove unused mask parameter in sco_conn_defer_accept
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

From Bluetooth Core v4.0 specification, 7.1.8 Accept Connection Request
Command "When accepting synchronous connection request, the Role
parameter is not used and will be ignored by the BR/EDR Controller."

Signed-off-by: Frédéric Dalleau <frederic.dalleau@linux.intel.com>
Acked-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Johan Hedberg <johan.hedberg@intel.com>
Signed-off-by: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
---
 net/bluetooth/sco.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index ab2502cbecfb..acdca68806db 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -651,7 +651,7 @@ static int sco_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
 	return err;
 }
 
-static void sco_conn_defer_accept(struct hci_conn *conn, int mask)
+static void sco_conn_defer_accept(struct hci_conn *conn)
 {
 	struct hci_dev *hdev = conn->hdev;
 
@@ -663,11 +663,7 @@ static void sco_conn_defer_accept(struct hci_conn *conn, int mask)
 		struct hci_cp_accept_conn_req cp;
 
 		bacpy(&cp.bdaddr, &conn->dst);
-
-		if (lmp_rswitch_capable(hdev) && (mask & HCI_LM_MASTER))
-			cp.role = 0x00; /* Become master */
-		else
-			cp.role = 0x01; /* Remain slave */
+		cp.role = 0x00; /* Ignored */
 
 		hci_send_cmd(hdev, HCI_OP_ACCEPT_CONN_REQ, sizeof(cp), &cp);
 	} else {
@@ -697,7 +693,7 @@ static int sco_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 
 	if (sk->sk_state == BT_CONNECT2 &&
 	    test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) {
-		sco_conn_defer_accept(pi->conn->hcon, 0);
+		sco_conn_defer_accept(pi->conn->hcon);
 		sk->sk_state = BT_CONFIG;
 		msg->msg_namelen = 0;
 
-- 
cgit v1.2.3


From ad10b1a48754b1381582d96f070a39832e41382d Mon Sep 17 00:00:00 2001
From: Frédéric Dalleau <frederic.dalleau@linux.intel.com>
Date: Mon, 19 Aug 2013 14:23:56 +0200
Subject: Bluetooth: Add Bluetooth socket voice option
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch extends the current Bluetooth socket options with BT_VOICE.
This is intended to choose voice data type at runtime. It only applies
to SCO sockets. Incoming connections shall be setup during deferred
setup. Outgoing connections shall be setup before connect(). The desired
setting is stored in the SCO socket info. This patch declares needed
members, modifies getsockopt() and setsockopt().

Signed-off-by: Frédéric Dalleau <frederic.dalleau@linux.intel.com>
Acked-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Johan Hedberg <johan.hedberg@intel.com>
Signed-off-by: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
---
 include/net/bluetooth/bluetooth.h |  8 ++++++++
 include/net/bluetooth/sco.h       |  1 +
 net/bluetooth/sco.c               | 40 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 48 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index 10eb9b389014..10d43d8c7037 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -107,6 +107,14 @@ struct bt_power {
  */
 #define BT_CHANNEL_POLICY_AMP_PREFERRED		2
 
+#define BT_VOICE		11
+struct bt_voice {
+	__u16 setting;
+};
+
+#define BT_VOICE_TRANSPARENT			0x0003
+#define BT_VOICE_CVSD_16BIT			0x0060
+
 __printf(1, 2)
 int bt_info(const char *fmt, ...);
 __printf(1, 2)
diff --git a/include/net/bluetooth/sco.h b/include/net/bluetooth/sco.h
index 1e35c43657c8..e252a31ee6b6 100644
--- a/include/net/bluetooth/sco.h
+++ b/include/net/bluetooth/sco.h
@@ -73,6 +73,7 @@ struct sco_conn {
 struct sco_pinfo {
 	struct bt_sock	bt;
 	__u32		flags;
+	__u16		setting;
 	struct sco_conn	*conn;
 };
 
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index acdca68806db..678747e2e389 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -416,6 +416,8 @@ static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, int pro
 	sk->sk_protocol = proto;
 	sk->sk_state    = BT_OPEN;
 
+	sco_pi(sk)->setting = BT_VOICE_CVSD_16BIT;
+
 	setup_timer(&sk->sk_timer, sco_sock_timeout, (unsigned long)sk);
 
 	bt_sock_link(&sco_sk_list, sk);
@@ -709,7 +711,8 @@ static int sco_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 static int sco_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
 {
 	struct sock *sk = sock->sk;
-	int err = 0;
+	int len, err = 0;
+	struct bt_voice voice;
 	u32 opt;
 
 	BT_DBG("sk %p", sk);
@@ -735,6 +738,31 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, char
 			clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags);
 		break;
 
+	case BT_VOICE:
+		if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND &&
+		    sk->sk_state != BT_CONNECT2) {
+			err = -EINVAL;
+			break;
+		}
+
+		voice.setting = sco_pi(sk)->setting;
+
+		len = min_t(unsigned int, sizeof(voice), optlen);
+		if (copy_from_user((char *) &voice, optval, len)) {
+			err = -EFAULT;
+			break;
+		}
+
+		/* Explicitly check for these values */
+		if (voice.setting != BT_VOICE_TRANSPARENT &&
+		    voice.setting != BT_VOICE_CVSD_16BIT) {
+			err = -EINVAL;
+			break;
+		}
+
+		sco_pi(sk)->setting = voice.setting;
+		break;
+
 	default:
 		err = -ENOPROTOOPT;
 		break;
@@ -808,6 +836,7 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char
 {
 	struct sock *sk = sock->sk;
 	int len, err = 0;
+	struct bt_voice voice;
 
 	BT_DBG("sk %p", sk);
 
@@ -833,6 +862,15 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char
 
 		break;
 
+	case BT_VOICE:
+		voice.setting = sco_pi(sk)->setting;
+
+		len = min_t(unsigned int, len, sizeof(voice));
+		if (copy_to_user(optval, (char *)&voice, len))
+			err = -EFAULT;
+
+		break;
+
 	default:
 		err = -ENOPROTOOPT;
 		break;
-- 
cgit v1.2.3


From 2f69a82acf6f971a9e184dd32b24c79a14388a6c Mon Sep 17 00:00:00 2001
From: Frédéric Dalleau <frederic.dalleau@linux.intel.com>
Date: Mon, 19 Aug 2013 14:23:58 +0200
Subject: Bluetooth: Use voice setting in deferred SCO connection request
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When an incoming eSCO connection is requested, check the selected voice
setting and reply appropriately. Voice setting should have been
negotiated previously.  For example, in case of HFP, the codec is
negotiated using AT commands on the RFCOMM channel. This patch only
changes replies for socket with deferred setup enabled.

Signed-off-by: Frédéric Dalleau <frederic.dalleau@linux.intel.com>
Signed-off-by: Johan Hedberg <johan.hedberg@intel.com>
Acked-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
---
 net/bluetooth/sco.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 678747e2e389..b1016c82c908 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -653,7 +653,7 @@ static int sco_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
 	return err;
 }
 
-static void sco_conn_defer_accept(struct hci_conn *conn)
+static void sco_conn_defer_accept(struct hci_conn *conn, u16 setting)
 {
 	struct hci_dev *hdev = conn->hdev;
 
@@ -676,9 +676,21 @@ static void sco_conn_defer_accept(struct hci_conn *conn)
 
 		cp.tx_bandwidth   = __constant_cpu_to_le32(0x00001f40);
 		cp.rx_bandwidth   = __constant_cpu_to_le32(0x00001f40);
-		cp.max_latency    = __constant_cpu_to_le16(0xffff);
-		cp.content_format = cpu_to_le16(hdev->voice_setting);
-		cp.retrans_effort = 0xff;
+		cp.content_format = cpu_to_le16(setting);
+
+		switch (setting & SCO_AIRMODE_MASK) {
+		case SCO_AIRMODE_TRANSP:
+			if (conn->pkt_type & ESCO_2EV3)
+				cp.max_latency = __constant_cpu_to_le16(0x0008);
+			else
+				cp.max_latency = __constant_cpu_to_le16(0x000D);
+			cp.retrans_effort = 0x02;
+			break;
+		case SCO_AIRMODE_CVSD:
+			cp.max_latency = __constant_cpu_to_le16(0xffff);
+			cp.retrans_effort = 0xff;
+			break;
+		}
 
 		hci_send_cmd(hdev, HCI_OP_ACCEPT_SYNC_CONN_REQ,
 			     sizeof(cp), &cp);
@@ -695,7 +707,7 @@ static int sco_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 
 	if (sk->sk_state == BT_CONNECT2 &&
 	    test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) {
-		sco_conn_defer_accept(pi->conn->hcon);
+		sco_conn_defer_accept(pi->conn->hcon, pi->setting);
 		sk->sk_state = BT_CONFIG;
 		msg->msg_namelen = 0;
 
-- 
cgit v1.2.3


From 10c62ddc6f032c3a096401ca3ce7e5b2d5780859 Mon Sep 17 00:00:00 2001
From: Frédéric Dalleau <frederic.dalleau@linux.intel.com>
Date: Mon, 19 Aug 2013 14:23:59 +0200
Subject: Bluetooth: Parameters for outgoing SCO connections
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In order to establish a transparent SCO connection, the correct settings
must be specified in the Setup Synchronous Connection request. For that,
a setting field is added to ACL connection data to set up the desired
parameters. The patch also removes usage of hdev->voice_setting in CVSD
connection and makes use of T2 parameters for transparent data.

Signed-off-by: Frédéric Dalleau <frederic.dalleau@linux.intel.com>
Signed-off-by: Johan Hedberg <johan.hedberg@intel.com>
Acked-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
---
 include/net/bluetooth/hci_core.h |  5 +++--
 net/bluetooth/hci_conn.c         | 24 +++++++++++++++++++-----
 net/bluetooth/sco.c              |  2 +-
 3 files changed, 23 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index f403509b4d12..61ca2ce5b063 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -320,6 +320,7 @@ struct hci_conn {
 	__u32		passkey_notify;
 	__u8		passkey_entered;
 	__u16		disc_timeout;
+	__u16		setting;
 	unsigned long	flags;
 
 	__u8		remote_cap;
@@ -584,8 +585,8 @@ struct hci_chan *hci_chan_lookup_handle(struct hci_dev *hdev, __u16 handle);
 
 struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst,
 			     __u8 dst_type, __u8 sec_level, __u8 auth_type);
-struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type,
-				 bdaddr_t *dst);
+struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst,
+				 __u16 setting);
 int hci_conn_check_link_mode(struct hci_conn *conn);
 int hci_conn_check_secure(struct hci_conn *conn, __u8 sec_level);
 int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type);
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 5f1f448dd0f2..c0e56a512ff3 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -185,13 +185,24 @@ void hci_setup_sync(struct hci_conn *conn, __u16 handle)
 	conn->attempt++;
 
 	cp.handle   = cpu_to_le16(handle);
-	cp.pkt_type = cpu_to_le16(conn->pkt_type);
 
 	cp.tx_bandwidth   = __constant_cpu_to_le32(0x00001f40);
 	cp.rx_bandwidth   = __constant_cpu_to_le32(0x00001f40);
-	cp.max_latency    = __constant_cpu_to_le16(0xffff);
-	cp.voice_setting  = cpu_to_le16(hdev->voice_setting);
-	cp.retrans_effort = 0xff;
+	cp.voice_setting  = cpu_to_le16(conn->setting);
+
+	switch (conn->setting & SCO_AIRMODE_MASK) {
+	case SCO_AIRMODE_TRANSP:
+		cp.pkt_type = __constant_cpu_to_le16(EDR_ESCO_MASK &
+						     ~ESCO_2EV3);
+		cp.max_latency = __constant_cpu_to_le16(0x000d);
+		cp.retrans_effort = 0x02;
+		break;
+	case SCO_AIRMODE_CVSD:
+		cp.pkt_type = cpu_to_le16(conn->pkt_type);
+		cp.max_latency = __constant_cpu_to_le16(0xffff);
+		cp.retrans_effort = 0xff;
+		break;
+	}
 
 	hci_send_cmd(hdev, HCI_OP_SETUP_SYNC_CONN, sizeof(cp), &cp);
 }
@@ -560,7 +571,8 @@ static struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst,
 	return acl;
 }
 
-struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst)
+struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst,
+				 __u16 setting)
 {
 	struct hci_conn *acl;
 	struct hci_conn *sco;
@@ -583,6 +595,8 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst)
 
 	hci_conn_hold(sco);
 
+	sco->setting = setting;
+
 	if (acl->state == BT_CONNECTED &&
 	    (sco->state == BT_OPEN || sco->state == BT_CLOSED)) {
 		set_bit(HCI_CONN_POWER_SAVE, &acl->flags);
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index b1016c82c908..ed581b41e034 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -176,7 +176,7 @@ static int sco_connect(struct sock *sk)
 	else
 		type = SCO_LINK;
 
-	hcon = hci_connect_sco(hdev, type, dst);
+	hcon = hci_connect_sco(hdev, type, dst, sco_pi(sk)->setting);
 	if (IS_ERR(hcon)) {
 		err = PTR_ERR(hcon);
 		goto done;
-- 
cgit v1.2.3


From 79dc0087c33f06a8c35d8c9e37ea6307b790bc4c Mon Sep 17 00:00:00 2001
From: Frédéric Dalleau <frederic.dalleau@linux.intel.com>
Date: Mon, 19 Aug 2013 14:24:01 +0200
Subject: Bluetooth: Prevent transparent SCO on older devices
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Older Bluetooth devices may not support Setup Synchronous Connection or
SCO transparent data. This is indicated by the corresponding LMP feature
bits. It is not possible to know if the adapter support these features
before setting BT_VOICE option since the socket is not bound to an
adapter. An adapter can also be added after the socket is created. The
socket can be bound to an address before adapter is plugged in.

Thus, on a such adapters, if user request BT_VOICE_TRANSPARENT, outgoing
connections fail on connect() and returns -EOPNOTSUPP. Incoming
connections do not fail. However, they should only be allowed depending
on what was specified in Write_Voice_Settings command.

EOPNOTSUPP is choosen because connect() system call is failing after
selecting route but before any connection attempt.

Signed-off-by: Frédéric Dalleau <frederic.dalleau@linux.intel.com>
Signed-off-by: Johan Hedberg <johan.hedberg@intel.com>
Acked-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
---
 net/bluetooth/sco.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index ed581b41e034..96bd388d93a4 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -176,6 +176,12 @@ static int sco_connect(struct sock *sk)
 	else
 		type = SCO_LINK;
 
+	if (sco_pi(sk)->setting == BT_VOICE_TRANSPARENT &&
+	    (!lmp_transp_capable(hdev) || !lmp_esco_capable(hdev))) {
+		err = -EOPNOTSUPP;
+		goto done;
+	}
+
 	hcon = hci_connect_sco(hdev, type, dst, sco_pi(sk)->setting);
 	if (IS_ERR(hcon)) {
 		err = PTR_ERR(hcon);
-- 
cgit v1.2.3


From 1a4c958cf9b1e159bc63d63b9e362904dd2c4ac3 Mon Sep 17 00:00:00 2001
From: Frédéric Dalleau <frederic.dalleau@linux.intel.com>
Date: Mon, 19 Aug 2013 14:24:02 +0200
Subject: Bluetooth: Handle specific error for SCO connection fallback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Synchronous Connection Complete event can return error "Connection
Rejected due to Limited resources (0x10)".
Handling this error is required for SCO connection fallback. This error
happens when the server tried to accept the connection but failed to
negotiate settings.
This error code has been verified experimentally by sending a T2 request
to a T1 only SCO listener.

Client dump follows :

< HCI Command (0x01|0x0028) plen 17 [hci0] 3.696064
        Handle: 12
        Transmit bandwidth: 8000
        Receive bandwidth: 8000
        Max latency: 13
        Setting: 0x0003
        Retransmission effort: Optimize for link quality (0x02)
        Packet type: 0x0380
> HCI Event (0x0f) plen 4 [hci0] 3.697034
      Setup Synchronous Connection (0x01|0x0028) ncmd 1
        Status: Success (0x00)
> HCI Event (0x2c) plen 17 [hci0] 3.736059
        Status: Connection Rejected due to Limited Resources (0x0d)
        Handle: 0
        Address: xx:xx:xx:xx:xx:AB (OUI 70-F3-95)
        Link type: eSCO (0x02)
        Transmission interval: 0x0c
        Retransmission window: 0x06
        RX packet length: 60
        TX packet length: 60
        Air mode: Transparent (0x03)

Server dump follows :

> HCI Event (0x04) plen 10 [hci0] 4.741513
        Address: xx:xx:xx:xx:xx:D9 (OUI 20-68-9D)
        Class: 0x620100
          Major class: Computer (desktop, notebook, PDA, organizers)
          Minor class: Uncategorized, code for device not assigned
          Networking (LAN, Ad hoc)
          Audio (Speaker, Microphone, Headset)
          Telephony (Cordless telephony, Modem, Headset)
        Link type: eSCO (0x02)
< HCI Command (0x01|0x0029) plen 21 [hci0] 4.743269
        Address: xx:xx:xx:xx:xx:D9 (OUI 20-68-9D)
        Transmit bandwidth: 8000
        Receive bandwidth: 8000
        Max latency: 13
        Setting: 0x0003
        Retransmission effort: Optimize for link quality (0x02)
        Packet type: 0x03c1
> HCI Event (0x0f) plen 4 [hci0] 4.745517
      Accept Synchronous Connection (0x01|0x0029) ncmd 1
        Status: Success (0x00)
> HCI Event (0x2c) plen 17 [hci0] 4.749508
        Status: Connection Rejected due to Limited Resources (0x0d)
        Handle: 0
        Address: xx:xx:xx:xx:xx:D9 (OUI 20-68-9D)
        Link type: eSCO (0x02)
        Transmission interval: 0x0c
        Retransmission window: 0x06
        RX packet length: 60
        TX packet length: 60
        Air mode: Transparent (0x03)

Signed-off-by: Frédéric Dalleau <frederic.dalleau@linux.intel.com>
Acked-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
---
 net/bluetooth/hci_event.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 50e39f4ad429..a8bb7bb3f483 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -2904,6 +2904,7 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev,
 		hci_conn_add_sysfs(conn);
 		break;
 
+	case 0x0d:	/* Connection Rejected due to Limited Resources */
 	case 0x11:	/* Unsupported Feature or Parameter Value */
 	case 0x1c:	/* SCO interval rejected */
 	case 0x1a:	/* Unsupported Remote Feature */
-- 
cgit v1.2.3


From 2dea632f9acad076370fe871d4ccc93868621403 Mon Sep 17 00:00:00 2001
From: Frédéric Dalleau <frederic.dalleau@linux.intel.com>
Date: Mon, 19 Aug 2013 14:24:03 +0200
Subject: Bluetooth: Add SCO connection fallback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When initiating a transparent eSCO connection, make use of T2 settings
at first try. T2 is the recommended settings from HFP 1.6 WideBand
Speech. Upon connection failure, try T1 settings.

When CVSD is requested and eSCO is supported, try to establish eSCO
connection using S3 settings. If it fails, fallback in sequence to S2,
S1, D1, D0 settings.

To know which setting should be used, conn->attempt is used. It
indicates the currently ongoing SCO connection attempt and can be used
as the index for the fallback settings table.

These setting and the fallback order are described in Bluetooth HFP 1.6
specification p. 101.

Signed-off-by: Frédéric Dalleau <frederic.dalleau@linux.intel.com>
Signed-off-by: Johan Hedberg <johan.hedberg@intel.com>
Signed-off-by: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
---
 include/net/bluetooth/hci_core.h |  2 +-
 net/bluetooth/hci_conn.c         | 44 ++++++++++++++++++++++++++++++++--------
 net/bluetooth/hci_event.c        |  6 +++---
 3 files changed, 40 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index b2bfab801840..3ede820d328f 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -570,7 +570,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_state(struct hci_dev *hdev,
 }
 
 void hci_disconnect(struct hci_conn *conn, __u8 reason);
-void hci_setup_sync(struct hci_conn *conn, __u16 handle);
+bool hci_setup_sync(struct hci_conn *conn, __u16 handle);
 void hci_sco_setup(struct hci_conn *conn, __u8 status);
 
 struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst);
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index c0e56a512ff3..f0817121ec5e 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -31,6 +31,24 @@
 #include <net/bluetooth/a2mp.h>
 #include <net/bluetooth/smp.h>
 
+struct sco_param {
+	u16 pkt_type;
+	u16 max_latency;
+};
+
+static const struct sco_param sco_param_cvsd[] = {
+	{ EDR_ESCO_MASK & ~ESCO_2EV3, 0x000a }, /* S3 */
+	{ EDR_ESCO_MASK & ~ESCO_2EV3, 0x0007 }, /* S2 */
+	{ EDR_ESCO_MASK | ESCO_EV3,   0x0007 }, /* S1 */
+	{ EDR_ESCO_MASK | ESCO_HV3,   0xffff }, /* D1 */
+	{ EDR_ESCO_MASK | ESCO_HV1,   0xffff }, /* D0 */
+};
+
+static const struct sco_param sco_param_wideband[] = {
+	{ EDR_ESCO_MASK & ~ESCO_2EV3, 0x000d }, /* T2 */
+	{ EDR_ESCO_MASK | ESCO_EV3,   0x0008 }, /* T1 */
+};
+
 static void hci_le_create_connection(struct hci_conn *conn)
 {
 	struct hci_dev *hdev = conn->hdev;
@@ -172,10 +190,11 @@ static void hci_add_sco(struct hci_conn *conn, __u16 handle)
 	hci_send_cmd(hdev, HCI_OP_ADD_SCO, sizeof(cp), &cp);
 }
 
-void hci_setup_sync(struct hci_conn *conn, __u16 handle)
+bool hci_setup_sync(struct hci_conn *conn, __u16 handle)
 {
 	struct hci_dev *hdev = conn->hdev;
 	struct hci_cp_setup_sync_conn cp;
+	const struct sco_param *param;
 
 	BT_DBG("hcon %p", conn);
 
@@ -192,19 +211,28 @@ void hci_setup_sync(struct hci_conn *conn, __u16 handle)
 
 	switch (conn->setting & SCO_AIRMODE_MASK) {
 	case SCO_AIRMODE_TRANSP:
-		cp.pkt_type = __constant_cpu_to_le16(EDR_ESCO_MASK &
-						     ~ESCO_2EV3);
-		cp.max_latency = __constant_cpu_to_le16(0x000d);
+		if (conn->attempt > ARRAY_SIZE(sco_param_wideband))
+			return false;
 		cp.retrans_effort = 0x02;
+		param = &sco_param_wideband[conn->attempt - 1];
 		break;
 	case SCO_AIRMODE_CVSD:
-		cp.pkt_type = cpu_to_le16(conn->pkt_type);
-		cp.max_latency = __constant_cpu_to_le16(0xffff);
-		cp.retrans_effort = 0xff;
+		if (conn->attempt > ARRAY_SIZE(sco_param_cvsd))
+			return false;
+		cp.retrans_effort = 0x01;
+		param = &sco_param_cvsd[conn->attempt - 1];
 		break;
+	default:
+		return false;
 	}
 
-	hci_send_cmd(hdev, HCI_OP_SETUP_SYNC_CONN, sizeof(cp), &cp);
+	cp.pkt_type = __cpu_to_le16(param->pkt_type);
+	cp.max_latency = __cpu_to_le16(param->max_latency);
+
+	if (hci_send_cmd(hdev, HCI_OP_SETUP_SYNC_CONN, sizeof(cp), &cp) < 0)
+		return false;
+
+	return true;
 }
 
 void hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max,
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index a8bb7bb3f483..94aab73f89d4 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -2909,11 +2909,11 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev,
 	case 0x1c:	/* SCO interval rejected */
 	case 0x1a:	/* Unsupported Remote Feature */
 	case 0x1f:	/* Unspecified error */
-		if (conn->out && conn->attempt < 2) {
+		if (conn->out) {
 			conn->pkt_type = (hdev->esco_type & SCO_ESCO_MASK) |
 					(hdev->esco_type & EDR_ESCO_MASK);
-			hci_setup_sync(conn, conn->link->handle);
-			goto unlock;
+			if (hci_setup_sync(conn, conn->link->handle))
+				goto unlock;
 		}
 		/* fall through */
 
-- 
cgit v1.2.3


From e133fae263090f5795b8024a4024b81e06770132 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 22 Aug 2013 08:36:41 +0200
Subject: mac80211: minstrel_ht: don't use control.flags in TX status path

Sujith reports that my commit af61a165187bb94b1dc7628ef815c23d0eacf40b
("mac80211: add control port protocol TX control flag") broke ath9k
(aggregation). The reason is that I made minstrel_ht use the flag in
the TX status path, where it can have been overwritten by the driver.
Since we have no more space in info->flags, revert that part of the
change for now, until we can reshuffle the flags or so.

Reported-by: Sujith Manoharan <c_manoha@qca.qualcomm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rc80211_minstrel_ht.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index 9eff3824f2d1..c397ff54fe71 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -439,13 +439,12 @@ minstrel_aggr_check(struct ieee80211_sta *pubsta, struct sk_buff *skb)
 {
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
 	struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
-	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	u16 tid;
 
 	if (unlikely(!ieee80211_is_data_qos(hdr->frame_control)))
 		return;
 
-	if (unlikely(info->control.flags & IEEE80211_TX_CTRL_PORT_CTRL_PROTO))
+	if (unlikely(skb->protocol == cpu_to_be16(ETH_P_PAE)))
 		return;
 
 	tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;
-- 
cgit v1.2.3


From 0f7cc9a3c2bd89b15720dbf358e9b9e62af27126 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 21 Aug 2013 17:29:23 -0700
Subject: tcp: increase throughput when reordering is high

The stack currently detects reordering and avoid spurious
retransmission very well. However the throughput is sub-optimal under
high reordering because cwnd is increased only if the data is deliverd
in order. I.e., FLAG_DATA_ACKED check in tcp_ack().  The more packet
are reordered the worse the throughput is.

Therefore when reordering is proven high, cwnd should advance whenever
the data is delivered regardless of its ordering. If reordering is low,
conservatively advance cwnd only on ordered deliveries in Open state,
and retain cwnd in Disordered state (RFC5681).

Using netperf on a qdisc setup of 20Mbps BW and random RTT from 45ms
to 55ms (for reordering effect). This change increases TCP throughput
by 20 - 25% to near bottleneck BW.

A special case is the stretched ACK with new SACK and/or ECE mark.
For example, a receiver may receive an out of order or ECN packet with
unacked data buffered because of LRO or delayed ACK. The principle on
such an ACK is to advance cwnd on the cummulative acked part first,
then reduce cwnd in tcp_fastretrans_alert().

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 32 ++++++++++++++++++++------------
 1 file changed, 20 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e965cc7b87ff..ec492eae0cd7 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2485,8 +2485,6 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
 
 	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
 		tcp_try_keep_open(sk);
-		if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
-			tcp_moderate_cwnd(tp);
 	} else {
 		tcp_cwnd_reduction(sk, prior_unsacked, 0);
 	}
@@ -3128,11 +3126,24 @@ static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
 		inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
 }
 
+/* Decide wheather to run the increase function of congestion control. */
 static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
 {
-	const struct tcp_sock *tp = tcp_sk(sk);
-	return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
-		!tcp_in_cwnd_reduction(sk);
+	if (tcp_in_cwnd_reduction(sk))
+		return false;
+
+	/* If reordering is high then always grow cwnd whenever data is
+	 * delivered regardless of its ordering. Otherwise stay conservative
+	 * and only grow cwnd on in-order delivery in Open state, and retain
+	 * cwnd in Disordered state (RFC5681). A stretched ACK with
+	 * new SACK or ECE mark may first advance cwnd here and later reduce
+	 * cwnd in tcp_fastretrans_alert() based on more states.
+	 */
+	if (tcp_sk(sk)->reordering > sysctl_tcp_reordering)
+		return flag & FLAG_FORWARD_PROGRESS;
+
+	return inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
+	       flag & FLAG_DATA_ACKED;
 }
 
 /* Check that window update is acceptable.
@@ -3352,18 +3363,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt);
 	acked -= tp->packets_out;
 
+	/* Advance cwnd if state allows */
+	if (tcp_may_raise_cwnd(sk, flag))
+		tcp_cong_avoid(sk, ack, prior_in_flight);
+
 	if (tcp_ack_is_dubious(sk, flag)) {
-		/* Advance CWND, if state allows this. */
-		if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag))
-			tcp_cong_avoid(sk, ack, prior_in_flight);
 		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
 		tcp_fastretrans_alert(sk, acked, prior_unsacked,
 				      is_dupack, flag);
-	} else {
-		if (flag & FLAG_DATA_ACKED)
-			tcp_cong_avoid(sk, ack, prior_in_flight);
 	}
-
 	if (tp->tlp_high_seq)
 		tcp_process_tlp_ack(sk, ack, flag);
 
-- 
cgit v1.2.3


From b4c1c1d03842e6b24e87e48c07769e105df99985 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Wed, 21 Aug 2013 19:47:58 +0200
Subject: net: tcp_probe: also include rcv_wnd next to snd_wnd

It is helpful to sometimes know the TCP window sizes of an established
socket e.g. to confirm that window scaling is working or to tweak the
window size to improve high-latency connections, etc etc. Currently the
TCP snooper only exports the send window size, but not the receive window
size. Therefore, also add the receive window size to the end of the
output line.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_probe.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index d4943f67aff2..fae788b5b03e 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -60,6 +60,7 @@ struct tcp_log {
 	u32	snd_nxt;
 	u32	snd_una;
 	u32	snd_wnd;
+	u32	rcv_wnd;
 	u32	snd_cwnd;
 	u32	ssthresh;
 	u32	srtt;
@@ -116,6 +117,7 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 			p->snd_una = tp->snd_una;
 			p->snd_cwnd = tp->snd_cwnd;
 			p->snd_wnd = tp->snd_wnd;
+			p->rcv_wnd = tp->rcv_wnd;
 			p->ssthresh = tcp_current_ssthresh(sk);
 			p->srtt = tp->srtt >> 3;
 
@@ -157,13 +159,13 @@ static int tcpprobe_sprint(char *tbuf, int n)
 		= ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start));
 
 	return scnprintf(tbuf, n,
-			"%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n",
+			"%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u %u\n",
 			(unsigned long) tv.tv_sec,
 			(unsigned long) tv.tv_nsec,
 			&p->saddr, ntohs(p->sport),
 			&p->daddr, ntohs(p->dport),
 			p->length, p->snd_nxt, p->snd_una,
-			p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt);
+			p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd);
 }
 
 static ssize_t tcpprobe_read(struct file *file, char __user *buf,
-- 
cgit v1.2.3


From d8cdeda6ddbcb33debbb87590a7d42ff7f5b5cfd Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Wed, 21 Aug 2013 19:47:59 +0200
Subject: net: tcp_probe: kprobes: adapt jtcp_rcv_established signature

This patches fixes a rather unproblematic function signature mismatch
as the const specifier was missing for the th variable; and next to
that it adds a build-time assertion so that future function signature
mismatches for kprobes will not end badly, similarly as commit 22222997
("net: sctp: add build check for sctp_sf_eat_sack_6_2/jsctp_sf_eat_sack")
did it for SCTP.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_probe.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index fae788b5b03e..a2392f48ba35 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -92,7 +92,7 @@ static inline int tcp_probe_avail(void)
  * Note: arguments must match tcp_rcv_established()!
  */
 static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
-			       struct tcphdr *th, unsigned int len)
+				const struct tcphdr *th, unsigned int len)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_sock *inet = inet_sk(sk);
@@ -225,6 +225,13 @@ static __init int tcpprobe_init(void)
 {
 	int ret = -ENOMEM;
 
+	/* Warning: if the function signature of tcp_rcv_established,
+	 * has been changed, you also have to change the signature of
+	 * jtcp_rcv_established, otherwise you end up right here!
+	 */
+	BUILD_BUG_ON(__same_type(tcp_rcv_established,
+				 jtcp_rcv_established) == 0);
+
 	init_waitqueue_head(&tcp_probe.wait);
 	spin_lock_init(&tcp_probe.lock);
 
-- 
cgit v1.2.3


From f925d0a62db3f1b6e463ef956d0855006538d002 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Wed, 21 Aug 2013 19:48:00 +0200
Subject: net: tcp_probe: add IPv6 support

The tcp_probe currently only supports analysis of IPv4 connections.
Therefore, it would be nice to have IPv6 supported as well. Since we
have the recently added %pISpc specifier that is IPv4/IPv6 generic,
build related sockaddress structures from the flow information and
pass this to our format string. Tested with SSH and HTTP sessions
on IPv4 and IPv6.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_probe.c | 54 ++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 44 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index a2392f48ba35..301a3effe579 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -54,8 +54,11 @@ static const char procname[] = "tcpprobe";
 
 struct tcp_log {
 	ktime_t tstamp;
-	__be32	saddr, daddr;
-	__be16	sport, dport;
+	union {
+		struct sockaddr		raw;
+		struct sockaddr_in	v4;
+		struct sockaddr_in6	v6;
+	}	src, dst;
 	u16	length;
 	u32	snd_nxt;
 	u32	snd_una;
@@ -87,6 +90,30 @@ static inline int tcp_probe_avail(void)
 	return bufsize - tcp_probe_used() - 1;
 }
 
+#define tcp_probe_copy_fl_to_si4(inet, si4, mem)		\
+	do {							\
+		si4.sin_family = AF_INET;			\
+		si4.sin_port = inet->inet_##mem##port;		\
+		si4.sin_addr.s_addr = inet->inet_##mem##addr;	\
+	} while (0)						\
+
+#if IS_ENABLED(CONFIG_IPV6)
+#define tcp_probe_copy_fl_to_si6(inet, si6, mem)		\
+	do {							\
+		struct ipv6_pinfo *pi6 = inet->pinet6;		\
+		si6.sin6_family = AF_INET6;			\
+		si6.sin6_port = inet->inet_##mem##port;		\
+		si6.sin6_addr = pi6->mem##addr;			\
+		si6.sin6_flowinfo = 0; /* No need here. */	\
+		si6.sin6_scope_id = 0;	/* No need here. */	\
+	} while (0)
+#else
+#define tcp_probe_copy_fl_to_si6(fl, si6, mem)			\
+	do {							\
+		memset(&si6, 0, sizeof(si6));			\
+	} while (0)
+#endif
+
 /*
  * Hook inserted to be called before each receive packet.
  * Note: arguments must match tcp_rcv_established()!
@@ -108,10 +135,19 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 			struct tcp_log *p = tcp_probe.log + tcp_probe.head;
 
 			p->tstamp = ktime_get();
-			p->saddr = inet->inet_saddr;
-			p->sport = inet->inet_sport;
-			p->daddr = inet->inet_daddr;
-			p->dport = inet->inet_dport;
+			switch (sk->sk_family) {
+			case AF_INET:
+				tcp_probe_copy_fl_to_si4(inet, p->src.v4, s);
+				tcp_probe_copy_fl_to_si4(inet, p->dst.v4, d);
+				break;
+			case AF_INET6:
+				tcp_probe_copy_fl_to_si6(inet, p->src.v6, s);
+				tcp_probe_copy_fl_to_si6(inet, p->dst.v6, d);
+				break;
+			default:
+				BUG();
+			}
+
 			p->length = skb->len;
 			p->snd_nxt = tp->snd_nxt;
 			p->snd_una = tp->snd_una;
@@ -159,12 +195,10 @@ static int tcpprobe_sprint(char *tbuf, int n)
 		= ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start));
 
 	return scnprintf(tbuf, n,
-			"%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u %u\n",
+			"%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n",
 			(unsigned long) tv.tv_sec,
 			(unsigned long) tv.tv_nsec,
-			&p->saddr, ntohs(p->sport),
-			&p->daddr, ntohs(p->dport),
-			p->length, p->snd_nxt, p->snd_una,
+			&p->src, &p->dst, p->length, p->snd_nxt, p->snd_una,
 			p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd);
 }
 
-- 
cgit v1.2.3


From 05f147ef7c5289e9aba488b3e71a4259be5a1420 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Thu, 22 Aug 2013 17:12:50 +0200
Subject: net: sctp_probe: simplify code by using %pISc format specifier

We can simply use the %pISc format specifier that was recently added
and thus remove some code that distinguishes between IPv4 and IPv6.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/probe.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/sctp/probe.c b/net/sctp/probe.c
index e62c22535be4..cd72ae57aff1 100644
--- a/net/sctp/probe.c
+++ b/net/sctp/probe.c
@@ -155,13 +155,8 @@ static sctp_disposition_t jsctp_sf_eat_sack(struct net *net,
 			if (sp == asoc->peer.primary_path)
 				printl("*");
 
-			if (sp->ipaddr.sa.sa_family == AF_INET)
-				printl("%pI4 ", &sp->ipaddr.v4.sin_addr);
-			else
-				printl("%pI6 ", &sp->ipaddr.v6.sin6_addr);
-
-			printl("%2u %8u %8u %8u %8u %8u ",
-			       sp->state, sp->cwnd, sp->ssthresh,
+			printl("%pISc %2u %8u %8u %8u %8u %8u ",
+			       &sp->ipaddr, sp->state, sp->cwnd, sp->ssthresh,
 			       sp->flight_size, sp->partial_bytes_acked,
 			       sp->pathmtu);
 		}
-- 
cgit v1.2.3


From 1fb9026000e66ffe032b11ec724c1bc7d068198e Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 21 Aug 2013 11:24:01 +0200
Subject: mac80211: move setting WIPHY_FLAG_SUPPORTS_SCHED_SCAN into drivers

mac80211 currently sets WIPHY_FLAG_SUPPORTS_SCHED_SCAN based on whether
the start_sched_scan operation is supported or not, but that will not
be correct for all drivers, we're adding scheduled scan to the iwlmvm
driver but it depends on firmware support.

Therefore, move setting WIPHY_FLAG_SUPPORTS_SCHED_SCAN into the drivers
so that they can control it regardless of implementing the operation.

This currently only affects the TI drivers since they're the only ones
implementing scheduled scan (in a mac80211 driver.)

Acked-by: Luciano Coelho <luca@coelho.fi>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ti/wlcore/main.c | 3 ++-
 net/mac80211/main.c                   | 3 ---
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c
index d1b19c38a907..38995f90040d 100644
--- a/drivers/net/wireless/ti/wlcore/main.c
+++ b/drivers/net/wireless/ti/wlcore/main.c
@@ -5623,7 +5623,8 @@ static int wl1271_init_ieee80211(struct wl1271 *wl)
 	wl->hw->wiphy->max_remain_on_channel_duration = 5000;
 
 	wl->hw->wiphy->flags |= WIPHY_FLAG_AP_UAPSD |
-				WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL;
+				WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL |
+				WIPHY_FLAG_SUPPORTS_SCHED_SCAN;
 
 	/* make sure all our channels fit in the scanned_ch bitmask */
 	BUILD_BUG_ON(ARRAY_SIZE(wl1271_channels) +
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 25eb35b01938..21d5d44444d0 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -892,9 +892,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 	if (!local->ops->remain_on_channel)
 		local->hw.wiphy->max_remain_on_channel_duration = 5000;
 
-	if (local->ops->sched_scan_start)
-		local->hw.wiphy->flags |= WIPHY_FLAG_SUPPORTS_SCHED_SCAN;
-
 	/* mac80211 based drivers don't support internal TDLS setup */
 	if (local->hw.wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS)
 		local->hw.wiphy->flags |= WIPHY_FLAG_TDLS_EXTERNAL_SETUP;
-- 
cgit v1.2.3


From a4ef66a915b957416a89a48365aea2ec2dc551f6 Mon Sep 17 00:00:00 2001
From: Chun-Yeow Yeoh <yeohchunyeow@cozybit.com>
Date: Thu, 22 Aug 2013 10:28:58 -0700
Subject: mac80211: only respond to probe request with mesh ID

Previously, the mesh STA responds to probe request from legacy STA
but now it will only respond to legacy STA if the legacy STA does include
the specific mesh ID or wildcard mesh ID in the probe request.

The iw patch "iw: scan using meshid" can be used either by legacy STA
or by mesh STA to do active scanning by inserting the mesh ID in the
probe request frame.

Signed-off-by: Chun-Yeow Yeoh <yeohchunyeow@cozybit.com>
Acked-by: Thomas Pedersen <thomas@cozybit.com>
Acked-by: Javier Cardona <javier@cozybit.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/mesh.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 885a5f6e2c21..707ac61d63e5 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -832,6 +832,9 @@ ieee80211_mesh_rx_probe_req(struct ieee80211_sub_if_data *sdata,
 
 	ieee802_11_parse_elems(pos, len - baselen, false, &elems);
 
+	if (!elems.mesh_id)
+		return;
+
 	/* 802.11-2012 10.1.4.3.2 */
 	if ((!ether_addr_equal(mgmt->da, sdata->vif.addr) &&
 	     !is_broadcast_ether_addr(mgmt->da)) ||
-- 
cgit v1.2.3


From c4c205f3cd17b567b8e20098522416eac2e73960 Mon Sep 17 00:00:00 2001
From: Bob Copeland <me@bobcopeland.com>
Date: Fri, 23 Aug 2013 09:35:38 -0400
Subject: mac80211: assign seqnums for group QoS frames

According to 802.11-2012 9.3.2.10, paragraph 4, QoS
data frames with a group address in the Address 1 field
have sequence numbers allocated from the same counter
as non-QoS data and management frames.  Without this
flag, some drivers may not assign sequence numbers, and
in rare cases frames might get dropped.  Set the control
flag accordingly.

Signed-off-by: Bob Copeland <bob@cozybit.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/tx.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 098ae854ad3c..3456c0486b48 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -781,9 +781,11 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 	/*
 	 * Anything but QoS data that has a sequence number field
 	 * (is long enough) gets a sequence number from the global
-	 * counter.
+	 * counter.  QoS data frames with a multicast destination
+	 * also use the global counter (802.11-2012 9.3.2.10).
 	 */
-	if (!ieee80211_is_data_qos(hdr->frame_control)) {
+	if (!ieee80211_is_data_qos(hdr->frame_control) ||
+	    is_multicast_ether_addr(hdr->addr1)) {
 		/* driver should assign sequence number */
 		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
 		/* for pure STA mode without beacons, we can do it */
-- 
cgit v1.2.3


From 19504cf5f35fbe85db811fce9f4392a0cbdada2f Mon Sep 17 00:00:00 2001
From: Vladimir Kondratiev <qca_vkondrat@qca.qualcomm.com>
Date: Thu, 15 Aug 2013 14:51:28 +0300
Subject: cfg80211: add flags to cfg80211_rx_mgmt()

Add flags intended to report various auxiliary information
and introduce the NL80211_RXMGMT_FLAG_ANSWERED flag to report
that the frame was already answered by the device.

Signed-off-by: Vladimir Kondratiev <qca_vkondrat@qca.qualcomm.com>
[REPLIED->ANSWERED, reword commit message]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath6kl/wmi.c         |  7 +++----
 drivers/net/wireless/ath/wil6210/wmi.c        |  2 +-
 drivers/net/wireless/brcm80211/brcmfmac/p2p.c |  4 ++--
 drivers/net/wireless/mwifiex/util.c           |  4 ++--
 include/net/cfg80211.h                        |  3 ++-
 include/uapi/linux/nl80211.h                  | 16 ++++++++++++++++
 net/mac80211/rx.c                             |  3 +--
 net/wireless/mlme.c                           |  4 ++--
 net/wireless/nl80211.c                        |  6 ++++--
 net/wireless/nl80211.h                        |  2 +-
 10 files changed, 34 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/drivers/net/wireless/ath/ath6kl/wmi.c b/drivers/net/wireless/ath/ath6kl/wmi.c
index 87aefb4c4c23..546d5da0b894 100644
--- a/drivers/net/wireless/ath/ath6kl/wmi.c
+++ b/drivers/net/wireless/ath/ath6kl/wmi.c
@@ -568,8 +568,8 @@ static int ath6kl_wmi_rx_probe_req_event_rx(struct wmi *wmi, u8 *datap, int len,
 		   dlen, freq, vif->probe_req_report);
 
 	if (vif->probe_req_report || vif->nw_type == AP_NETWORK)
-		cfg80211_rx_mgmt(&vif->wdev, freq, 0,
-				 ev->data, dlen, GFP_ATOMIC);
+		cfg80211_rx_mgmt(&vif->wdev, freq, 0, ev->data, dlen, 0,
+				 GFP_ATOMIC);
 
 	return 0;
 }
@@ -608,8 +608,7 @@ static int ath6kl_wmi_rx_action_event_rx(struct wmi *wmi, u8 *datap, int len,
 		return -EINVAL;
 	}
 	ath6kl_dbg(ATH6KL_DBG_WMI, "rx_action: len=%u freq=%u\n", dlen, freq);
-	cfg80211_rx_mgmt(&vif->wdev, freq, 0,
-			 ev->data, dlen, GFP_ATOMIC);
+	cfg80211_rx_mgmt(&vif->wdev, freq, 0, ev->data, dlen, 0, GFP_ATOMIC);
 
 	return 0;
 }
diff --git a/drivers/net/wireless/ath/wil6210/wmi.c b/drivers/net/wireless/ath/wil6210/wmi.c
index dc8059ad4bab..21c791ee8178 100644
--- a/drivers/net/wireless/ath/wil6210/wmi.c
+++ b/drivers/net/wireless/ath/wil6210/wmi.c
@@ -339,7 +339,7 @@ static void wmi_evt_rx_mgmt(struct wil6210_priv *wil, int id, void *d, int len)
 		}
 	} else {
 		cfg80211_rx_mgmt(wil->wdev, freq, signal,
-				 (void *)rx_mgmt_frame, d_len, GFP_KERNEL);
+				 (void *)rx_mgmt_frame, d_len, 0, GFP_KERNEL);
 	}
 }
 
diff --git a/drivers/net/wireless/brcm80211/brcmfmac/p2p.c b/drivers/net/wireless/brcm80211/brcmfmac/p2p.c
index 79555f006d53..d7a974532909 100644
--- a/drivers/net/wireless/brcm80211/brcmfmac/p2p.c
+++ b/drivers/net/wireless/brcm80211/brcmfmac/p2p.c
@@ -1430,7 +1430,7 @@ int brcmf_p2p_notify_action_frame_rx(struct brcmf_if *ifp,
 					      IEEE80211_BAND_5GHZ);
 
 	wdev = &ifp->vif->wdev;
-	cfg80211_rx_mgmt(wdev, freq, 0, (u8 *)mgmt_frame, mgmt_frame_len,
+	cfg80211_rx_mgmt(wdev, freq, 0, (u8 *)mgmt_frame, mgmt_frame_len, 0,
 			 GFP_ATOMIC);
 
 	kfree(mgmt_frame);
@@ -1895,7 +1895,7 @@ s32 brcmf_p2p_notify_rx_mgmt_p2p_probereq(struct brcmf_if *ifp,
 					      IEEE80211_BAND_2GHZ :
 					      IEEE80211_BAND_5GHZ);
 
-	cfg80211_rx_mgmt(&vif->wdev, freq, 0, mgmt_frame, mgmt_frame_len,
+	cfg80211_rx_mgmt(&vif->wdev, freq, 0, mgmt_frame, mgmt_frame_len, 0,
 			 GFP_ATOMIC);
 
 	brcmf_dbg(INFO, "mgmt_frame_len (%d) , e->datalen (%d), chanspec (%04x), freq (%d)\n",
diff --git a/drivers/net/wireless/mwifiex/util.c b/drivers/net/wireless/mwifiex/util.c
index e57ac0dd3ab5..5d9e150f4111 100644
--- a/drivers/net/wireless/mwifiex/util.c
+++ b/drivers/net/wireless/mwifiex/util.c
@@ -171,8 +171,8 @@ mwifiex_process_mgmt_packet(struct mwifiex_private *priv,
 	rx_pd->rx_pkt_length = cpu_to_le16(pkt_len);
 
 	cfg80211_rx_mgmt(priv->wdev, priv->roc_cfg.chan.center_freq,
-			 CAL_RSSI(rx_pd->snr, rx_pd->nf),
-			 skb->data, pkt_len, GFP_ATOMIC);
+			 CAL_RSSI(rx_pd->snr, rx_pd->nf), skb->data, pkt_len,
+			 0, GFP_ATOMIC);
 
 	return 0;
 }
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 9ab7a0690d93..d530c54a3662 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -4056,6 +4056,7 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr,
  * @sig_dbm: signal strength in mBm, or 0 if unknown
  * @buf: Management frame (header + body)
  * @len: length of the frame data
+ * @flags: flags, as defined in enum nl80211_rxmgmt_flags
  * @gfp: context flags
  *
  * This function is called whenever an Action frame is received for a station
@@ -4067,7 +4068,7 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr,
  * driver is responsible for rejecting the frame.
  */
 bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_dbm,
-		      const u8 *buf, size_t len, gfp_t gfp);
+		      const u8 *buf, size_t len, u32 flags, gfp_t gfp);
 
 /**
  * cfg80211_mgmt_tx_status - notification of TX status for management frame
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 1f42bc3dcb9c..fde2c021b26d 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1493,6 +1493,9 @@ enum nl80211_commands {
  * @NL80211_ATTR_CSA_C_OFF_PRESP: Offset of the channel switch counter
  *	field in the probe response (%NL80211_ATTR_PROBE_RESP).
  *
+ * @NL80211_ATTR_RXMGMT_FLAGS: flags for nl80211_send_mgmt(), u32.
+ *	As specified in the &enum nl80211_rxmgmt_flags.
+ *
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
  */
@@ -1801,6 +1804,8 @@ enum nl80211_attrs {
 	NL80211_ATTR_CSA_C_OFF_BEACON,
 	NL80211_ATTR_CSA_C_OFF_PRESP,
 
+	NL80211_ATTR_RXMGMT_FLAGS,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -3901,4 +3906,15 @@ enum nl80211_crit_proto_id {
 /* maximum duration for critical protocol measures */
 #define NL80211_CRIT_PROTO_MAX_DURATION		5000 /* msec */
 
+/**
+ * enum nl80211_rxmgmt_flags - flags for received management frame.
+ *
+ * Used by cfg80211_rx_mgmt()
+ *
+ * @NL80211_RXMGMT_FLAG_ANSWERED: frame was answered by device/driver.
+ */
+enum nl80211_rxmgmt_flags {
+	NL80211_RXMGMT_FLAG_ANSWERED = 1 << 0,
+};
+
 #endif /* __LINUX_NL80211_H */
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index ffad155316a9..07901050812f 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2678,8 +2678,7 @@ ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx)
 		sig = status->signal;
 
 	if (cfg80211_rx_mgmt(&rx->sdata->wdev, status->freq, sig,
-			     rx->skb->data, rx->skb->len,
-			     GFP_ATOMIC)) {
+			     rx->skb->data, rx->skb->len, 0, GFP_ATOMIC)) {
 		if (rx->sta)
 			rx->sta->rx_packets++;
 		dev_kfree_skb(rx->skb);
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index bfac5e186f57..8d49c1ce3dea 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -621,7 +621,7 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
 }
 
 bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_mbm,
-		      const u8 *buf, size_t len, gfp_t gfp)
+		      const u8 *buf, size_t len, u32 flags, gfp_t gfp)
 {
 	struct wiphy *wiphy = wdev->wiphy;
 	struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
@@ -664,7 +664,7 @@ bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_mbm,
 		/* Indicate the received Action frame to user space */
 		if (nl80211_send_mgmt(rdev, wdev, reg->nlportid,
 				      freq, sig_mbm,
-				      buf, len, gfp))
+				      buf, len, flags, gfp))
 			continue;
 
 		result = true;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 334697de5cc0..a51269d2d488 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -10446,7 +10446,7 @@ EXPORT_SYMBOL(cfg80211_rx_unexpected_4addr_frame);
 int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
 		      struct wireless_dev *wdev, u32 nlportid,
 		      int freq, int sig_dbm,
-		      const u8 *buf, size_t len, gfp_t gfp)
+		      const u8 *buf, size_t len, u32 flags, gfp_t gfp)
 {
 	struct net_device *netdev = wdev->netdev;
 	struct sk_buff *msg;
@@ -10469,7 +10469,9 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
 	    nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, freq) ||
 	    (sig_dbm &&
 	     nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) ||
-	    nla_put(msg, NL80211_ATTR_FRAME, len, buf))
+	    nla_put(msg, NL80211_ATTR_FRAME, len, buf) ||
+	    (flags &&
+	     nla_put_u32(msg, NL80211_ATTR_RXMGMT_FLAGS, flags)))
 		goto nla_put_failure;
 
 	genlmsg_end(msg, hdr);
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index 44341bf53cfc..2c0f2b3c07cb 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -66,7 +66,7 @@ void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev,
 int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
 		      struct wireless_dev *wdev, u32 nlpid,
 		      int freq, int sig_dbm,
-		      const u8 *buf, size_t len, gfp_t gfp);
+		      const u8 *buf, size_t len, u32 flags, gfp_t gfp);
 
 void
 nl80211_radar_notify(struct cfg80211_registered_device *rdev,
-- 
cgit v1.2.3


From d70b7616d9080ec9f868fbd31db5fd4341435d61 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 23 Aug 2013 15:48:48 +0200
Subject: mac80211: ignore (E)CSA in probe response frames

Seth reports that some APs, notably the Netgear WNDAP360, send
invalid ECSA IEs in probe response frames with the operating
class and channel number both set to zero, even when no channel
switch is being done. As a result, any scan while connected to
such an AP results in the connection being dropped.

Fix this by ignoring any channel switch announcment in probe
response frames entirely, since we're connected to the AP we
will be receiving a beacon (and maybe even an action frame) if
a channel switch is done, which is sufficient.

Cc: stable@vger.kernel.org # 3.10
Reported-by: Seth Forshee <seth.forshee@canonical.com>
Tested-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/mlme.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index cc9e02d79b55..7a98d524fc81 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2851,14 +2851,6 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 		ieee80211_rx_bss_put(local, bss);
 		sdata->vif.bss_conf.beacon_rate = bss->beacon_rate;
 	}
-
-	if (!sdata->u.mgd.associated ||
-	    !ether_addr_equal(mgmt->bssid, sdata->u.mgd.associated->bssid))
-		return;
-
-	ieee80211_sta_process_chanswitch(sdata, rx_status->mactime,
-					 elems, true);
-
 }
 
 
@@ -3147,6 +3139,9 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 
 	ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems);
 
+	ieee80211_sta_process_chanswitch(sdata, rx_status->mactime,
+					 &elems, true);
+
 	if (ieee80211_sta_wmm_params(local, sdata, elems.wmm_param,
 				     elems.wmm_param_len))
 		changed |= BSS_CHANGED_QOS;
-- 
cgit v1.2.3


From d57170b1b1d71382a0d9cf31b01364a97add3f19 Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@nicira.com>
Date: Tue, 30 Jul 2013 15:39:39 -0700
Subject: openvswitch: Use RCU lock for flow dump operation.

Flow dump operation is read-only operation.  There is no need to
take ovs-lock.  Following patch use rcu-lock for dumping flows.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
---
 net/openvswitch/datapath.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index f2ed7600084e..e6fb866e4ed4 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1104,7 +1104,6 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp,
 				  u32 seq, u32 flags, u8 cmd)
 {
 	const int skb_orig_len = skb->len;
-	const struct sw_flow_actions *sf_acts;
 	struct nlattr *start;
 	struct ovs_flow_stats stats;
 	struct ovs_header *ovs_header;
@@ -1113,8 +1112,6 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp,
 	u8 tcp_flags;
 	int err;
 
-	sf_acts = ovsl_dereference(flow->sf_acts);
-
 	ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family, flags, cmd);
 	if (!ovs_header)
 		return -EMSGSIZE;
@@ -1161,6 +1158,11 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp,
 	 */
 	start = nla_nest_start(skb, OVS_FLOW_ATTR_ACTIONS);
 	if (start) {
+		const struct sw_flow_actions *sf_acts;
+
+		sf_acts = rcu_dereference_check(flow->sf_acts,
+						lockdep_ovsl_is_held());
+
 		err = actions_to_attr(sf_acts->actions, sf_acts->actions_len, skb);
 		if (!err)
 			nla_nest_end(skb, start);
@@ -1440,15 +1442,14 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	struct datapath *dp;
 	struct flow_table *table;
 
-	ovs_lock();
+	rcu_read_lock();
 	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
 	if (!dp) {
-		ovs_unlock();
+		rcu_read_unlock();
 		return -ENODEV;
 	}
 
-	table = ovsl_dereference(dp->table);
-
+	table = rcu_dereference(dp->table);
 	for (;;) {
 		struct sw_flow *flow;
 		u32 bucket, obj;
@@ -1468,7 +1469,7 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
 		cb->args[0] = bucket;
 		cb->args[1] = obj;
 	}
-	ovs_unlock();
+	rcu_read_unlock();
 	return skb->len;
 }
 
-- 
cgit v1.2.3


From 59a35d60af3ab80037ad2fa2d60671ce2818b9e4 Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@nicira.com>
Date: Tue, 30 Jul 2013 15:42:19 -0700
Subject: openvswitch: Use RCU lock for dp dump operation.

RCUfy dp-dump operation which is already read-only. This
makes all ovs dump operations lockless.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
---
 net/openvswitch/datapath.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index e6fb866e4ed4..9d97ef3c9830 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -951,9 +951,10 @@ static struct genl_ops dp_packet_genl_ops[] = {
 
 static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats)
 {
+	struct flow_table *table;
 	int i;
-	struct flow_table *table = ovsl_dereference(dp->table);
 
+	table = rcu_dereference_check(dp->table, lockdep_ovsl_is_held());
 	stats->n_flows = ovs_flow_tbl_count(table);
 
 	stats->n_hit = stats->n_missed = stats->n_lost = 0;
@@ -1665,7 +1666,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
 		goto err_destroy_local_port;
 
 	ovs_net = net_generic(ovs_dp_get_net(dp), ovs_net_id);
-	list_add_tail(&dp->list_node, &ovs_net->dps);
+	list_add_tail_rcu(&dp->list_node, &ovs_net->dps);
 
 	ovs_unlock();
 
@@ -1703,7 +1704,7 @@ static void __dp_destroy(struct datapath *dp)
 				ovs_dp_detach_port(vport);
 	}
 
-	list_del(&dp->list_node);
+	list_del_rcu(&dp->list_node);
 
 	/* OVSP_LOCAL is datapath internal port. We need to make sure that
 	 * all port in datapath are destroyed first before freeing datapath.
@@ -1808,8 +1809,8 @@ static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	int skip = cb->args[0];
 	int i = 0;
 
-	ovs_lock();
-	list_for_each_entry(dp, &ovs_net->dps, list_node) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(dp, &ovs_net->dps, list_node) {
 		if (i >= skip &&
 		    ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).portid,
 					 cb->nlh->nlmsg_seq, NLM_F_MULTI,
@@ -1817,7 +1818,7 @@ static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
 			break;
 		i++;
 	}
-	ovs_unlock();
+	rcu_read_unlock();
 
 	cb->args[0] = i;
 
-- 
cgit v1.2.3


From 76a66c7e7f6bec35bbdb0ca91db327c7c56d8c45 Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@nicira.com>
Date: Tue, 30 Jul 2013 15:45:59 -0700
Subject: openvswitch: Use non rcu hlist_del() flow table entry.

Flow table destroy is done in rcu call-back context.  Therefore
there is no need to use rcu variant of hlist_del().

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
---
 net/openvswitch/flow.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 1aa84dc58777..fca282520cee 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -302,7 +302,7 @@ void ovs_flow_tbl_destroy(struct flow_table *table)
 		int ver = table->node_ver;
 
 		hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) {
-			hlist_del_rcu(&flow->hash_node[ver]);
+			hlist_del(&flow->hash_node[ver]);
 			ovs_flow_free(flow);
 		}
 	}
-- 
cgit v1.2.3


From 2537b4dd0a70931a0c4d83d4402d0a5d78af3891 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Fri, 26 Jul 2013 14:01:54 +0200
Subject: openvswitch:: link upper device for port devices

Link upper device properly. That will make IFLA_MASTER filled up.
Set the master to port 0 of the datapath under which the port belongs.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Jesse Gross <jesse@nicira.com>
---
 net/openvswitch/vport-netdev.c | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index 5982f3f62835..09d93c13cfd6 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -25,6 +25,7 @@
 #include <linux/llc.h>
 #include <linux/rtnetlink.h>
 #include <linux/skbuff.h>
+#include <linux/openvswitch.h>
 
 #include <net/llc.h>
 
@@ -74,6 +75,15 @@ static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb)
 	return RX_HANDLER_CONSUMED;
 }
 
+static struct net_device *get_dpdev(struct datapath *dp)
+{
+	struct vport *local;
+
+	local = ovs_vport_ovsl(dp, OVSP_LOCAL);
+	BUG_ON(!local);
+	return netdev_vport_priv(local)->dev;
+}
+
 static struct vport *netdev_create(const struct vport_parms *parms)
 {
 	struct vport *vport;
@@ -103,10 +113,15 @@ static struct vport *netdev_create(const struct vport_parms *parms)
 	}
 
 	rtnl_lock();
+	err = netdev_master_upper_dev_link(netdev_vport->dev,
+					   get_dpdev(vport->dp));
+	if (err)
+		goto error_unlock;
+
 	err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook,
 					 vport);
 	if (err)
-		goto error_unlock;
+		goto error_master_upper_dev_unlink;
 
 	dev_set_promiscuity(netdev_vport->dev, 1);
 	netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH;
@@ -114,6 +129,8 @@ static struct vport *netdev_create(const struct vport_parms *parms)
 
 	return vport;
 
+error_master_upper_dev_unlink:
+	netdev_upper_dev_unlink(netdev_vport->dev, get_dpdev(vport->dp));
 error_unlock:
 	rtnl_unlock();
 error_put:
@@ -140,6 +157,7 @@ static void netdev_destroy(struct vport *vport)
 	rtnl_lock();
 	netdev_vport->dev->priv_flags &= ~IFF_OVS_DATAPATH;
 	netdev_rx_handler_unregister(netdev_vport->dev);
+	netdev_upper_dev_unlink(netdev_vport->dev, get_dpdev(vport->dp));
 	dev_set_promiscuity(netdev_vport->dev, -1);
 	rtnl_unlock();
 
-- 
cgit v1.2.3


From 2694838d608db701e5ed1e95d741cdbddd9aa864 Mon Sep 17 00:00:00 2001
From: Justin Pettit <jpettit@nicira.com>
Date: Mon, 19 Aug 2013 17:49:29 -0700
Subject: openvswitch: Fix argument descriptions in vport.c.

Signed-off-by: Justin Pettit <jpettit@nicira.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
---
 net/openvswitch/vport.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index d69e0c06dfde..6f65dbe13812 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -203,7 +203,7 @@ out:
  *	ovs_vport_set_options - modify existing vport device (for kernel callers)
  *
  * @vport: vport to modify.
- * @port: New configuration.
+ * @options: New configuration.
  *
  * Modifies an existing device with the specified configuration (which is
  * dependent on device type).  ovs_mutex must be held.
@@ -328,6 +328,7 @@ int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb)
  *
  * @vport: vport that received the packet
  * @skb: skb that was received
+ * @tun_key: tunnel (if any) that carried packet
  *
  * Must be called with rcu_read_lock.  The packet cannot be shared and
  * skb->data should point to the Ethernet header.
-- 
cgit v1.2.3


From 3fa34de67861abfc4846ccec886ca549d46ae56c Mon Sep 17 00:00:00 2001
From: Cong Wang <amwang@redhat.com>
Date: Tue, 20 Aug 2013 10:48:15 -0700
Subject: openvswitch: check CONFIG_OPENVSWITCH_GRE in makefile

Cc: Jesse Gross <jesse@nicira.com>
Cc: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: Cong Wang <amwang@redhat.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
---
 net/openvswitch/Makefile    | 5 ++++-
 net/openvswitch/vport-gre.c | 3 ---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile
index 82e4ee54a44b..ea36e99089af 100644
--- a/net/openvswitch/Makefile
+++ b/net/openvswitch/Makefile
@@ -10,10 +10,13 @@ openvswitch-y := \
 	dp_notify.o \
 	flow.o \
 	vport.o \
-	vport-gre.o \
 	vport-internal_dev.o \
 	vport-netdev.o
 
 ifneq ($(CONFIG_OPENVSWITCH_VXLAN),)
 openvswitch-y += vport-vxlan.o
 endif
+
+ifneq ($(CONFIG_OPENVSWITCH_GRE),)
+openvswitch-y += vport-gre.o
+endif
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
index 493e9775dcda..21d5073e148e 100644
--- a/net/openvswitch/vport-gre.c
+++ b/net/openvswitch/vport-gre.c
@@ -16,7 +16,6 @@
  * 02110-1301, USA
  */
 
-#ifdef CONFIG_OPENVSWITCH_GRE
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/if.h>
@@ -271,5 +270,3 @@ const struct vport_ops ovs_gre_vport_ops = {
 	.get_name	= gre_get_name,
 	.send		= gre_tnl_send,
 };
-
-#endif /* OPENVSWITCH_GRE */
-- 
cgit v1.2.3


From 03f0d916aa0317592dda11bd17c7357858719b6c Mon Sep 17 00:00:00 2001
From: Andy Zhou <azhou@nicira.com>
Date: Wed, 7 Aug 2013 20:01:00 -0700
Subject: openvswitch: Mega flow implementation

Add wildcarded flow support in kernel datapath.

Wildcarded flow can improve OVS flow set up performance by avoid sending
matching new flows to the user space program. The exact performance boost
will largely dependent on wildcarded flow hit rate.

In case all new flows hits wildcard flows, the flow set up rate is
within 5% of that of linux bridge module.

Pravin has made significant contributions to this patch. Including API
clean ups and bug fixes.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: Andy Zhou <azhou@nicira.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
---
 Documentation/networking/openvswitch.txt |   40 +
 include/uapi/linux/openvswitch.h         |    9 +-
 net/openvswitch/actions.c                |    6 +-
 net/openvswitch/datapath.c               |  140 ++-
 net/openvswitch/datapath.h               |    6 +
 net/openvswitch/flow.c                   | 1387 ++++++++++++++++++++----------
 net/openvswitch/flow.h                   |   96 ++-
 7 files changed, 1171 insertions(+), 513 deletions(-)

(limited to 'net')

diff --git a/Documentation/networking/openvswitch.txt b/Documentation/networking/openvswitch.txt
index 8fa2dd1e792e..37c20ee2455e 100644
--- a/Documentation/networking/openvswitch.txt
+++ b/Documentation/networking/openvswitch.txt
@@ -91,6 +91,46 @@ Often we ellipsize arguments not important to the discussion, e.g.:
     in_port(1), eth(...), eth_type(0x0800), ipv4(...), tcp(...)
 
 
+Wildcarded flow key format
+--------------------------
+
+A wildcarded flow is described with two sequences of Netlink attributes
+passed over the Netlink socket. A flow key, exactly as described above, and an
+optional corresponding flow mask.
+
+A wildcarded flow can represent a group of exact match flows. Each '1' bit
+in the mask specifies a exact match with the corresponding bit in the flow key.
+A '0' bit specifies a don't care bit, which will match either a '1' or '0' bit
+of a incoming packet. Using wildcarded flow can improve the flow set up rate
+by reduce the number of new flows need to be processed by the user space program.
+
+Support for the mask Netlink attribute is optional for both the kernel and user
+space program. The kernel can ignore the mask attribute, installing an exact
+match flow, or reduce the number of don't care bits in the kernel to less than
+what was specified by the user space program. In this case, variations in bits
+that the kernel does not implement will simply result in additional flow setups.
+The kernel module will also work with user space programs that neither support
+nor supply flow mask attributes.
+
+Since the kernel may ignore or modify wildcard bits, it can be difficult for
+the userspace program to know exactly what matches are installed. There are
+two possible approaches: reactively install flows as they miss the kernel
+flow table (and therefore not attempt to determine wildcard changes at all)
+or use the kernel's response messages to determine the installed wildcards.
+
+When interacting with userspace, the kernel should maintain the match portion
+of the key exactly as originally installed. This will provides a handle to
+identify the flow for all future operations. However, when reporting the
+mask of an installed flow, the mask should include any restrictions imposed
+by the kernel.
+
+The behavior when using overlapping wildcarded flows is undefined. It is the
+responsibility of the user space program to ensure that any incoming packet
+can match at most one flow, wildcarded or not. The current implementation
+performs best-effort detection of overlapping wildcarded flows and may reject
+some but not all of them. However, this behavior may change in future versions.
+
+
 Basic rule for evolving flow keys
 ---------------------------------
 
diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 52490b0e62b5..de1fa5d3780f 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2007-2011 Nicira Networks.
+ * Copyright (c) 2007-2013 Nicira, Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -379,6 +379,12 @@ struct ovs_key_nd {
  * @OVS_FLOW_ATTR_CLEAR: If present in a %OVS_FLOW_CMD_SET request, clears the
  * last-used time, accumulated TCP flags, and statistics for this flow.
  * Otherwise ignored in requests.  Never present in notifications.
+ * @OVS_FLOW_ATTR_MASK: Nested %OVS_KEY_ATTR_* attributes specifying the
+ * mask bits for wildcarded flow match. Mask bit value '1' specifies exact
+ * match with corresponding flow key bit, while mask bit value '0' specifies
+ * a wildcarded match. Omitting attribute is treated as wildcarding all
+ * corresponding fields. Optional for all requests. If not present,
+ * all flow key bits are exact match bits.
  *
  * These attributes follow the &struct ovs_header within the Generic Netlink
  * payload for %OVS_FLOW_* commands.
@@ -391,6 +397,7 @@ enum ovs_flow_attr {
 	OVS_FLOW_ATTR_TCP_FLAGS, /* 8-bit OR'd TCP flags. */
 	OVS_FLOW_ATTR_USED,      /* u64 msecs last used in monotonic time. */
 	OVS_FLOW_ATTR_CLEAR,     /* Flag to clear stats, tcp_flags, used. */
+	OVS_FLOW_ATTR_MASK,      /* Sequence of OVS_KEY_ATTR_* attributes. */
 	__OVS_FLOW_ATTR_MAX
 };
 
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index ab101f715447..1f680222f4f7 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2012 Nicira, Inc.
+ * Copyright (c) 2007-2013 Nicira, Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -376,8 +376,10 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb,
 	const struct nlattr *a;
 	int rem;
 
+	BUG_ON(!OVS_CB(skb)->pkt_key);
+
 	upcall.cmd = OVS_PACKET_CMD_ACTION;
-	upcall.key = &OVS_CB(skb)->flow->key;
+	upcall.key = OVS_CB(skb)->pkt_key;
 	upcall.userdata = NULL;
 	upcall.portid = 0;
 
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 9d97ef3c9830..d29cd9aa4a67 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2012 Nicira, Inc.
+ * Copyright (c) 2007-2013 Nicira, Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -165,7 +165,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu)
 {
 	struct datapath *dp = container_of(rcu, struct datapath, rcu);
 
-	ovs_flow_tbl_destroy((__force struct flow_table *)dp->table);
+	ovs_flow_tbl_destroy((__force struct flow_table *)dp->table, false);
 	free_percpu(dp->stats_percpu);
 	release_net(ovs_dp_get_net(dp));
 	kfree(dp->ports);
@@ -226,19 +226,18 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
 	struct sw_flow_key key;
 	u64 *stats_counter;
 	int error;
-	int key_len;
 
 	stats = this_cpu_ptr(dp->stats_percpu);
 
 	/* Extract flow from 'skb' into 'key'. */
-	error = ovs_flow_extract(skb, p->port_no, &key, &key_len);
+	error = ovs_flow_extract(skb, p->port_no, &key);
 	if (unlikely(error)) {
 		kfree_skb(skb);
 		return;
 	}
 
 	/* Look up flow. */
-	flow = ovs_flow_tbl_lookup(rcu_dereference(dp->table), &key, key_len);
+	flow = ovs_flow_lookup(rcu_dereference(dp->table), &key);
 	if (unlikely(!flow)) {
 		struct dp_upcall_info upcall;
 
@@ -253,6 +252,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
 	}
 
 	OVS_CB(skb)->flow = flow;
+	OVS_CB(skb)->pkt_key = &key;
 
 	stats_counter = &stats->n_hit;
 	ovs_flow_used(OVS_CB(skb)->flow, skb);
@@ -435,7 +435,7 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex,
 	upcall->dp_ifindex = dp_ifindex;
 
 	nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY);
-	ovs_flow_to_nlattrs(upcall_info->key, user_skb);
+	ovs_flow_to_nlattrs(upcall_info->key, upcall_info->key, user_skb);
 	nla_nest_end(user_skb, nla);
 
 	if (upcall_info->userdata)
@@ -468,7 +468,7 @@ static int flush_flows(struct datapath *dp)
 
 	rcu_assign_pointer(dp->table, new_table);
 
-	ovs_flow_tbl_deferred_destroy(old_table);
+	ovs_flow_tbl_destroy(old_table, true);
 	return 0;
 }
 
@@ -611,10 +611,12 @@ static int validate_tp_port(const struct sw_flow_key *flow_key)
 static int validate_and_copy_set_tun(const struct nlattr *attr,
 				     struct sw_flow_actions **sfa)
 {
-	struct ovs_key_ipv4_tunnel tun_key;
+	struct sw_flow_match match;
+	struct sw_flow_key key;
 	int err, start;
 
-	err = ovs_ipv4_tun_from_nlattr(nla_data(attr), &tun_key);
+	ovs_match_init(&match, &key, NULL);
+	err = ovs_ipv4_tun_from_nlattr(nla_data(attr), &match, false);
 	if (err)
 		return err;
 
@@ -622,7 +624,8 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
 	if (start < 0)
 		return start;
 
-	err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &tun_key, sizeof(tun_key));
+	err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &match.key->tun_key,
+			sizeof(match.key->tun_key));
 	add_nested_action_end(*sfa, start);
 
 	return err;
@@ -857,7 +860,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 	struct ethhdr *eth;
 	int len;
 	int err;
-	int key_len;
 
 	err = -EINVAL;
 	if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] ||
@@ -890,11 +892,11 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 	if (IS_ERR(flow))
 		goto err_kfree_skb;
 
-	err = ovs_flow_extract(packet, -1, &flow->key, &key_len);
+	err = ovs_flow_extract(packet, -1, &flow->key);
 	if (err)
 		goto err_flow_free;
 
-	err = ovs_flow_metadata_from_nlattrs(flow, key_len, a[OVS_PACKET_ATTR_KEY]);
+	err = ovs_flow_metadata_from_nlattrs(flow, a[OVS_PACKET_ATTR_KEY]);
 	if (err)
 		goto err_flow_free;
 	acts = ovs_flow_actions_alloc(nla_len(a[OVS_PACKET_ATTR_ACTIONS]));
@@ -908,6 +910,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 		goto err_flow_free;
 
 	OVS_CB(packet)->flow = flow;
+	OVS_CB(packet)->pkt_key = &flow->key;
 	packet->priority = flow->key.phy.priority;
 	packet->mark = flow->key.phy.skb_mark;
 
@@ -922,13 +925,13 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 	local_bh_enable();
 	rcu_read_unlock();
 
-	ovs_flow_free(flow);
+	ovs_flow_free(flow, false);
 	return err;
 
 err_unlock:
 	rcu_read_unlock();
 err_flow_free:
-	ovs_flow_free(flow);
+	ovs_flow_free(flow, false);
 err_kfree_skb:
 	kfree_skb(packet);
 err:
@@ -1045,7 +1048,8 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
 		if (!start)
 			return -EMSGSIZE;
 
-		err = ovs_ipv4_tun_to_nlattr(skb, nla_data(ovs_key));
+		err = ovs_ipv4_tun_to_nlattr(skb, nla_data(ovs_key),
+					     nla_data(ovs_key));
 		if (err)
 			return err;
 		nla_nest_end(skb, start);
@@ -1093,6 +1097,7 @@ static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts)
 {
 	return NLMSG_ALIGN(sizeof(struct ovs_header))
 		+ nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_KEY */
+		+ nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_MASK */
 		+ nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */
 		+ nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */
 		+ nla_total_size(8) /* OVS_FLOW_ATTR_USED */
@@ -1119,12 +1124,25 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp,
 
 	ovs_header->dp_ifindex = get_dpifindex(dp);
 
+	/* Fill flow key. */
 	nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY);
 	if (!nla)
 		goto nla_put_failure;
-	err = ovs_flow_to_nlattrs(&flow->key, skb);
+
+	err = ovs_flow_to_nlattrs(&flow->unmasked_key,
+			&flow->unmasked_key, skb);
+	if (err)
+		goto error;
+	nla_nest_end(skb, nla);
+
+	nla = nla_nest_start(skb, OVS_FLOW_ATTR_MASK);
+	if (!nla)
+		goto nla_put_failure;
+
+	err = ovs_flow_to_nlattrs(&flow->key, &flow->mask->key, skb);
 	if (err)
 		goto error;
+
 	nla_nest_end(skb, nla);
 
 	spin_lock_bh(&flow->lock);
@@ -1214,20 +1232,24 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
 {
 	struct nlattr **a = info->attrs;
 	struct ovs_header *ovs_header = info->userhdr;
-	struct sw_flow_key key;
-	struct sw_flow *flow;
+	struct sw_flow_key key, masked_key;
+	struct sw_flow *flow = NULL;
+	struct sw_flow_mask mask;
 	struct sk_buff *reply;
 	struct datapath *dp;
 	struct flow_table *table;
 	struct sw_flow_actions *acts = NULL;
+	struct sw_flow_match match;
 	int error;
-	int key_len;
 
 	/* Extract key. */
 	error = -EINVAL;
 	if (!a[OVS_FLOW_ATTR_KEY])
 		goto error;
-	error = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]);
+
+	ovs_match_init(&match, &key, &mask);
+	error = ovs_match_from_nlattrs(&match,
+			a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]);
 	if (error)
 		goto error;
 
@@ -1238,9 +1260,13 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
 		if (IS_ERR(acts))
 			goto error;
 
-		error = validate_and_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &key,  0, &acts);
-		if (error)
+		ovs_flow_key_mask(&masked_key, &key, &mask);
+		error = validate_and_copy_actions(a[OVS_FLOW_ATTR_ACTIONS],
+						  &masked_key, 0, &acts);
+		if (error) {
+			OVS_NLERR("Flow actions may not be safe on all matching packets.\n");
 			goto err_kfree;
+		}
 	} else if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW) {
 		error = -EINVAL;
 		goto error;
@@ -1253,8 +1279,11 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
 		goto err_unlock_ovs;
 
 	table = ovsl_dereference(dp->table);
-	flow = ovs_flow_tbl_lookup(table, &key, key_len);
+
+	/* Check if this is a duplicate flow */
+	flow = ovs_flow_lookup(table, &key);
 	if (!flow) {
+		struct sw_flow_mask *mask_p;
 		/* Bail out if we're not allowed to create a new flow. */
 		error = -ENOENT;
 		if (info->genlhdr->cmd == OVS_FLOW_CMD_SET)
@@ -1267,7 +1296,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
 			new_table = ovs_flow_tbl_expand(table);
 			if (!IS_ERR(new_table)) {
 				rcu_assign_pointer(dp->table, new_table);
-				ovs_flow_tbl_deferred_destroy(table);
+				ovs_flow_tbl_destroy(table, true);
 				table = ovsl_dereference(dp->table);
 			}
 		}
@@ -1280,14 +1309,30 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
 		}
 		clear_stats(flow);
 
+		flow->key = masked_key;
+		flow->unmasked_key = key;
+
+		/* Make sure mask is unique in the system */
+		mask_p = ovs_sw_flow_mask_find(table, &mask);
+		if (!mask_p) {
+			/* Allocate a new mask if none exsits. */
+			mask_p = ovs_sw_flow_mask_alloc();
+			if (!mask_p)
+				goto err_flow_free;
+			mask_p->key = mask.key;
+			mask_p->range = mask.range;
+			ovs_sw_flow_mask_insert(table, mask_p);
+		}
+
+		ovs_sw_flow_mask_add_ref(mask_p);
+		flow->mask = mask_p;
 		rcu_assign_pointer(flow->sf_acts, acts);
 
 		/* Put flow in bucket. */
-		ovs_flow_tbl_insert(table, flow, &key, key_len);
+		ovs_flow_insert(table, flow);
 
 		reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid,
-						info->snd_seq,
-						OVS_FLOW_CMD_NEW);
+						info->snd_seq, OVS_FLOW_CMD_NEW);
 	} else {
 		/* We found a matching flow. */
 		struct sw_flow_actions *old_acts;
@@ -1303,6 +1348,13 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
 		    info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL))
 			goto err_unlock_ovs;
 
+		/* The unmasked key has to be the same for flow updates. */
+		error = -EINVAL;
+		if (!ovs_flow_cmp_unmasked_key(flow, &key, match.range.end)) {
+			OVS_NLERR("Flow modification message rejected, unmasked key does not match.\n");
+			goto err_unlock_ovs;
+		}
+
 		/* Update actions. */
 		old_acts = ovsl_dereference(flow->sf_acts);
 		rcu_assign_pointer(flow->sf_acts, acts);
@@ -1327,6 +1379,8 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
 				ovs_dp_flow_multicast_group.id, PTR_ERR(reply));
 	return 0;
 
+err_flow_free:
+	ovs_flow_free(flow, false);
 err_unlock_ovs:
 	ovs_unlock();
 err_kfree:
@@ -1344,12 +1398,16 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
 	struct sw_flow *flow;
 	struct datapath *dp;
 	struct flow_table *table;
+	struct sw_flow_match match;
 	int err;
-	int key_len;
 
-	if (!a[OVS_FLOW_ATTR_KEY])
+	if (!a[OVS_FLOW_ATTR_KEY]) {
+		OVS_NLERR("Flow get message rejected, Key attribute missing.\n");
 		return -EINVAL;
-	err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]);
+	}
+
+	ovs_match_init(&match, &key, NULL);
+	err = ovs_match_from_nlattrs(&match, a[OVS_FLOW_ATTR_KEY], NULL);
 	if (err)
 		return err;
 
@@ -1361,7 +1419,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	table = ovsl_dereference(dp->table);
-	flow = ovs_flow_tbl_lookup(table, &key, key_len);
+	flow = ovs_flow_lookup_unmasked_key(table, &match);
 	if (!flow) {
 		err = -ENOENT;
 		goto unlock;
@@ -1390,8 +1448,8 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
 	struct sw_flow *flow;
 	struct datapath *dp;
 	struct flow_table *table;
+	struct sw_flow_match match;
 	int err;
-	int key_len;
 
 	ovs_lock();
 	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
@@ -1404,12 +1462,14 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
 		err = flush_flows(dp);
 		goto unlock;
 	}
-	err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]);
+
+	ovs_match_init(&match, &key, NULL);
+	err = ovs_match_from_nlattrs(&match, a[OVS_FLOW_ATTR_KEY], NULL);
 	if (err)
 		goto unlock;
 
 	table = ovsl_dereference(dp->table);
-	flow = ovs_flow_tbl_lookup(table, &key, key_len);
+	flow = ovs_flow_lookup_unmasked_key(table, &match);
 	if (!flow) {
 		err = -ENOENT;
 		goto unlock;
@@ -1421,13 +1481,13 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
 		goto unlock;
 	}
 
-	ovs_flow_tbl_remove(table, flow);
+	ovs_flow_remove(table, flow);
 
 	err = ovs_flow_cmd_fill_info(flow, dp, reply, info->snd_portid,
 				     info->snd_seq, 0, OVS_FLOW_CMD_DEL);
 	BUG_ON(err < 0);
 
-	ovs_flow_deferred_free(flow);
+	ovs_flow_free(flow, true);
 	ovs_unlock();
 
 	ovs_notify(reply, info, &ovs_dp_flow_multicast_group);
@@ -1457,7 +1517,7 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
 
 		bucket = cb->args[0];
 		obj = cb->args[1];
-		flow = ovs_flow_tbl_next(table, &bucket, &obj);
+		flow = ovs_flow_dump_next(table, &bucket, &obj);
 		if (!flow)
 			break;
 
@@ -1680,7 +1740,7 @@ err_destroy_ports_array:
 err_destroy_percpu:
 	free_percpu(dp->stats_percpu);
 err_destroy_table:
-	ovs_flow_tbl_destroy(ovsl_dereference(dp->table));
+	ovs_flow_tbl_destroy(ovsl_dereference(dp->table), false);
 err_free_dp:
 	release_net(ovs_dp_get_net(dp));
 	kfree(dp);
@@ -2287,7 +2347,7 @@ static void rehash_flow_table(struct work_struct *work)
 			new_table = ovs_flow_tbl_rehash(old_table);
 			if (!IS_ERR(new_table)) {
 				rcu_assign_pointer(dp->table, new_table);
-				ovs_flow_tbl_deferred_destroy(old_table);
+				ovs_flow_tbl_destroy(old_table, true);
 			}
 		}
 	}
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index a91486484916..4d109c176ef3 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -88,11 +88,13 @@ struct datapath {
 /**
  * struct ovs_skb_cb - OVS data in skb CB
  * @flow: The flow associated with this packet.  May be %NULL if no flow.
+ * @pkt_key: The flow information extracted from the packet.  Must be nonnull.
  * @tun_key: Key for the tunnel that encapsulated this packet. NULL if the
  * packet is not being tunneled.
  */
 struct ovs_skb_cb {
 	struct sw_flow		*flow;
+	struct sw_flow_key	*pkt_key;
 	struct ovs_key_ipv4_tunnel  *tun_key;
 };
 #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)
@@ -183,4 +185,8 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq,
 
 int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb);
 void ovs_dp_notify_wq(struct work_struct *work);
+
+#define OVS_NLERR(fmt, ...) \
+	pr_info_once("netlink: " fmt, ##__VA_ARGS__)
+
 #endif /* datapath.h */
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index fca282520cee..1fceb9653598 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2011 Nicira, Inc.
+ * Copyright (c) 2007-2013 Nicira, Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -46,6 +46,184 @@
 
 static struct kmem_cache *flow_cache;
 
+static void ovs_sw_flow_mask_set(struct sw_flow_mask *mask,
+		struct sw_flow_key_range *range, u8 val);
+
+static void update_range__(struct sw_flow_match *match,
+			  size_t offset, size_t size, bool is_mask)
+{
+	struct sw_flow_key_range *range = NULL;
+	size_t start = offset;
+	size_t end = offset + size;
+
+	if (!is_mask)
+		range = &match->range;
+	else if (match->mask)
+		range = &match->mask->range;
+
+	if (!range)
+		return;
+
+	if (range->start == range->end) {
+		range->start = start;
+		range->end = end;
+		return;
+	}
+
+	if (range->start > start)
+		range->start = start;
+
+	if (range->end < end)
+		range->end = end;
+}
+
+#define SW_FLOW_KEY_PUT(match, field, value, is_mask) \
+	do { \
+		update_range__(match, offsetof(struct sw_flow_key, field),  \
+				     sizeof((match)->key->field), is_mask); \
+		if (is_mask) {						    \
+			if ((match)->mask)				    \
+				(match)->mask->key.field = value;	    \
+		} else {                                                    \
+			(match)->key->field = value;		            \
+		}                                                           \
+	} while (0)
+
+#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \
+	do { \
+		update_range__(match, offsetof(struct sw_flow_key, field),  \
+				len, is_mask);                              \
+		if (is_mask) {						    \
+			if ((match)->mask)				    \
+				memcpy(&(match)->mask->key.field, value_p, len);\
+		} else {                                                    \
+			memcpy(&(match)->key->field, value_p, len);         \
+		}                                                           \
+	} while (0)
+
+void ovs_match_init(struct sw_flow_match *match,
+		    struct sw_flow_key *key,
+		    struct sw_flow_mask *mask)
+{
+	memset(match, 0, sizeof(*match));
+	match->key = key;
+	match->mask = mask;
+
+	memset(key, 0, sizeof(*key));
+
+	if (mask) {
+		memset(&mask->key, 0, sizeof(mask->key));
+		mask->range.start = mask->range.end = 0;
+	}
+}
+
+static bool ovs_match_validate(const struct sw_flow_match *match,
+		u64 key_attrs, u64 mask_attrs)
+{
+	u64 key_expected = 1 << OVS_KEY_ATTR_ETHERNET;
+	u64 mask_allowed = key_attrs;  /* At most allow all key attributes */
+
+	/* The following mask attributes allowed only if they
+	 * pass the validation tests. */
+	mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4)
+			| (1 << OVS_KEY_ATTR_IPV6)
+			| (1 << OVS_KEY_ATTR_TCP)
+			| (1 << OVS_KEY_ATTR_UDP)
+			| (1 << OVS_KEY_ATTR_ICMP)
+			| (1 << OVS_KEY_ATTR_ICMPV6)
+			| (1 << OVS_KEY_ATTR_ARP)
+			| (1 << OVS_KEY_ATTR_ND));
+
+	/* Always allowed mask fields. */
+	mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL)
+		       | (1 << OVS_KEY_ATTR_IN_PORT)
+		       | (1 << OVS_KEY_ATTR_ETHERTYPE));
+
+	/* Check key attributes. */
+	if (match->key->eth.type == htons(ETH_P_ARP)
+			|| match->key->eth.type == htons(ETH_P_RARP)) {
+		key_expected |= 1 << OVS_KEY_ATTR_ARP;
+		if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+			mask_allowed |= 1 << OVS_KEY_ATTR_ARP;
+	}
+
+	if (match->key->eth.type == htons(ETH_P_IP)) {
+		key_expected |= 1 << OVS_KEY_ATTR_IPV4;
+		if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+			mask_allowed |= 1 << OVS_KEY_ATTR_IPV4;
+
+		if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
+			if (match->key->ip.proto == IPPROTO_UDP) {
+				key_expected |= 1 << OVS_KEY_ATTR_UDP;
+				if (match->mask && (match->mask->key.ip.proto == 0xff))
+					mask_allowed |= 1 << OVS_KEY_ATTR_UDP;
+			}
+
+			if (match->key->ip.proto == IPPROTO_TCP) {
+				key_expected |= 1 << OVS_KEY_ATTR_TCP;
+				if (match->mask && (match->mask->key.ip.proto == 0xff))
+					mask_allowed |= 1 << OVS_KEY_ATTR_TCP;
+			}
+
+			if (match->key->ip.proto == IPPROTO_ICMP) {
+				key_expected |= 1 << OVS_KEY_ATTR_ICMP;
+				if (match->mask && (match->mask->key.ip.proto == 0xff))
+					mask_allowed |= 1 << OVS_KEY_ATTR_ICMP;
+			}
+		}
+	}
+
+	if (match->key->eth.type == htons(ETH_P_IPV6)) {
+		key_expected |= 1 << OVS_KEY_ATTR_IPV6;
+		if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+			mask_allowed |= 1 << OVS_KEY_ATTR_IPV6;
+
+		if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
+			if (match->key->ip.proto == IPPROTO_UDP) {
+				key_expected |= 1 << OVS_KEY_ATTR_UDP;
+				if (match->mask && (match->mask->key.ip.proto == 0xff))
+					mask_allowed |= 1 << OVS_KEY_ATTR_UDP;
+			}
+
+			if (match->key->ip.proto == IPPROTO_TCP) {
+				key_expected |= 1 << OVS_KEY_ATTR_TCP;
+				if (match->mask && (match->mask->key.ip.proto == 0xff))
+					mask_allowed |= 1 << OVS_KEY_ATTR_TCP;
+			}
+
+			if (match->key->ip.proto == IPPROTO_ICMPV6) {
+				key_expected |= 1 << OVS_KEY_ATTR_ICMPV6;
+				if (match->mask && (match->mask->key.ip.proto == 0xff))
+					mask_allowed |= 1 << OVS_KEY_ATTR_ICMPV6;
+
+				if (match->key->ipv6.tp.src ==
+						htons(NDISC_NEIGHBOUR_SOLICITATION) ||
+				    match->key->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) {
+					key_expected |= 1 << OVS_KEY_ATTR_ND;
+					if (match->mask && (match->mask->key.ipv6.tp.src == htons(0xffff)))
+						mask_allowed |= 1 << OVS_KEY_ATTR_ND;
+				}
+			}
+		}
+	}
+
+	if ((key_attrs & key_expected) != key_expected) {
+		/* Key attributes check failed. */
+		OVS_NLERR("Missing expected key attributes (key_attrs=%llx, expected=%llx).\n",
+				key_attrs, key_expected);
+		return false;
+	}
+
+	if ((mask_attrs & mask_allowed) != mask_attrs) {
+		/* Mask attributes check failed. */
+		OVS_NLERR("Contain more than allowed mask fields (mask_attrs=%llx, mask_allowed=%llx).\n",
+				mask_attrs, mask_allowed);
+		return false;
+	}
+
+	return true;
+}
+
 static int check_header(struct sk_buff *skb, int len)
 {
 	if (unlikely(skb->len < len))
@@ -121,12 +299,7 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies)
 	return cur_ms - idle_ms;
 }
 
-#define SW_FLOW_KEY_OFFSET(field)		\
-	(offsetof(struct sw_flow_key, field) +	\
-	 FIELD_SIZEOF(struct sw_flow_key, field))
-
-static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key,
-			 int *key_lenp)
+static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key)
 {
 	unsigned int nh_ofs = skb_network_offset(skb);
 	unsigned int nh_len;
@@ -136,8 +309,6 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key,
 	__be16 frag_off;
 	int err;
 
-	*key_lenp = SW_FLOW_KEY_OFFSET(ipv6.label);
-
 	err = check_header(skb, nh_ofs + sizeof(*nh));
 	if (unlikely(err))
 		return err;
@@ -176,6 +347,21 @@ static bool icmp6hdr_ok(struct sk_buff *skb)
 				  sizeof(struct icmp6hdr));
 }
 
+void ovs_flow_key_mask(struct sw_flow_key *dst, const struct sw_flow_key *src,
+		       const struct sw_flow_mask *mask)
+{
+	u8 *m = (u8 *)&mask->key + mask->range.start;
+	u8 *s = (u8 *)src + mask->range.start;
+	u8 *d = (u8 *)dst + mask->range.start;
+	int i;
+
+	memset(dst, 0, sizeof(*dst));
+	for (i = 0; i < ovs_sw_flow_mask_size_roundup(mask); i++) {
+		*d = *s & *m;
+		d++, s++, m++;
+	}
+}
+
 #define TCP_FLAGS_OFFSET 13
 #define TCP_FLAG_MASK 0x3f
 
@@ -224,6 +410,7 @@ struct sw_flow *ovs_flow_alloc(void)
 
 	spin_lock_init(&flow->lock);
 	flow->sf_acts = NULL;
+	flow->mask = NULL;
 
 	return flow;
 }
@@ -263,7 +450,7 @@ static void free_buckets(struct flex_array *buckets)
 	flex_array_free(buckets);
 }
 
-struct flow_table *ovs_flow_tbl_alloc(int new_size)
+static struct flow_table *__flow_tbl_alloc(int new_size)
 {
 	struct flow_table *table = kmalloc(sizeof(*table), GFP_KERNEL);
 
@@ -281,17 +468,15 @@ struct flow_table *ovs_flow_tbl_alloc(int new_size)
 	table->node_ver = 0;
 	table->keep_flows = false;
 	get_random_bytes(&table->hash_seed, sizeof(u32));
+	table->mask_list = NULL;
 
 	return table;
 }
 
-void ovs_flow_tbl_destroy(struct flow_table *table)
+static void __flow_tbl_destroy(struct flow_table *table)
 {
 	int i;
 
-	if (!table)
-		return;
-
 	if (table->keep_flows)
 		goto skip_flows;
 
@@ -303,31 +488,55 @@ void ovs_flow_tbl_destroy(struct flow_table *table)
 
 		hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) {
 			hlist_del(&flow->hash_node[ver]);
-			ovs_flow_free(flow);
+			ovs_flow_free(flow, false);
 		}
 	}
 
+	BUG_ON(!list_empty(table->mask_list));
+	kfree(table->mask_list);
+
 skip_flows:
 	free_buckets(table->buckets);
 	kfree(table);
 }
 
+struct flow_table *ovs_flow_tbl_alloc(int new_size)
+{
+	struct flow_table *table = __flow_tbl_alloc(new_size);
+
+	if (!table)
+		return NULL;
+
+	table->mask_list = kmalloc(sizeof(struct list_head), GFP_KERNEL);
+	if (!table->mask_list) {
+		table->keep_flows = true;
+		__flow_tbl_destroy(table);
+		return NULL;
+	}
+	INIT_LIST_HEAD(table->mask_list);
+
+	return table;
+}
+
 static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
 {
 	struct flow_table *table = container_of(rcu, struct flow_table, rcu);
 
-	ovs_flow_tbl_destroy(table);
+	__flow_tbl_destroy(table);
 }
 
-void ovs_flow_tbl_deferred_destroy(struct flow_table *table)
+void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred)
 {
 	if (!table)
 		return;
 
-	call_rcu(&table->rcu, flow_tbl_destroy_rcu_cb);
+	if (deferred)
+		call_rcu(&table->rcu, flow_tbl_destroy_rcu_cb);
+	else
+		__flow_tbl_destroy(table);
 }
 
-struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *last)
+struct sw_flow *ovs_flow_dump_next(struct flow_table *table, u32 *bucket, u32 *last)
 {
 	struct sw_flow *flow;
 	struct hlist_head *head;
@@ -353,11 +562,13 @@ struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *la
 	return NULL;
 }
 
-static void __flow_tbl_insert(struct flow_table *table, struct sw_flow *flow)
+static void __tbl_insert(struct flow_table *table, struct sw_flow *flow)
 {
 	struct hlist_head *head;
+
 	head = find_bucket(table, flow->hash);
 	hlist_add_head_rcu(&flow->hash_node[table->node_ver], head);
+
 	table->count++;
 }
 
@@ -377,8 +588,10 @@ static void flow_table_copy_flows(struct flow_table *old, struct flow_table *new
 		head = flex_array_get(old->buckets, i);
 
 		hlist_for_each_entry(flow, head, hash_node[old_ver])
-			__flow_tbl_insert(new, flow);
+			__tbl_insert(new, flow);
 	}
+
+	new->mask_list = old->mask_list;
 	old->keep_flows = true;
 }
 
@@ -386,7 +599,7 @@ static struct flow_table *__flow_tbl_rehash(struct flow_table *table, int n_buck
 {
 	struct flow_table *new_table;
 
-	new_table = ovs_flow_tbl_alloc(n_buckets);
+	new_table = __flow_tbl_alloc(n_buckets);
 	if (!new_table)
 		return ERR_PTR(-ENOMEM);
 
@@ -405,28 +618,30 @@ struct flow_table *ovs_flow_tbl_expand(struct flow_table *table)
 	return __flow_tbl_rehash(table, table->n_buckets * 2);
 }
 
-void ovs_flow_free(struct sw_flow *flow)
+static void __flow_free(struct sw_flow *flow)
 {
-	if (unlikely(!flow))
-		return;
-
 	kfree((struct sf_flow_acts __force *)flow->sf_acts);
 	kmem_cache_free(flow_cache, flow);
 }
 
-/* RCU callback used by ovs_flow_deferred_free. */
 static void rcu_free_flow_callback(struct rcu_head *rcu)
 {
 	struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu);
 
-	ovs_flow_free(flow);
+	__flow_free(flow);
 }
 
-/* Schedules 'flow' to be freed after the next RCU grace period.
- * The caller must hold rcu_read_lock for this to be sensible. */
-void ovs_flow_deferred_free(struct sw_flow *flow)
+void ovs_flow_free(struct sw_flow *flow, bool deferred)
 {
-	call_rcu(&flow->rcu, rcu_free_flow_callback);
+	if (!flow)
+		return;
+
+	ovs_sw_flow_mask_del_ref(flow->mask, deferred);
+
+	if (deferred)
+		call_rcu(&flow->rcu, rcu_free_flow_callback);
+	else
+		__flow_free(flow);
 }
 
 /* Schedules 'sf_acts' to be freed after the next RCU grace period.
@@ -497,18 +712,15 @@ static __be16 parse_ethertype(struct sk_buff *skb)
 }
 
 static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
-			int *key_lenp, int nh_len)
+			int nh_len)
 {
 	struct icmp6hdr *icmp = icmp6_hdr(skb);
-	int error = 0;
-	int key_len;
 
 	/* The ICMPv6 type and code fields use the 16-bit transport port
 	 * fields, so we need to store them in 16-bit network byte order.
 	 */
 	key->ipv6.tp.src = htons(icmp->icmp6_type);
 	key->ipv6.tp.dst = htons(icmp->icmp6_code);
-	key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
 
 	if (icmp->icmp6_code == 0 &&
 	    (icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION ||
@@ -517,21 +729,17 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
 		struct nd_msg *nd;
 		int offset;
 
-		key_len = SW_FLOW_KEY_OFFSET(ipv6.nd);
-
 		/* In order to process neighbor discovery options, we need the
 		 * entire packet.
 		 */
 		if (unlikely(icmp_len < sizeof(*nd)))
-			goto out;
-		if (unlikely(skb_linearize(skb))) {
-			error = -ENOMEM;
-			goto out;
-		}
+			return 0;
+
+		if (unlikely(skb_linearize(skb)))
+			return -ENOMEM;
 
 		nd = (struct nd_msg *)skb_transport_header(skb);
 		key->ipv6.nd.target = nd->target;
-		key_len = SW_FLOW_KEY_OFFSET(ipv6.nd);
 
 		icmp_len -= sizeof(*nd);
 		offset = 0;
@@ -541,7 +749,7 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
 			int opt_len = nd_opt->nd_opt_len * 8;
 
 			if (unlikely(!opt_len || opt_len > icmp_len))
-				goto invalid;
+				return 0;
 
 			/* Store the link layer address if the appropriate
 			 * option is provided.  It is considered an error if
@@ -566,16 +774,14 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
 		}
 	}
 
-	goto out;
+	return 0;
 
 invalid:
 	memset(&key->ipv6.nd.target, 0, sizeof(key->ipv6.nd.target));
 	memset(key->ipv6.nd.sll, 0, sizeof(key->ipv6.nd.sll));
 	memset(key->ipv6.nd.tll, 0, sizeof(key->ipv6.nd.tll));
 
-out:
-	*key_lenp = key_len;
-	return error;
+	return 0;
 }
 
 /**
@@ -584,7 +790,6 @@ out:
  * Ethernet header
  * @in_port: port number on which @skb was received.
  * @key: output flow key
- * @key_lenp: length of output flow key
  *
  * The caller must ensure that skb->len >= ETH_HLEN.
  *
@@ -602,11 +807,9 @@ out:
  *      of a correct length, otherwise the same as skb->network_header.
  *      For other key->eth.type values it is left untouched.
  */
-int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
-		 int *key_lenp)
+int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key)
 {
-	int error = 0;
-	int key_len = SW_FLOW_KEY_OFFSET(eth);
+	int error;
 	struct ethhdr *eth;
 
 	memset(key, 0, sizeof(*key));
@@ -649,15 +852,13 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
 		struct iphdr *nh;
 		__be16 offset;
 
-		key_len = SW_FLOW_KEY_OFFSET(ipv4.addr);
-
 		error = check_iphdr(skb);
 		if (unlikely(error)) {
 			if (error == -EINVAL) {
 				skb->transport_header = skb->network_header;
 				error = 0;
 			}
-			goto out;
+			return error;
 		}
 
 		nh = ip_hdr(skb);
@@ -671,7 +872,7 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
 		offset = nh->frag_off & htons(IP_OFFSET);
 		if (offset) {
 			key->ip.frag = OVS_FRAG_TYPE_LATER;
-			goto out;
+			return 0;
 		}
 		if (nh->frag_off & htons(IP_MF) ||
 			 skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
@@ -679,21 +880,18 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
 
 		/* Transport layer. */
 		if (key->ip.proto == IPPROTO_TCP) {
-			key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
 			if (tcphdr_ok(skb)) {
 				struct tcphdr *tcp = tcp_hdr(skb);
 				key->ipv4.tp.src = tcp->source;
 				key->ipv4.tp.dst = tcp->dest;
 			}
 		} else if (key->ip.proto == IPPROTO_UDP) {
-			key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
 			if (udphdr_ok(skb)) {
 				struct udphdr *udp = udp_hdr(skb);
 				key->ipv4.tp.src = udp->source;
 				key->ipv4.tp.dst = udp->dest;
 			}
 		} else if (key->ip.proto == IPPROTO_ICMP) {
-			key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
 			if (icmphdr_ok(skb)) {
 				struct icmphdr *icmp = icmp_hdr(skb);
 				/* The ICMP type and code fields use the 16-bit
@@ -722,53 +920,49 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
 			memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst));
 			memcpy(key->ipv4.arp.sha, arp->ar_sha, ETH_ALEN);
 			memcpy(key->ipv4.arp.tha, arp->ar_tha, ETH_ALEN);
-			key_len = SW_FLOW_KEY_OFFSET(ipv4.arp);
 		}
 	} else if (key->eth.type == htons(ETH_P_IPV6)) {
 		int nh_len;             /* IPv6 Header + Extensions */
 
-		nh_len = parse_ipv6hdr(skb, key, &key_len);
+		nh_len = parse_ipv6hdr(skb, key);
 		if (unlikely(nh_len < 0)) {
-			if (nh_len == -EINVAL)
+			if (nh_len == -EINVAL) {
 				skb->transport_header = skb->network_header;
-			else
+				error = 0;
+			} else {
 				error = nh_len;
-			goto out;
+			}
+			return error;
 		}
 
 		if (key->ip.frag == OVS_FRAG_TYPE_LATER)
-			goto out;
+			return 0;
 		if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
 			key->ip.frag = OVS_FRAG_TYPE_FIRST;
 
 		/* Transport layer. */
 		if (key->ip.proto == NEXTHDR_TCP) {
-			key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
 			if (tcphdr_ok(skb)) {
 				struct tcphdr *tcp = tcp_hdr(skb);
 				key->ipv6.tp.src = tcp->source;
 				key->ipv6.tp.dst = tcp->dest;
 			}
 		} else if (key->ip.proto == NEXTHDR_UDP) {
-			key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
 			if (udphdr_ok(skb)) {
 				struct udphdr *udp = udp_hdr(skb);
 				key->ipv6.tp.src = udp->source;
 				key->ipv6.tp.dst = udp->dest;
 			}
 		} else if (key->ip.proto == NEXTHDR_ICMP) {
-			key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
 			if (icmp6hdr_ok(skb)) {
-				error = parse_icmpv6(skb, key, &key_len, nh_len);
-				if (error < 0)
-					goto out;
+				error = parse_icmpv6(skb, key, nh_len);
+				if (error)
+					return error;
 			}
 		}
 	}
 
-out:
-	*key_lenp = key_len;
-	return error;
+	return 0;
 }
 
 static u32 ovs_flow_hash(const struct sw_flow_key *key, int key_start, int key_len)
@@ -777,7 +971,7 @@ static u32 ovs_flow_hash(const struct sw_flow_key *key, int key_start, int key_l
 		      DIV_ROUND_UP(key_len - key_start, sizeof(u32)), 0);
 }
 
-static int flow_key_start(struct sw_flow_key *key)
+static int flow_key_start(const struct sw_flow_key *key)
 {
 	if (key->tun_key.ipv4_dst)
 		return 0;
@@ -785,39 +979,95 @@ static int flow_key_start(struct sw_flow_key *key)
 		return offsetof(struct sw_flow_key, phy);
 }
 
-struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table,
-				struct sw_flow_key *key, int key_len)
+static bool __cmp_key(const struct sw_flow_key *key1,
+		const struct sw_flow_key *key2,  int key_start, int key_len)
+{
+	return !memcmp((u8 *)key1 + key_start,
+			(u8 *)key2 + key_start, (key_len - key_start));
+}
+
+static bool __flow_cmp_key(const struct sw_flow *flow,
+		const struct sw_flow_key *key, int key_start, int key_len)
+{
+	return __cmp_key(&flow->key, key, key_start, key_len);
+}
+
+static bool __flow_cmp_unmasked_key(const struct sw_flow *flow,
+		  const struct sw_flow_key *key, int key_start, int key_len)
+{
+	return __cmp_key(&flow->unmasked_key, key, key_start, key_len);
+}
+
+bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
+		const struct sw_flow_key *key, int key_len)
+{
+	int key_start;
+	key_start = flow_key_start(key);
+
+	return __flow_cmp_unmasked_key(flow, key, key_start, key_len);
+
+}
+
+struct sw_flow *ovs_flow_lookup_unmasked_key(struct flow_table *table,
+				       struct sw_flow_match *match)
+{
+	struct sw_flow_key *unmasked = match->key;
+	int key_len = match->range.end;
+	struct sw_flow *flow;
+
+	flow = ovs_flow_lookup(table, unmasked);
+	if (flow && (!ovs_flow_cmp_unmasked_key(flow, unmasked, key_len)))
+		flow = NULL;
+
+	return flow;
+}
+
+static struct sw_flow *ovs_masked_flow_lookup(struct flow_table *table,
+				    const struct sw_flow_key *flow_key,
+				    struct sw_flow_mask *mask)
 {
 	struct sw_flow *flow;
 	struct hlist_head *head;
-	u8 *_key;
-	int key_start;
+	int key_start = mask->range.start;
+	int key_len = mask->range.end;
 	u32 hash;
+	struct sw_flow_key masked_key;
 
-	key_start = flow_key_start(key);
-	hash = ovs_flow_hash(key, key_start, key_len);
-
-	_key = (u8 *) key + key_start;
+	ovs_flow_key_mask(&masked_key, flow_key, mask);
+	hash = ovs_flow_hash(&masked_key, key_start, key_len);
 	head = find_bucket(table, hash);
 	hlist_for_each_entry_rcu(flow, head, hash_node[table->node_ver]) {
-
-		if (flow->hash == hash &&
-		    !memcmp((u8 *)&flow->key + key_start, _key, key_len - key_start)) {
+		if (flow->mask == mask &&
+		    __flow_cmp_key(flow, &masked_key, key_start, key_len))
 			return flow;
-		}
 	}
 	return NULL;
 }
 
-void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
-			 struct sw_flow_key *key, int key_len)
+struct sw_flow *ovs_flow_lookup(struct flow_table *tbl,
+				const struct sw_flow_key *key)
 {
-	flow->hash = ovs_flow_hash(key, flow_key_start(key), key_len);
-	memcpy(&flow->key, key, sizeof(flow->key));
-	__flow_tbl_insert(table, flow);
+	struct sw_flow *flow = NULL;
+	struct sw_flow_mask *mask;
+
+	list_for_each_entry_rcu(mask, tbl->mask_list, list) {
+		flow = ovs_masked_flow_lookup(tbl, key, mask);
+		if (flow)  /* Found */
+			break;
+	}
+
+	return flow;
 }
 
-void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
+
+void ovs_flow_insert(struct flow_table *table, struct sw_flow *flow)
+{
+	flow->hash = ovs_flow_hash(&flow->key, flow->mask->range.start,
+			flow->mask->range.end);
+	__tbl_insert(table, flow);
+}
+
+void ovs_flow_remove(struct flow_table *table, struct sw_flow *flow)
 {
 	BUG_ON(table->count == 0);
 	hlist_del_rcu(&flow->hash_node[table->node_ver]);
@@ -844,149 +1094,84 @@ const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
 	[OVS_KEY_ATTR_TUNNEL] = -1,
 };
 
-static int ipv4_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len,
-				  const struct nlattr *a[], u32 *attrs)
-{
-	const struct ovs_key_icmp *icmp_key;
-	const struct ovs_key_tcp *tcp_key;
-	const struct ovs_key_udp *udp_key;
-
-	switch (swkey->ip.proto) {
-	case IPPROTO_TCP:
-		if (!(*attrs & (1 << OVS_KEY_ATTR_TCP)))
-			return -EINVAL;
-		*attrs &= ~(1 << OVS_KEY_ATTR_TCP);
-
-		*key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
-		tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]);
-		swkey->ipv4.tp.src = tcp_key->tcp_src;
-		swkey->ipv4.tp.dst = tcp_key->tcp_dst;
-		break;
-
-	case IPPROTO_UDP:
-		if (!(*attrs & (1 << OVS_KEY_ATTR_UDP)))
-			return -EINVAL;
-		*attrs &= ~(1 << OVS_KEY_ATTR_UDP);
-
-		*key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
-		udp_key = nla_data(a[OVS_KEY_ATTR_UDP]);
-		swkey->ipv4.tp.src = udp_key->udp_src;
-		swkey->ipv4.tp.dst = udp_key->udp_dst;
-		break;
-
-	case IPPROTO_ICMP:
-		if (!(*attrs & (1 << OVS_KEY_ATTR_ICMP)))
-			return -EINVAL;
-		*attrs &= ~(1 << OVS_KEY_ATTR_ICMP);
-
-		*key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
-		icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]);
-		swkey->ipv4.tp.src = htons(icmp_key->icmp_type);
-		swkey->ipv4.tp.dst = htons(icmp_key->icmp_code);
-		break;
-	}
-
-	return 0;
-}
-
-static int ipv6_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len,
-				  const struct nlattr *a[], u32 *attrs)
+static bool is_all_zero(const u8 *fp, size_t size)
 {
-	const struct ovs_key_icmpv6 *icmpv6_key;
-	const struct ovs_key_tcp *tcp_key;
-	const struct ovs_key_udp *udp_key;
-
-	switch (swkey->ip.proto) {
-	case IPPROTO_TCP:
-		if (!(*attrs & (1 << OVS_KEY_ATTR_TCP)))
-			return -EINVAL;
-		*attrs &= ~(1 << OVS_KEY_ATTR_TCP);
-
-		*key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
-		tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]);
-		swkey->ipv6.tp.src = tcp_key->tcp_src;
-		swkey->ipv6.tp.dst = tcp_key->tcp_dst;
-		break;
-
-	case IPPROTO_UDP:
-		if (!(*attrs & (1 << OVS_KEY_ATTR_UDP)))
-			return -EINVAL;
-		*attrs &= ~(1 << OVS_KEY_ATTR_UDP);
-
-		*key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
-		udp_key = nla_data(a[OVS_KEY_ATTR_UDP]);
-		swkey->ipv6.tp.src = udp_key->udp_src;
-		swkey->ipv6.tp.dst = udp_key->udp_dst;
-		break;
-
-	case IPPROTO_ICMPV6:
-		if (!(*attrs & (1 << OVS_KEY_ATTR_ICMPV6)))
-			return -EINVAL;
-		*attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6);
-
-		*key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
-		icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]);
-		swkey->ipv6.tp.src = htons(icmpv6_key->icmpv6_type);
-		swkey->ipv6.tp.dst = htons(icmpv6_key->icmpv6_code);
+	int i;
 
-		if (swkey->ipv6.tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) ||
-		    swkey->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) {
-			const struct ovs_key_nd *nd_key;
+	if (!fp)
+		return false;
 
-			if (!(*attrs & (1 << OVS_KEY_ATTR_ND)))
-				return -EINVAL;
-			*attrs &= ~(1 << OVS_KEY_ATTR_ND);
-
-			*key_len = SW_FLOW_KEY_OFFSET(ipv6.nd);
-			nd_key = nla_data(a[OVS_KEY_ATTR_ND]);
-			memcpy(&swkey->ipv6.nd.target, nd_key->nd_target,
-			       sizeof(swkey->ipv6.nd.target));
-			memcpy(swkey->ipv6.nd.sll, nd_key->nd_sll, ETH_ALEN);
-			memcpy(swkey->ipv6.nd.tll, nd_key->nd_tll, ETH_ALEN);
-		}
-		break;
-	}
+	for (i = 0; i < size; i++)
+		if (fp[i])
+			return false;
 
-	return 0;
+	return true;
 }
 
-static int parse_flow_nlattrs(const struct nlattr *attr,
-			      const struct nlattr *a[], u32 *attrsp)
+static int __parse_flow_nlattrs(const struct nlattr *attr,
+			      const struct nlattr *a[],
+			      u64 *attrsp, bool nz)
 {
 	const struct nlattr *nla;
 	u32 attrs;
 	int rem;
 
-	attrs = 0;
+	attrs = *attrsp;
 	nla_for_each_nested(nla, attr, rem) {
 		u16 type = nla_type(nla);
 		int expected_len;
 
-		if (type > OVS_KEY_ATTR_MAX || attrs & (1 << type))
+		if (type > OVS_KEY_ATTR_MAX) {
+			OVS_NLERR("Unknown key attribute (type=%d, max=%d).\n",
+				  type, OVS_KEY_ATTR_MAX);
+		}
+
+		if (attrs & (1 << type)) {
+			OVS_NLERR("Duplicate key attribute (type %d).\n", type);
 			return -EINVAL;
+		}
 
 		expected_len = ovs_key_lens[type];
-		if (nla_len(nla) != expected_len && expected_len != -1)
+		if (nla_len(nla) != expected_len && expected_len != -1) {
+			OVS_NLERR("Key attribute has unexpected length (type=%d"
+				  ", length=%d, expected=%d).\n", type,
+				  nla_len(nla), expected_len);
 			return -EINVAL;
+		}
 
-		attrs |= 1 << type;
-		a[type] = nla;
+		if (!nz || !is_all_zero(nla_data(nla), expected_len)) {
+			attrs |= 1 << type;
+			a[type] = nla;
+		}
 	}
-	if (rem)
+	if (rem) {
+		OVS_NLERR("Message has %d unknown bytes.\n", rem);
 		return -EINVAL;
+	}
 
 	*attrsp = attrs;
 	return 0;
 }
 
+static int parse_flow_mask_nlattrs(const struct nlattr *attr,
+			      const struct nlattr *a[], u64 *attrsp)
+{
+	return __parse_flow_nlattrs(attr, a, attrsp, true);
+}
+
+static int parse_flow_nlattrs(const struct nlattr *attr,
+			      const struct nlattr *a[], u64 *attrsp)
+{
+	return __parse_flow_nlattrs(attr, a, attrsp, false);
+}
+
 int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr,
-			     struct ovs_key_ipv4_tunnel *tun_key)
+			     struct sw_flow_match *match, bool is_mask)
 {
 	struct nlattr *a;
 	int rem;
 	bool ttl = false;
-
-	memset(tun_key, 0, sizeof(*tun_key));
+	__be16 tun_flags = 0;
 
 	nla_for_each_nested(a, attr, rem) {
 		int type = nla_type(a);
@@ -1000,53 +1185,78 @@ int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr,
 			[OVS_TUNNEL_KEY_ATTR_CSUM] = 0,
 		};
 
-		if (type > OVS_TUNNEL_KEY_ATTR_MAX ||
-			ovs_tunnel_key_lens[type] != nla_len(a))
+		if (type > OVS_TUNNEL_KEY_ATTR_MAX) {
+			OVS_NLERR("Unknown IPv4 tunnel attribute (type=%d, max=%d).\n",
+			type, OVS_TUNNEL_KEY_ATTR_MAX);
 			return -EINVAL;
+		}
+
+		if (ovs_tunnel_key_lens[type] != nla_len(a)) {
+			OVS_NLERR("IPv4 tunnel attribute type has unexpected "
+				  " length (type=%d, length=%d, expected=%d).\n",
+				  type, nla_len(a), ovs_tunnel_key_lens[type]);
+			return -EINVAL;
+		}
 
 		switch (type) {
 		case OVS_TUNNEL_KEY_ATTR_ID:
-			tun_key->tun_id = nla_get_be64(a);
-			tun_key->tun_flags |= TUNNEL_KEY;
+			SW_FLOW_KEY_PUT(match, tun_key.tun_id,
+					nla_get_be64(a), is_mask);
+			tun_flags |= TUNNEL_KEY;
 			break;
 		case OVS_TUNNEL_KEY_ATTR_IPV4_SRC:
-			tun_key->ipv4_src = nla_get_be32(a);
+			SW_FLOW_KEY_PUT(match, tun_key.ipv4_src,
+					nla_get_be32(a), is_mask);
 			break;
 		case OVS_TUNNEL_KEY_ATTR_IPV4_DST:
-			tun_key->ipv4_dst = nla_get_be32(a);
+			SW_FLOW_KEY_PUT(match, tun_key.ipv4_dst,
+					nla_get_be32(a), is_mask);
 			break;
 		case OVS_TUNNEL_KEY_ATTR_TOS:
-			tun_key->ipv4_tos = nla_get_u8(a);
+			SW_FLOW_KEY_PUT(match, tun_key.ipv4_tos,
+					nla_get_u8(a), is_mask);
 			break;
 		case OVS_TUNNEL_KEY_ATTR_TTL:
-			tun_key->ipv4_ttl = nla_get_u8(a);
+			SW_FLOW_KEY_PUT(match, tun_key.ipv4_ttl,
+					nla_get_u8(a), is_mask);
 			ttl = true;
 			break;
 		case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT:
-			tun_key->tun_flags |= TUNNEL_DONT_FRAGMENT;
+			tun_flags |= TUNNEL_DONT_FRAGMENT;
 			break;
 		case OVS_TUNNEL_KEY_ATTR_CSUM:
-			tun_key->tun_flags |= TUNNEL_CSUM;
+			tun_flags |= TUNNEL_CSUM;
 			break;
 		default:
 			return -EINVAL;
-
 		}
 	}
-	if (rem > 0)
-		return -EINVAL;
 
-	if (!tun_key->ipv4_dst)
-		return -EINVAL;
+	SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask);
 
-	if (!ttl)
+	if (rem > 0) {
+		OVS_NLERR("IPv4 tunnel attribute has %d unknown bytes.\n", rem);
 		return -EINVAL;
+	}
+
+	if (!is_mask) {
+		if (!match->key->tun_key.ipv4_dst) {
+			OVS_NLERR("IPv4 tunnel destination address is zero.\n");
+			return -EINVAL;
+		}
+
+		if (!ttl) {
+			OVS_NLERR("IPv4 tunnel TTL not specified.\n");
+			return -EINVAL;
+		}
+	}
 
 	return 0;
 }
 
 int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb,
-			   const struct ovs_key_ipv4_tunnel *tun_key)
+			   const struct ovs_key_ipv4_tunnel *tun_key,
+			   const struct ovs_key_ipv4_tunnel *output)
 {
 	struct nlattr *nla;
 
@@ -1054,23 +1264,24 @@ int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb,
 	if (!nla)
 		return -EMSGSIZE;
 
-	if (tun_key->tun_flags & TUNNEL_KEY &&
-	    nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, tun_key->tun_id))
+	if (output->tun_flags & TUNNEL_KEY &&
+	    nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id))
 		return -EMSGSIZE;
-	if (tun_key->ipv4_src &&
-	    nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, tun_key->ipv4_src))
+	if (output->ipv4_src &&
+		nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src))
 		return -EMSGSIZE;
-	if (nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, tun_key->ipv4_dst))
+	if (output->ipv4_dst &&
+		nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst))
 		return -EMSGSIZE;
-	if (tun_key->ipv4_tos &&
-	    nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, tun_key->ipv4_tos))
+	if (output->ipv4_tos &&
+		nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos))
 		return -EMSGSIZE;
-	if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, tun_key->ipv4_ttl))
+	if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl))
 		return -EMSGSIZE;
-	if ((tun_key->tun_flags & TUNNEL_DONT_FRAGMENT) &&
+	if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) &&
 		nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT))
 		return -EMSGSIZE;
-	if ((tun_key->tun_flags & TUNNEL_CSUM) &&
+	if ((output->tun_flags & TUNNEL_CSUM) &&
 		nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM))
 		return -EMSGSIZE;
 
@@ -1078,176 +1289,372 @@ int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb,
 	return 0;
 }
 
-/**
- * ovs_flow_from_nlattrs - parses Netlink attributes into a flow key.
- * @swkey: receives the extracted flow key.
- * @key_lenp: number of bytes used in @swkey.
- * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
- * sequence.
- */
-int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
-		      const struct nlattr *attr)
+static int metadata_from_nlattrs(struct sw_flow_match *match,  u64 *attrs,
+		const struct nlattr **a, bool is_mask)
 {
-	const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
-	const struct ovs_key_ethernet *eth_key;
-	int key_len;
-	u32 attrs;
-	int err;
+	if (*attrs & (1 << OVS_KEY_ATTR_PRIORITY)) {
+		SW_FLOW_KEY_PUT(match, phy.priority,
+			  nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]), is_mask);
+		*attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY);
+	}
 
-	memset(swkey, 0, sizeof(struct sw_flow_key));
-	key_len = SW_FLOW_KEY_OFFSET(eth);
+	if (*attrs & (1 << OVS_KEY_ATTR_IN_PORT)) {
+		u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]);
 
-	err = parse_flow_nlattrs(attr, a, &attrs);
-	if (err)
-		return err;
+		if (is_mask)
+			in_port = 0xffffffff; /* Always exact match in_port. */
+		else if (in_port >= DP_MAX_PORTS)
+			return -EINVAL;
 
-	/* Metadata attributes. */
-	if (attrs & (1 << OVS_KEY_ATTR_PRIORITY)) {
-		swkey->phy.priority = nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]);
-		attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY);
+		SW_FLOW_KEY_PUT(match, phy.in_port, in_port, is_mask);
+		*attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT);
+	} else if (!is_mask) {
+		SW_FLOW_KEY_PUT(match, phy.in_port, DP_MAX_PORTS, is_mask);
 	}
-	if (attrs & (1 << OVS_KEY_ATTR_IN_PORT)) {
-		u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]);
-		if (in_port >= DP_MAX_PORTS)
-			return -EINVAL;
-		swkey->phy.in_port = in_port;
-		attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT);
-	} else {
-		swkey->phy.in_port = DP_MAX_PORTS;
+
+	if (*attrs & (1 << OVS_KEY_ATTR_SKB_MARK)) {
+		uint32_t mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]);
+
+		SW_FLOW_KEY_PUT(match, phy.skb_mark, mark, is_mask);
+		*attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK);
 	}
-	if (attrs & (1 << OVS_KEY_ATTR_SKB_MARK)) {
-		swkey->phy.skb_mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]);
-		attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK);
+	if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) {
+		if (ovs_ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match,
+					is_mask))
+			return -EINVAL;
+		*attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL);
 	}
+	return 0;
+}
 
-	if (attrs & (1 << OVS_KEY_ATTR_TUNNEL)) {
-		err = ovs_ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], &swkey->tun_key);
-		if (err)
-			return err;
+static int ovs_key_from_nlattrs(struct sw_flow_match *match,  u64 attrs,
+		const struct nlattr **a, bool is_mask)
+{
+	int err;
+	u64 orig_attrs = attrs;
 
-		attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL);
-	}
+	err = metadata_from_nlattrs(match, &attrs, a, is_mask);
+	if (err)
+		return err;
 
-	/* Data attributes. */
-	if (!(attrs & (1 << OVS_KEY_ATTR_ETHERNET)))
-		return -EINVAL;
-	attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET);
+	if (attrs & (1 << OVS_KEY_ATTR_ETHERNET)) {
+		const struct ovs_key_ethernet *eth_key;
 
-	eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]);
-	memcpy(swkey->eth.src, eth_key->eth_src, ETH_ALEN);
-	memcpy(swkey->eth.dst, eth_key->eth_dst, ETH_ALEN);
+		eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]);
+		SW_FLOW_KEY_MEMCPY(match, eth.src,
+				eth_key->eth_src, ETH_ALEN, is_mask);
+		SW_FLOW_KEY_MEMCPY(match, eth.dst,
+				eth_key->eth_dst, ETH_ALEN, is_mask);
+		attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET);
+	}
 
-	if (attrs & (1u << OVS_KEY_ATTR_ETHERTYPE) &&
-	    nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q)) {
-		const struct nlattr *encap;
+	if (attrs & (1 << OVS_KEY_ATTR_VLAN)) {
 		__be16 tci;
 
-		if (attrs != ((1 << OVS_KEY_ATTR_VLAN) |
-			      (1 << OVS_KEY_ATTR_ETHERTYPE) |
-			      (1 << OVS_KEY_ATTR_ENCAP)))
-			return -EINVAL;
-
-		encap = a[OVS_KEY_ATTR_ENCAP];
 		tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
-		if (tci & htons(VLAN_TAG_PRESENT)) {
-			swkey->eth.tci = tci;
-
-			err = parse_flow_nlattrs(encap, a, &attrs);
-			if (err)
-				return err;
-		} else if (!tci) {
-			/* Corner case for truncated 802.1Q header. */
-			if (nla_len(encap))
-				return -EINVAL;
+		if (!(tci & htons(VLAN_TAG_PRESENT))) {
+			if (is_mask)
+				OVS_NLERR("VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit.\n");
+			else
+				OVS_NLERR("VLAN TCI does not have VLAN_TAG_PRESENT bit set.\n");
 
-			swkey->eth.type = htons(ETH_P_8021Q);
-			*key_lenp = key_len;
-			return 0;
-		} else {
 			return -EINVAL;
 		}
-	}
+
+		SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask);
+		attrs &= ~(1 << OVS_KEY_ATTR_VLAN);
+	} else if (!is_mask)
+		SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true);
 
 	if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) {
-		swkey->eth.type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
-		if (ntohs(swkey->eth.type) < ETH_P_802_3_MIN)
+		__be16 eth_type;
+
+		eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
+		if (is_mask) {
+			/* Always exact match EtherType. */
+			eth_type = htons(0xffff);
+		} else if (ntohs(eth_type) < ETH_P_802_3_MIN) {
+			OVS_NLERR("EtherType is less than minimum (type=%x, min=%x).\n",
+					ntohs(eth_type), ETH_P_802_3_MIN);
 			return -EINVAL;
+		}
+
+		SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask);
 		attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
-	} else {
-		swkey->eth.type = htons(ETH_P_802_2);
+	} else if (!is_mask) {
+		SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask);
 	}
 
-	if (swkey->eth.type == htons(ETH_P_IP)) {
+	if (attrs & (1 << OVS_KEY_ATTR_IPV4)) {
 		const struct ovs_key_ipv4 *ipv4_key;
 
-		if (!(attrs & (1 << OVS_KEY_ATTR_IPV4)))
-			return -EINVAL;
-		attrs &= ~(1 << OVS_KEY_ATTR_IPV4);
-
-		key_len = SW_FLOW_KEY_OFFSET(ipv4.addr);
 		ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]);
-		if (ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX)
+		if (!is_mask && ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) {
+			OVS_NLERR("Unknown IPv4 fragment type (value=%d, max=%d).\n",
+				ipv4_key->ipv4_frag, OVS_FRAG_TYPE_MAX);
 			return -EINVAL;
-		swkey->ip.proto = ipv4_key->ipv4_proto;
-		swkey->ip.tos = ipv4_key->ipv4_tos;
-		swkey->ip.ttl = ipv4_key->ipv4_ttl;
-		swkey->ip.frag = ipv4_key->ipv4_frag;
-		swkey->ipv4.addr.src = ipv4_key->ipv4_src;
-		swkey->ipv4.addr.dst = ipv4_key->ipv4_dst;
-
-		if (swkey->ip.frag != OVS_FRAG_TYPE_LATER) {
-			err = ipv4_flow_from_nlattrs(swkey, &key_len, a, &attrs);
-			if (err)
-				return err;
 		}
-	} else if (swkey->eth.type == htons(ETH_P_IPV6)) {
-		const struct ovs_key_ipv6 *ipv6_key;
+		SW_FLOW_KEY_PUT(match, ip.proto,
+				ipv4_key->ipv4_proto, is_mask);
+		SW_FLOW_KEY_PUT(match, ip.tos,
+				ipv4_key->ipv4_tos, is_mask);
+		SW_FLOW_KEY_PUT(match, ip.ttl,
+				ipv4_key->ipv4_ttl, is_mask);
+		SW_FLOW_KEY_PUT(match, ip.frag,
+				ipv4_key->ipv4_frag, is_mask);
+		SW_FLOW_KEY_PUT(match, ipv4.addr.src,
+				ipv4_key->ipv4_src, is_mask);
+		SW_FLOW_KEY_PUT(match, ipv4.addr.dst,
+				ipv4_key->ipv4_dst, is_mask);
+		attrs &= ~(1 << OVS_KEY_ATTR_IPV4);
+	}
 
-		if (!(attrs & (1 << OVS_KEY_ATTR_IPV6)))
-			return -EINVAL;
-		attrs &= ~(1 << OVS_KEY_ATTR_IPV6);
+	if (attrs & (1 << OVS_KEY_ATTR_IPV6)) {
+		const struct ovs_key_ipv6 *ipv6_key;
 
-		key_len = SW_FLOW_KEY_OFFSET(ipv6.label);
 		ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]);
-		if (ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX)
+		if (!is_mask && ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) {
+			OVS_NLERR("Unknown IPv6 fragment type (value=%d, max=%d).\n",
+				ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX);
 			return -EINVAL;
-		swkey->ipv6.label = ipv6_key->ipv6_label;
-		swkey->ip.proto = ipv6_key->ipv6_proto;
-		swkey->ip.tos = ipv6_key->ipv6_tclass;
-		swkey->ip.ttl = ipv6_key->ipv6_hlimit;
-		swkey->ip.frag = ipv6_key->ipv6_frag;
-		memcpy(&swkey->ipv6.addr.src, ipv6_key->ipv6_src,
-		       sizeof(swkey->ipv6.addr.src));
-		memcpy(&swkey->ipv6.addr.dst, ipv6_key->ipv6_dst,
-		       sizeof(swkey->ipv6.addr.dst));
-
-		if (swkey->ip.frag != OVS_FRAG_TYPE_LATER) {
-			err = ipv6_flow_from_nlattrs(swkey, &key_len, a, &attrs);
-			if (err)
-				return err;
 		}
-	} else if (swkey->eth.type == htons(ETH_P_ARP) ||
-		   swkey->eth.type == htons(ETH_P_RARP)) {
+		SW_FLOW_KEY_PUT(match, ipv6.label,
+				ipv6_key->ipv6_label, is_mask);
+		SW_FLOW_KEY_PUT(match, ip.proto,
+				ipv6_key->ipv6_proto, is_mask);
+		SW_FLOW_KEY_PUT(match, ip.tos,
+				ipv6_key->ipv6_tclass, is_mask);
+		SW_FLOW_KEY_PUT(match, ip.ttl,
+				ipv6_key->ipv6_hlimit, is_mask);
+		SW_FLOW_KEY_PUT(match, ip.frag,
+				ipv6_key->ipv6_frag, is_mask);
+		SW_FLOW_KEY_MEMCPY(match, ipv6.addr.src,
+				ipv6_key->ipv6_src,
+				sizeof(match->key->ipv6.addr.src),
+				is_mask);
+		SW_FLOW_KEY_MEMCPY(match, ipv6.addr.dst,
+				ipv6_key->ipv6_dst,
+				sizeof(match->key->ipv6.addr.dst),
+				is_mask);
+
+		attrs &= ~(1 << OVS_KEY_ATTR_IPV6);
+	}
+
+	if (attrs & (1 << OVS_KEY_ATTR_ARP)) {
 		const struct ovs_key_arp *arp_key;
 
-		if (!(attrs & (1 << OVS_KEY_ATTR_ARP)))
+		arp_key = nla_data(a[OVS_KEY_ATTR_ARP]);
+		if (!is_mask && (arp_key->arp_op & htons(0xff00))) {
+			OVS_NLERR("Unknown ARP opcode (opcode=%d).\n",
+				  arp_key->arp_op);
 			return -EINVAL;
+		}
+
+		SW_FLOW_KEY_PUT(match, ipv4.addr.src,
+				arp_key->arp_sip, is_mask);
+		SW_FLOW_KEY_PUT(match, ipv4.addr.dst,
+			arp_key->arp_tip, is_mask);
+		SW_FLOW_KEY_PUT(match, ip.proto,
+				ntohs(arp_key->arp_op), is_mask);
+		SW_FLOW_KEY_MEMCPY(match, ipv4.arp.sha,
+				arp_key->arp_sha, ETH_ALEN, is_mask);
+		SW_FLOW_KEY_MEMCPY(match, ipv4.arp.tha,
+				arp_key->arp_tha, ETH_ALEN, is_mask);
+
 		attrs &= ~(1 << OVS_KEY_ATTR_ARP);
+	}
 
-		key_len = SW_FLOW_KEY_OFFSET(ipv4.arp);
-		arp_key = nla_data(a[OVS_KEY_ATTR_ARP]);
-		swkey->ipv4.addr.src = arp_key->arp_sip;
-		swkey->ipv4.addr.dst = arp_key->arp_tip;
-		if (arp_key->arp_op & htons(0xff00))
+	if (attrs & (1 << OVS_KEY_ATTR_TCP)) {
+		const struct ovs_key_tcp *tcp_key;
+
+		tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]);
+		if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) {
+			SW_FLOW_KEY_PUT(match, ipv4.tp.src,
+					tcp_key->tcp_src, is_mask);
+			SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
+					tcp_key->tcp_dst, is_mask);
+		} else {
+			SW_FLOW_KEY_PUT(match, ipv6.tp.src,
+					tcp_key->tcp_src, is_mask);
+			SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
+					tcp_key->tcp_dst, is_mask);
+		}
+		attrs &= ~(1 << OVS_KEY_ATTR_TCP);
+	}
+
+	if (attrs & (1 << OVS_KEY_ATTR_UDP)) {
+		const struct ovs_key_udp *udp_key;
+
+		udp_key = nla_data(a[OVS_KEY_ATTR_UDP]);
+		if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) {
+			SW_FLOW_KEY_PUT(match, ipv4.tp.src,
+					udp_key->udp_src, is_mask);
+			SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
+					udp_key->udp_dst, is_mask);
+		} else {
+			SW_FLOW_KEY_PUT(match, ipv6.tp.src,
+					udp_key->udp_src, is_mask);
+			SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
+					udp_key->udp_dst, is_mask);
+		}
+		attrs &= ~(1 << OVS_KEY_ATTR_UDP);
+	}
+
+	if (attrs & (1 << OVS_KEY_ATTR_ICMP)) {
+		const struct ovs_key_icmp *icmp_key;
+
+		icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]);
+		SW_FLOW_KEY_PUT(match, ipv4.tp.src,
+				htons(icmp_key->icmp_type), is_mask);
+		SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
+				htons(icmp_key->icmp_code), is_mask);
+		attrs &= ~(1 << OVS_KEY_ATTR_ICMP);
+	}
+
+	if (attrs & (1 << OVS_KEY_ATTR_ICMPV6)) {
+		const struct ovs_key_icmpv6 *icmpv6_key;
+
+		icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]);
+		SW_FLOW_KEY_PUT(match, ipv6.tp.src,
+				htons(icmpv6_key->icmpv6_type), is_mask);
+		SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
+				htons(icmpv6_key->icmpv6_code), is_mask);
+		attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6);
+	}
+
+	if (attrs & (1 << OVS_KEY_ATTR_ND)) {
+		const struct ovs_key_nd *nd_key;
+
+		nd_key = nla_data(a[OVS_KEY_ATTR_ND]);
+		SW_FLOW_KEY_MEMCPY(match, ipv6.nd.target,
+			nd_key->nd_target,
+			sizeof(match->key->ipv6.nd.target),
+			is_mask);
+		SW_FLOW_KEY_MEMCPY(match, ipv6.nd.sll,
+			nd_key->nd_sll, ETH_ALEN, is_mask);
+		SW_FLOW_KEY_MEMCPY(match, ipv6.nd.tll,
+				nd_key->nd_tll, ETH_ALEN, is_mask);
+		attrs &= ~(1 << OVS_KEY_ATTR_ND);
+	}
+
+	if (attrs != 0)
+		return -EINVAL;
+
+	return 0;
+}
+
+/**
+ * ovs_match_from_nlattrs - parses Netlink attributes into a flow key and
+ * mask. In case the 'mask' is NULL, the flow is treated as exact match
+ * flow. Otherwise, it is treated as a wildcarded flow, except the mask
+ * does not include any don't care bit.
+ * @match: receives the extracted flow match information.
+ * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
+ * sequence. The fields should of the packet that triggered the creation
+ * of this flow.
+ * @mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink
+ * attribute specifies the mask field of the wildcarded flow.
+ */
+int ovs_match_from_nlattrs(struct sw_flow_match *match,
+			   const struct nlattr *key,
+			   const struct nlattr *mask)
+{
+	const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
+	const struct nlattr *encap;
+	u64 key_attrs = 0;
+	u64 mask_attrs = 0;
+	bool encap_valid = false;
+	int err;
+
+	err = parse_flow_nlattrs(key, a, &key_attrs);
+	if (err)
+		return err;
+
+	if ((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) &&
+	    (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) &&
+	    (nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q))) {
+		__be16 tci;
+
+		if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) &&
+		      (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) {
+			OVS_NLERR("Invalid Vlan frame.\n");
 			return -EINVAL;
-		swkey->ip.proto = ntohs(arp_key->arp_op);
-		memcpy(swkey->ipv4.arp.sha, arp_key->arp_sha, ETH_ALEN);
-		memcpy(swkey->ipv4.arp.tha, arp_key->arp_tha, ETH_ALEN);
+		}
+
+		key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
+		tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
+		encap = a[OVS_KEY_ATTR_ENCAP];
+		key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP);
+		encap_valid = true;
+
+		if (tci & htons(VLAN_TAG_PRESENT)) {
+			err = parse_flow_nlattrs(encap, a, &key_attrs);
+			if (err)
+				return err;
+		} else if (!tci) {
+			/* Corner case for truncated 802.1Q header. */
+			if (nla_len(encap)) {
+				OVS_NLERR("Truncated 802.1Q header has non-zero encap attribute.\n");
+				return -EINVAL;
+			}
+		} else {
+			OVS_NLERR("Encap attribute is set for a non-VLAN frame.\n");
+			return  -EINVAL;
+		}
 	}
 
-	if (attrs)
+	err = ovs_key_from_nlattrs(match, key_attrs, a, false);
+	if (err)
+		return err;
+
+	if (mask) {
+		err = parse_flow_mask_nlattrs(mask, a, &mask_attrs);
+		if (err)
+			return err;
+
+		if (mask_attrs & 1ULL << OVS_KEY_ATTR_ENCAP)  {
+			__be16 eth_type = 0;
+			__be16 tci = 0;
+
+			if (!encap_valid) {
+				OVS_NLERR("Encap mask attribute is set for non-VLAN frame.\n");
+				return  -EINVAL;
+			}
+
+			mask_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP);
+			if (a[OVS_KEY_ATTR_ETHERTYPE])
+				eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
+
+			if (eth_type == htons(0xffff)) {
+				mask_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
+				encap = a[OVS_KEY_ATTR_ENCAP];
+				err = parse_flow_mask_nlattrs(encap, a, &mask_attrs);
+			} else {
+				OVS_NLERR("VLAN frames must have an exact match on the TPID (mask=%x).\n",
+						ntohs(eth_type));
+				return -EINVAL;
+			}
+
+			if (a[OVS_KEY_ATTR_VLAN])
+				tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
+
+			if (!(tci & htons(VLAN_TAG_PRESENT))) {
+				OVS_NLERR("VLAN tag present bit must have an exact match (tci_mask=%x).\n", ntohs(tci));
+				return -EINVAL;
+			}
+		}
+
+		err = ovs_key_from_nlattrs(match, mask_attrs, a, true);
+		if (err)
+			return err;
+	} else {
+		/* Populate exact match flow's key mask. */
+		if (match->mask)
+			ovs_sw_flow_mask_set(match->mask, &match->range, 0xff);
+	}
+
+	if (!ovs_match_validate(match, key_attrs, mask_attrs))
 		return -EINVAL;
-	*key_lenp = key_len;
 
 	return 0;
 }
@@ -1255,7 +1662,6 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
 /**
  * ovs_flow_metadata_from_nlattrs - parses Netlink attributes into a flow key.
  * @flow: Receives extracted in_port, priority, tun_key and skb_mark.
- * @key_len: Length of key in @flow.  Used for calculating flow hash.
  * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
  * sequence.
  *
@@ -1264,102 +1670,100 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
  * get the metadata, that is, the parts of the flow key that cannot be
  * extracted from the packet itself.
  */
-int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, int key_len,
-				   const struct nlattr *attr)
+
+int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow,
+		const struct nlattr *attr)
 {
 	struct ovs_key_ipv4_tunnel *tun_key = &flow->key.tun_key;
-	const struct nlattr *nla;
-	int rem;
+	const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
+	u64 attrs = 0;
+	int err;
+	struct sw_flow_match match;
 
 	flow->key.phy.in_port = DP_MAX_PORTS;
 	flow->key.phy.priority = 0;
 	flow->key.phy.skb_mark = 0;
 	memset(tun_key, 0, sizeof(flow->key.tun_key));
 
-	nla_for_each_nested(nla, attr, rem) {
-		int type = nla_type(nla);
-
-		if (type <= OVS_KEY_ATTR_MAX && ovs_key_lens[type] > 0) {
-			int err;
-
-			if (nla_len(nla) != ovs_key_lens[type])
-				return -EINVAL;
-
-			switch (type) {
-			case OVS_KEY_ATTR_PRIORITY:
-				flow->key.phy.priority = nla_get_u32(nla);
-				break;
-
-			case OVS_KEY_ATTR_TUNNEL:
-				err = ovs_ipv4_tun_from_nlattr(nla, tun_key);
-				if (err)
-					return err;
-				break;
-
-			case OVS_KEY_ATTR_IN_PORT:
-				if (nla_get_u32(nla) >= DP_MAX_PORTS)
-					return -EINVAL;
-				flow->key.phy.in_port = nla_get_u32(nla);
-				break;
-
-			case OVS_KEY_ATTR_SKB_MARK:
-				flow->key.phy.skb_mark = nla_get_u32(nla);
-				break;
-			}
-		}
-	}
-	if (rem)
+	err = parse_flow_nlattrs(attr, a, &attrs);
+	if (err)
 		return -EINVAL;
 
-	flow->hash = ovs_flow_hash(&flow->key,
-				   flow_key_start(&flow->key), key_len);
+	memset(&match, 0, sizeof(match));
+	match.key = &flow->key;
+
+	err = metadata_from_nlattrs(&match, &attrs, a, false);
+	if (err)
+		return err;
 
 	return 0;
 }
 
-int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
+int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey,
+		const struct sw_flow_key *output, struct sk_buff *skb)
 {
 	struct ovs_key_ethernet *eth_key;
 	struct nlattr *nla, *encap;
+	bool is_mask = (swkey != output);
 
-	if (swkey->phy.priority &&
-	    nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, swkey->phy.priority))
+	if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority))
 		goto nla_put_failure;
 
-	if (swkey->tun_key.ipv4_dst &&
-	    ovs_ipv4_tun_to_nlattr(skb, &swkey->tun_key))
+	if ((swkey->tun_key.ipv4_dst || is_mask) &&
+	    ovs_ipv4_tun_to_nlattr(skb, &swkey->tun_key, &output->tun_key))
 		goto nla_put_failure;
 
-	if (swkey->phy.in_port != DP_MAX_PORTS &&
-	    nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, swkey->phy.in_port))
-		goto nla_put_failure;
+	if (swkey->phy.in_port == DP_MAX_PORTS) {
+		if (is_mask && (output->phy.in_port == 0xffff))
+			if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, 0xffffffff))
+				goto nla_put_failure;
+	} else {
+		u16 upper_u16;
+		upper_u16 = !is_mask ? 0 : 0xffff;
 
-	if (swkey->phy.skb_mark &&
-	    nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, swkey->phy.skb_mark))
+		if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT,
+				(upper_u16 << 16) | output->phy.in_port))
+			goto nla_put_failure;
+	}
+
+	if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark))
 		goto nla_put_failure;
 
 	nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key));
 	if (!nla)
 		goto nla_put_failure;
+
 	eth_key = nla_data(nla);
-	memcpy(eth_key->eth_src, swkey->eth.src, ETH_ALEN);
-	memcpy(eth_key->eth_dst, swkey->eth.dst, ETH_ALEN);
+	memcpy(eth_key->eth_src, output->eth.src, ETH_ALEN);
+	memcpy(eth_key->eth_dst, output->eth.dst, ETH_ALEN);
 
 	if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) {
-		if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, htons(ETH_P_8021Q)) ||
-		    nla_put_be16(skb, OVS_KEY_ATTR_VLAN, swkey->eth.tci))
+		__be16 eth_type;
+		eth_type = !is_mask ? htons(ETH_P_8021Q) : htons(0xffff);
+		if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) ||
+		    nla_put_be16(skb, OVS_KEY_ATTR_VLAN, output->eth.tci))
 			goto nla_put_failure;
 		encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
 		if (!swkey->eth.tci)
 			goto unencap;
-	} else {
+	} else
 		encap = NULL;
-	}
 
-	if (swkey->eth.type == htons(ETH_P_802_2))
+	if (swkey->eth.type == htons(ETH_P_802_2)) {
+		/*
+		 * Ethertype 802.2 is represented in the netlink with omitted
+		 * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and
+		 * 0xffff in the mask attribute.  Ethertype can also
+		 * be wildcarded.
+		 */
+		if (is_mask && output->eth.type)
+			if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE,
+						output->eth.type))
+				goto nla_put_failure;
 		goto unencap;
+	}
 
-	if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, swkey->eth.type))
+	if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type))
 		goto nla_put_failure;
 
 	if (swkey->eth.type == htons(ETH_P_IP)) {
@@ -1369,12 +1773,12 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
 		if (!nla)
 			goto nla_put_failure;
 		ipv4_key = nla_data(nla);
-		ipv4_key->ipv4_src = swkey->ipv4.addr.src;
-		ipv4_key->ipv4_dst = swkey->ipv4.addr.dst;
-		ipv4_key->ipv4_proto = swkey->ip.proto;
-		ipv4_key->ipv4_tos = swkey->ip.tos;
-		ipv4_key->ipv4_ttl = swkey->ip.ttl;
-		ipv4_key->ipv4_frag = swkey->ip.frag;
+		ipv4_key->ipv4_src = output->ipv4.addr.src;
+		ipv4_key->ipv4_dst = output->ipv4.addr.dst;
+		ipv4_key->ipv4_proto = output->ip.proto;
+		ipv4_key->ipv4_tos = output->ip.tos;
+		ipv4_key->ipv4_ttl = output->ip.ttl;
+		ipv4_key->ipv4_frag = output->ip.frag;
 	} else if (swkey->eth.type == htons(ETH_P_IPV6)) {
 		struct ovs_key_ipv6 *ipv6_key;
 
@@ -1382,15 +1786,15 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
 		if (!nla)
 			goto nla_put_failure;
 		ipv6_key = nla_data(nla);
-		memcpy(ipv6_key->ipv6_src, &swkey->ipv6.addr.src,
+		memcpy(ipv6_key->ipv6_src, &output->ipv6.addr.src,
 				sizeof(ipv6_key->ipv6_src));
-		memcpy(ipv6_key->ipv6_dst, &swkey->ipv6.addr.dst,
+		memcpy(ipv6_key->ipv6_dst, &output->ipv6.addr.dst,
 				sizeof(ipv6_key->ipv6_dst));
-		ipv6_key->ipv6_label = swkey->ipv6.label;
-		ipv6_key->ipv6_proto = swkey->ip.proto;
-		ipv6_key->ipv6_tclass = swkey->ip.tos;
-		ipv6_key->ipv6_hlimit = swkey->ip.ttl;
-		ipv6_key->ipv6_frag = swkey->ip.frag;
+		ipv6_key->ipv6_label = output->ipv6.label;
+		ipv6_key->ipv6_proto = output->ip.proto;
+		ipv6_key->ipv6_tclass = output->ip.tos;
+		ipv6_key->ipv6_hlimit = output->ip.ttl;
+		ipv6_key->ipv6_frag = output->ip.frag;
 	} else if (swkey->eth.type == htons(ETH_P_ARP) ||
 		   swkey->eth.type == htons(ETH_P_RARP)) {
 		struct ovs_key_arp *arp_key;
@@ -1400,11 +1804,11 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
 			goto nla_put_failure;
 		arp_key = nla_data(nla);
 		memset(arp_key, 0, sizeof(struct ovs_key_arp));
-		arp_key->arp_sip = swkey->ipv4.addr.src;
-		arp_key->arp_tip = swkey->ipv4.addr.dst;
-		arp_key->arp_op = htons(swkey->ip.proto);
-		memcpy(arp_key->arp_sha, swkey->ipv4.arp.sha, ETH_ALEN);
-		memcpy(arp_key->arp_tha, swkey->ipv4.arp.tha, ETH_ALEN);
+		arp_key->arp_sip = output->ipv4.addr.src;
+		arp_key->arp_tip = output->ipv4.addr.dst;
+		arp_key->arp_op = htons(output->ip.proto);
+		memcpy(arp_key->arp_sha, output->ipv4.arp.sha, ETH_ALEN);
+		memcpy(arp_key->arp_tha, output->ipv4.arp.tha, ETH_ALEN);
 	}
 
 	if ((swkey->eth.type == htons(ETH_P_IP) ||
@@ -1419,11 +1823,11 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
 				goto nla_put_failure;
 			tcp_key = nla_data(nla);
 			if (swkey->eth.type == htons(ETH_P_IP)) {
-				tcp_key->tcp_src = swkey->ipv4.tp.src;
-				tcp_key->tcp_dst = swkey->ipv4.tp.dst;
+				tcp_key->tcp_src = output->ipv4.tp.src;
+				tcp_key->tcp_dst = output->ipv4.tp.dst;
 			} else if (swkey->eth.type == htons(ETH_P_IPV6)) {
-				tcp_key->tcp_src = swkey->ipv6.tp.src;
-				tcp_key->tcp_dst = swkey->ipv6.tp.dst;
+				tcp_key->tcp_src = output->ipv6.tp.src;
+				tcp_key->tcp_dst = output->ipv6.tp.dst;
 			}
 		} else if (swkey->ip.proto == IPPROTO_UDP) {
 			struct ovs_key_udp *udp_key;
@@ -1433,11 +1837,11 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
 				goto nla_put_failure;
 			udp_key = nla_data(nla);
 			if (swkey->eth.type == htons(ETH_P_IP)) {
-				udp_key->udp_src = swkey->ipv4.tp.src;
-				udp_key->udp_dst = swkey->ipv4.tp.dst;
+				udp_key->udp_src = output->ipv4.tp.src;
+				udp_key->udp_dst = output->ipv4.tp.dst;
 			} else if (swkey->eth.type == htons(ETH_P_IPV6)) {
-				udp_key->udp_src = swkey->ipv6.tp.src;
-				udp_key->udp_dst = swkey->ipv6.tp.dst;
+				udp_key->udp_src = output->ipv6.tp.src;
+				udp_key->udp_dst = output->ipv6.tp.dst;
 			}
 		} else if (swkey->eth.type == htons(ETH_P_IP) &&
 			   swkey->ip.proto == IPPROTO_ICMP) {
@@ -1447,8 +1851,8 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
 			if (!nla)
 				goto nla_put_failure;
 			icmp_key = nla_data(nla);
-			icmp_key->icmp_type = ntohs(swkey->ipv4.tp.src);
-			icmp_key->icmp_code = ntohs(swkey->ipv4.tp.dst);
+			icmp_key->icmp_type = ntohs(output->ipv4.tp.src);
+			icmp_key->icmp_code = ntohs(output->ipv4.tp.dst);
 		} else if (swkey->eth.type == htons(ETH_P_IPV6) &&
 			   swkey->ip.proto == IPPROTO_ICMPV6) {
 			struct ovs_key_icmpv6 *icmpv6_key;
@@ -1458,8 +1862,8 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
 			if (!nla)
 				goto nla_put_failure;
 			icmpv6_key = nla_data(nla);
-			icmpv6_key->icmpv6_type = ntohs(swkey->ipv6.tp.src);
-			icmpv6_key->icmpv6_code = ntohs(swkey->ipv6.tp.dst);
+			icmpv6_key->icmpv6_type = ntohs(output->ipv6.tp.src);
+			icmpv6_key->icmpv6_code = ntohs(output->ipv6.tp.dst);
 
 			if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION ||
 			    icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) {
@@ -1469,10 +1873,10 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
 				if (!nla)
 					goto nla_put_failure;
 				nd_key = nla_data(nla);
-				memcpy(nd_key->nd_target, &swkey->ipv6.nd.target,
+				memcpy(nd_key->nd_target, &output->ipv6.nd.target,
 							sizeof(nd_key->nd_target));
-				memcpy(nd_key->nd_sll, swkey->ipv6.nd.sll, ETH_ALEN);
-				memcpy(nd_key->nd_tll, swkey->ipv6.nd.tll, ETH_ALEN);
+				memcpy(nd_key->nd_sll, output->ipv6.nd.sll, ETH_ALEN);
+				memcpy(nd_key->nd_tll, output->ipv6.nd.tll, ETH_ALEN);
 			}
 		}
 	}
@@ -1504,3 +1908,84 @@ void ovs_flow_exit(void)
 {
 	kmem_cache_destroy(flow_cache);
 }
+
+struct sw_flow_mask *ovs_sw_flow_mask_alloc(void)
+{
+	struct sw_flow_mask *mask;
+
+	mask = kmalloc(sizeof(*mask), GFP_KERNEL);
+	if (mask)
+		mask->ref_count = 0;
+
+	return mask;
+}
+
+void ovs_sw_flow_mask_add_ref(struct sw_flow_mask *mask)
+{
+	mask->ref_count++;
+}
+
+void ovs_sw_flow_mask_del_ref(struct sw_flow_mask *mask, bool deferred)
+{
+	if (!mask)
+		return;
+
+	BUG_ON(!mask->ref_count);
+	mask->ref_count--;
+
+	if (!mask->ref_count) {
+		list_del_rcu(&mask->list);
+		if (deferred)
+			kfree_rcu(mask, rcu);
+		else
+			kfree(mask);
+	}
+}
+
+static bool ovs_sw_flow_mask_equal(const struct sw_flow_mask *a,
+		const struct sw_flow_mask *b)
+{
+	u8 *a_ = (u8 *)&a->key + a->range.start;
+	u8 *b_ = (u8 *)&b->key + b->range.start;
+
+	return  (a->range.end == b->range.end)
+		&& (a->range.start == b->range.start)
+		&& (memcmp(a_, b_, ovs_sw_flow_mask_actual_size(a)) == 0);
+}
+
+struct sw_flow_mask *ovs_sw_flow_mask_find(const struct flow_table *tbl,
+                                           const struct sw_flow_mask *mask)
+{
+	struct list_head *ml;
+
+	list_for_each(ml, tbl->mask_list) {
+		struct sw_flow_mask *m;
+		m = container_of(ml, struct sw_flow_mask, list);
+		if (ovs_sw_flow_mask_equal(mask, m))
+			return m;
+	}
+
+	return NULL;
+}
+
+/**
+ * add a new mask into the mask list.
+ * The caller needs to make sure that 'mask' is not the same
+ * as any masks that are already on the list.
+ */
+void ovs_sw_flow_mask_insert(struct flow_table *tbl, struct sw_flow_mask *mask)
+{
+	list_add_rcu(&mask->list, tbl->mask_list);
+}
+
+/**
+ * Set 'range' fields in the mask to the value of 'val'.
+ */
+static void ovs_sw_flow_mask_set(struct sw_flow_mask *mask,
+		struct sw_flow_key_range *range, u8 val)
+{
+	u8 *m = (u8 *)&mask->key + range->start;
+
+	mask->range = *range;
+	memset(m, val, ovs_sw_flow_mask_size_roundup(mask));
+}
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 66ef7220293e..9674e45f6969 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2011 Nicira, Inc.
+ * Copyright (c) 2007-2013 Nicira, Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -33,6 +33,8 @@
 #include <net/inet_ecn.h>
 
 struct sk_buff;
+struct sw_flow_mask;
+struct flow_table;
 
 struct sw_flow_actions {
 	struct rcu_head rcu;
@@ -131,6 +133,8 @@ struct sw_flow {
 	u32 hash;
 
 	struct sw_flow_key key;
+	struct sw_flow_key unmasked_key;
+	struct sw_flow_mask *mask;
 	struct sw_flow_actions __rcu *sf_acts;
 
 	spinlock_t lock;	/* Lock for values below. */
@@ -140,6 +144,25 @@ struct sw_flow {
 	u8 tcp_flags;		/* Union of seen TCP flags. */
 };
 
+struct sw_flow_key_range {
+	size_t start;
+	size_t end;
+};
+
+static inline u16 ovs_sw_flow_key_range_actual_size(const struct sw_flow_key_range *range)
+{
+	return range->end - range->start;
+}
+
+struct sw_flow_match {
+	struct sw_flow_key *key;
+	struct sw_flow_key_range range;
+	struct sw_flow_mask *mask;
+};
+
+void ovs_match_init(struct sw_flow_match *match,
+		struct sw_flow_key *key, struct sw_flow_mask *mask);
+
 struct arp_eth_header {
 	__be16      ar_hrd;	/* format of hardware address   */
 	__be16      ar_pro;	/* format of protocol address   */
@@ -159,21 +182,21 @@ void ovs_flow_exit(void);
 
 struct sw_flow *ovs_flow_alloc(void);
 void ovs_flow_deferred_free(struct sw_flow *);
-void ovs_flow_free(struct sw_flow *flow);
+void ovs_flow_free(struct sw_flow *, bool deferred);
 
 struct sw_flow_actions *ovs_flow_actions_alloc(int actions_len);
 void ovs_flow_deferred_free_acts(struct sw_flow_actions *);
 
-int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *,
-		     int *key_lenp);
+int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *);
 void ovs_flow_used(struct sw_flow *, struct sk_buff *);
 u64 ovs_flow_used_time(unsigned long flow_jiffies);
-
-int ovs_flow_to_nlattrs(const struct sw_flow_key *, struct sk_buff *);
-int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
+int ovs_flow_to_nlattrs(const struct sw_flow_key *,
+		const struct sw_flow_key *, struct sk_buff *);
+int ovs_match_from_nlattrs(struct sw_flow_match *match,
+		      const struct nlattr *,
 		      const struct nlattr *);
-int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, int key_len,
-				  const struct nlattr *attr);
+int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow,
+		const struct nlattr *attr);
 
 #define MAX_ACTIONS_BUFSIZE    (32 * 1024)
 #define TBL_MIN_BUCKETS		1024
@@ -182,6 +205,7 @@ struct flow_table {
 	struct flex_array *buckets;
 	unsigned int count, n_buckets;
 	struct rcu_head rcu;
+	struct list_head *mask_list;
 	int node_ver;
 	u32 hash_seed;
 	bool keep_flows;
@@ -197,22 +221,56 @@ static inline int ovs_flow_tbl_need_to_expand(struct flow_table *table)
 	return (table->count > table->n_buckets);
 }
 
-struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table,
-				    struct sw_flow_key *key, int len);
-void ovs_flow_tbl_destroy(struct flow_table *table);
-void ovs_flow_tbl_deferred_destroy(struct flow_table *table);
+struct sw_flow *ovs_flow_lookup(struct flow_table *,
+				const struct sw_flow_key *);
+struct sw_flow *ovs_flow_lookup_unmasked_key(struct flow_table *table,
+				    struct sw_flow_match *match);
+
+void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred);
 struct flow_table *ovs_flow_tbl_alloc(int new_size);
 struct flow_table *ovs_flow_tbl_expand(struct flow_table *table);
 struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table);
-void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
-			 struct sw_flow_key *key, int key_len);
-void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow);
 
-struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *idx);
+void ovs_flow_insert(struct flow_table *table, struct sw_flow *flow);
+void ovs_flow_remove(struct flow_table *table, struct sw_flow *flow);
+
+struct sw_flow *ovs_flow_dump_next(struct flow_table *table, u32 *bucket, u32 *idx);
 extern const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1];
 int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr,
-			 struct ovs_key_ipv4_tunnel *tun_key);
+			     struct sw_flow_match *match, bool is_mask);
 int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb,
-			const struct ovs_key_ipv4_tunnel *tun_key);
+			   const struct ovs_key_ipv4_tunnel *tun_key,
+			   const struct ovs_key_ipv4_tunnel *output);
+
+bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
+		const struct sw_flow_key *key, int key_len);
+
+struct sw_flow_mask {
+	int ref_count;
+	struct rcu_head rcu;
+	struct list_head list;
+	struct sw_flow_key_range range;
+	struct sw_flow_key key;
+};
+
+static inline u16
+ovs_sw_flow_mask_actual_size(const struct sw_flow_mask *mask)
+{
+	return ovs_sw_flow_key_range_actual_size(&mask->range);
+}
+
+static inline u16
+ovs_sw_flow_mask_size_roundup(const struct sw_flow_mask *mask)
+{
+	return roundup(ovs_sw_flow_mask_actual_size(mask), sizeof(u32));
+}
 
+struct sw_flow_mask *ovs_sw_flow_mask_alloc(void);
+void ovs_sw_flow_mask_add_ref(struct sw_flow_mask *);
+void ovs_sw_flow_mask_del_ref(struct sw_flow_mask *, bool deferred);
+void ovs_sw_flow_mask_insert(struct flow_table *, struct sw_flow_mask *);
+struct sw_flow_mask *ovs_sw_flow_mask_find(const struct flow_table *,
+		const struct sw_flow_mask *);
+void ovs_flow_key_mask(struct sw_flow_key *dst, const struct sw_flow_key *src,
+		       const struct sw_flow_mask *mask);
 #endif /* flow.h */
-- 
cgit v1.2.3


From b4de77ade3fc56e41b978b68d78a351dab28b74e Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Fri, 23 Aug 2013 11:15:37 +0300
Subject: ipip: potential race in ip_tunnel_init_net()

Eric Dumazet says that my previous fix for an ERR_PTR dereference
(ea857f28ab 'ipip: dereferencing an ERR_PTR in ip_tunnel_init_net()')
could be racy and suggests the following fix instead.

Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_tunnel.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 24549b4aeae2..830de3f4e293 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -854,16 +854,14 @@ int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
 
 	rtnl_lock();
 	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
-	rtnl_unlock();
-
-	if (IS_ERR(itn->fb_tunnel_dev))
-		return PTR_ERR(itn->fb_tunnel_dev);
 	/* FB netdevice is special: we have one, and only one per netns.
 	 * Allowing to move it to another netns is clearly unsafe.
 	 */
-	itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
+	if (!IS_ERR(itn->fb_tunnel_dev))
+		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
+	rtnl_unlock();
 
-	return 0;
+	return PTR_RET(itn->fb_tunnel_dev);
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
 
-- 
cgit v1.2.3


From a98655387762394371b88cdfb8215884757978ab Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 26 Aug 2013 09:30:32 +0200
Subject: mac80211: fix change_interface queue assignments

Jouni reported that with mac80211_hwsim, multicast TX was causing
crashes due to invalid vif->cab_queue assignment. It turns out that
this is caused by change_interface() getting invoked and not having
the vif->type/vif->p2p assigned correctly before calling the queue
check (ieee80211_check_queues). Fix this by passing the 'external'
interface type to the function and adjusting it accordingly.

While at it, also fix the error path in change_interface, it wasn't
correctly resetting to the external type but using the internal one
instead.

Fortunately this affects on hwsim because all other drivers set the
vif->type/vif->p2p variables when changing iftype. This shouldn't
be needed, but almost all implementations actually do it for their
own internal handling.

Reported-by: Jouni Malinen <j@w1.fi>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/iface.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 7ca534bf4cea..fcecd633514e 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -308,12 +308,13 @@ static int ieee80211_check_concurrent_iface(struct ieee80211_sub_if_data *sdata,
 	return 0;
 }
 
-static int ieee80211_check_queues(struct ieee80211_sub_if_data *sdata)
+static int ieee80211_check_queues(struct ieee80211_sub_if_data *sdata,
+				  enum nl80211_iftype iftype)
 {
 	int n_queues = sdata->local->hw.queues;
 	int i;
 
-	if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE) {
+	if (iftype != NL80211_IFTYPE_P2P_DEVICE) {
 		for (i = 0; i < IEEE80211_NUM_ACS; i++) {
 			if (WARN_ON_ONCE(sdata->vif.hw_queue[i] ==
 					 IEEE80211_INVAL_HW_QUEUE))
@@ -324,8 +325,9 @@ static int ieee80211_check_queues(struct ieee80211_sub_if_data *sdata)
 		}
 	}
 
-	if ((sdata->vif.type != NL80211_IFTYPE_AP &&
-	     sdata->vif.type != NL80211_IFTYPE_MESH_POINT) ||
+	if ((iftype != NL80211_IFTYPE_AP &&
+	     iftype != NL80211_IFTYPE_P2P_GO &&
+	     iftype != NL80211_IFTYPE_MESH_POINT) ||
 	    !(sdata->local->hw.flags & IEEE80211_HW_QUEUE_CONTROL)) {
 		sdata->vif.cab_queue = IEEE80211_INVAL_HW_QUEUE;
 		return 0;
@@ -408,7 +410,7 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local)
 		return ret;
 	}
 
-	ret = ieee80211_check_queues(sdata);
+	ret = ieee80211_check_queues(sdata, NL80211_IFTYPE_MONITOR);
 	if (ret) {
 		kfree(sdata);
 		return ret;
@@ -592,7 +594,8 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
 			res = drv_add_interface(local, sdata);
 			if (res)
 				goto err_stop;
-			res = ieee80211_check_queues(sdata);
+			res = ieee80211_check_queues(sdata,
+				ieee80211_vif_type_p2p(&sdata->vif));
 			if (res)
 				goto err_del_interface;
 		}
@@ -1389,14 +1392,14 @@ static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata,
 
 	ret = drv_change_interface(local, sdata, internal_type, p2p);
 	if (ret)
-		type = sdata->vif.type;
+		type = ieee80211_vif_type_p2p(&sdata->vif);
 
 	/*
 	 * Ignore return value here, there's not much we can do since
 	 * the driver changed the interface type internally already.
 	 * The warnings will hopefully make driver authors fix it :-)
 	 */
-	ieee80211_check_queues(sdata);
+	ieee80211_check_queues(sdata, type);
 
 	ieee80211_setup_sdata(sdata, type);
 
-- 
cgit v1.2.3


From a175a723301a8a4a9fedf9ce5b8ca586e7a97b40 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@wand.net.nz>
Date: Thu, 22 Aug 2013 12:30:48 -0700
Subject: openvswitch: Add SCTP support

This patch adds support for rewriting SCTP src,dst ports similar to the
functionality already available for TCP/UDP.

Rewriting SCTP ports is expensive due to double-recalculation of the
SCTP checksums; this is performed to ensure that packets traversing OVS
with invalid checksums will continue to the destination with any
checksum corruption intact.

Reviewed-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Joe Stringer <joe@wand.net.nz>
Signed-off-by: Ben Pfaff <blp@nicira.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
---
 include/uapi/linux/openvswitch.h |  6 ++++
 net/openvswitch/Kconfig          |  1 +
 net/openvswitch/actions.c        | 39 ++++++++++++++++++++++++
 net/openvswitch/datapath.c       |  6 ++++
 net/openvswitch/flow.c           | 65 ++++++++++++++++++++++++++++++++++++++++
 net/openvswitch/flow.h           |  8 ++---
 6 files changed, 121 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index de1fa5d3780f..a74d375b439b 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -259,6 +259,7 @@ enum ovs_key_attr {
 	OVS_KEY_ATTR_ND,        /* struct ovs_key_nd */
 	OVS_KEY_ATTR_SKB_MARK,  /* u32 skb mark */
 	OVS_KEY_ATTR_TUNNEL,    /* Nested set of ovs_tunnel attributes */
+	OVS_KEY_ATTR_SCTP,      /* struct ovs_key_sctp */
 
 #ifdef __KERNEL__
 	OVS_KEY_ATTR_IPV4_TUNNEL,  /* struct ovs_key_ipv4_tunnel */
@@ -333,6 +334,11 @@ struct ovs_key_udp {
 	__be16 udp_dst;
 };
 
+struct ovs_key_sctp {
+	__be16 sctp_src;
+	__be16 sctp_dst;
+};
+
 struct ovs_key_icmp {
 	__u8 icmp_type;
 	__u8 icmp_code;
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index bed30e69baa7..6ecf491ad509 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -4,6 +4,7 @@
 
 config OPENVSWITCH
 	tristate "Open vSwitch"
+	select LIBCRC32C
 	---help---
 	  Open vSwitch is a multilayer Ethernet switch targeted at virtualized
 	  environments.  In addition to supporting a variety of features
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 1f680222f4f7..65cfaa816075 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -22,6 +22,7 @@
 #include <linux/in.h>
 #include <linux/ip.h>
 #include <linux/openvswitch.h>
+#include <linux/sctp.h>
 #include <linux/tcp.h>
 #include <linux/udp.h>
 #include <linux/in6.h>
@@ -31,6 +32,7 @@
 #include <net/ipv6.h>
 #include <net/checksum.h>
 #include <net/dsfield.h>
+#include <net/sctp/checksum.h>
 
 #include "datapath.h"
 #include "vport.h"
@@ -352,6 +354,39 @@ static int set_tcp(struct sk_buff *skb, const struct ovs_key_tcp *tcp_port_key)
 	return 0;
 }
 
+static int set_sctp(struct sk_buff *skb,
+		     const struct ovs_key_sctp *sctp_port_key)
+{
+	struct sctphdr *sh;
+	int err;
+	unsigned int sctphoff = skb_transport_offset(skb);
+
+	err = make_writable(skb, sctphoff + sizeof(struct sctphdr));
+	if (unlikely(err))
+		return err;
+
+	sh = sctp_hdr(skb);
+	if (sctp_port_key->sctp_src != sh->source ||
+	    sctp_port_key->sctp_dst != sh->dest) {
+		__le32 old_correct_csum, new_csum, old_csum;
+
+		old_csum = sh->checksum;
+		old_correct_csum = sctp_compute_cksum(skb, sctphoff);
+
+		sh->source = sctp_port_key->sctp_src;
+		sh->dest = sctp_port_key->sctp_dst;
+
+		new_csum = sctp_compute_cksum(skb, sctphoff);
+
+		/* Carry any checksum errors through. */
+		sh->checksum = old_csum ^ old_correct_csum ^ new_csum;
+
+		skb->rxhash = 0;
+	}
+
+	return 0;
+}
+
 static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port)
 {
 	struct vport *vport;
@@ -461,6 +496,10 @@ static int execute_set_action(struct sk_buff *skb,
 	case OVS_KEY_ATTR_UDP:
 		err = set_udp(skb, nla_data(nested_attr));
 		break;
+
+	case OVS_KEY_ATTR_SCTP:
+		err = set_sctp(skb, nla_data(nested_attr));
+		break;
 	}
 
 	return err;
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index d29cd9aa4a67..2aa13bd7f2b2 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -712,6 +712,12 @@ static int validate_set(const struct nlattr *a,
 
 		return validate_tp_port(flow_key);
 
+	case OVS_KEY_ATTR_SCTP:
+		if (flow_key->ip.proto != IPPROTO_SCTP)
+			return -EINVAL;
+
+		return validate_tp_port(flow_key);
+
 	default:
 		return -EINVAL;
 	}
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 1fceb9653598..2b4785590b56 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -34,6 +34,7 @@
 #include <linux/if_arp.h>
 #include <linux/ip.h>
 #include <linux/ipv6.h>
+#include <linux/sctp.h>
 #include <linux/tcp.h>
 #include <linux/udp.h>
 #include <linux/icmp.h>
@@ -129,6 +130,7 @@ static bool ovs_match_validate(const struct sw_flow_match *match,
 			| (1 << OVS_KEY_ATTR_IPV6)
 			| (1 << OVS_KEY_ATTR_TCP)
 			| (1 << OVS_KEY_ATTR_UDP)
+			| (1 << OVS_KEY_ATTR_SCTP)
 			| (1 << OVS_KEY_ATTR_ICMP)
 			| (1 << OVS_KEY_ATTR_ICMPV6)
 			| (1 << OVS_KEY_ATTR_ARP)
@@ -159,6 +161,12 @@ static bool ovs_match_validate(const struct sw_flow_match *match,
 					mask_allowed |= 1 << OVS_KEY_ATTR_UDP;
 			}
 
+			if (match->key->ip.proto == IPPROTO_SCTP) {
+				key_expected |= 1 << OVS_KEY_ATTR_SCTP;
+				if (match->mask && (match->mask->key.ip.proto == 0xff))
+					mask_allowed |= 1 << OVS_KEY_ATTR_SCTP;
+			}
+
 			if (match->key->ip.proto == IPPROTO_TCP) {
 				key_expected |= 1 << OVS_KEY_ATTR_TCP;
 				if (match->mask && (match->mask->key.ip.proto == 0xff))
@@ -185,6 +193,12 @@ static bool ovs_match_validate(const struct sw_flow_match *match,
 					mask_allowed |= 1 << OVS_KEY_ATTR_UDP;
 			}
 
+			if (match->key->ip.proto == IPPROTO_SCTP) {
+				key_expected |= 1 << OVS_KEY_ATTR_SCTP;
+				if (match->mask && (match->mask->key.ip.proto == 0xff))
+					mask_allowed |= 1 << OVS_KEY_ATTR_SCTP;
+			}
+
 			if (match->key->ip.proto == IPPROTO_TCP) {
 				key_expected |= 1 << OVS_KEY_ATTR_TCP;
 				if (match->mask && (match->mask->key.ip.proto == 0xff))
@@ -280,6 +294,12 @@ static bool udphdr_ok(struct sk_buff *skb)
 				  sizeof(struct udphdr));
 }
 
+static bool sctphdr_ok(struct sk_buff *skb)
+{
+	return pskb_may_pull(skb, skb_transport_offset(skb) +
+				  sizeof(struct sctphdr));
+}
+
 static bool icmphdr_ok(struct sk_buff *skb)
 {
 	return pskb_may_pull(skb, skb_transport_offset(skb) +
@@ -891,6 +911,12 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key)
 				key->ipv4.tp.src = udp->source;
 				key->ipv4.tp.dst = udp->dest;
 			}
+		} else if (key->ip.proto == IPPROTO_SCTP) {
+			if (sctphdr_ok(skb)) {
+				struct sctphdr *sctp = sctp_hdr(skb);
+				key->ipv4.tp.src = sctp->source;
+				key->ipv4.tp.dst = sctp->dest;
+			}
 		} else if (key->ip.proto == IPPROTO_ICMP) {
 			if (icmphdr_ok(skb)) {
 				struct icmphdr *icmp = icmp_hdr(skb);
@@ -953,6 +979,12 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key)
 				key->ipv6.tp.src = udp->source;
 				key->ipv6.tp.dst = udp->dest;
 			}
+		} else if (key->ip.proto == NEXTHDR_SCTP) {
+			if (sctphdr_ok(skb)) {
+				struct sctphdr *sctp = sctp_hdr(skb);
+				key->ipv6.tp.src = sctp->source;
+				key->ipv6.tp.dst = sctp->dest;
+			}
 		} else if (key->ip.proto == NEXTHDR_ICMP) {
 			if (icmp6hdr_ok(skb)) {
 				error = parse_icmpv6(skb, key, nh_len);
@@ -1087,6 +1119,7 @@ const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
 	[OVS_KEY_ATTR_IPV6] = sizeof(struct ovs_key_ipv6),
 	[OVS_KEY_ATTR_TCP] = sizeof(struct ovs_key_tcp),
 	[OVS_KEY_ATTR_UDP] = sizeof(struct ovs_key_udp),
+	[OVS_KEY_ATTR_SCTP] = sizeof(struct ovs_key_sctp),
 	[OVS_KEY_ATTR_ICMP] = sizeof(struct ovs_key_icmp),
 	[OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6),
 	[OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp),
@@ -1500,6 +1533,24 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match,  u64 attrs,
 		attrs &= ~(1 << OVS_KEY_ATTR_UDP);
 	}
 
+	if (attrs & (1 << OVS_KEY_ATTR_SCTP)) {
+		const struct ovs_key_sctp *sctp_key;
+
+		sctp_key = nla_data(a[OVS_KEY_ATTR_SCTP]);
+		if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) {
+			SW_FLOW_KEY_PUT(match, ipv4.tp.src,
+					sctp_key->sctp_src, is_mask);
+			SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
+					sctp_key->sctp_dst, is_mask);
+		} else {
+			SW_FLOW_KEY_PUT(match, ipv6.tp.src,
+					sctp_key->sctp_src, is_mask);
+			SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
+					sctp_key->sctp_dst, is_mask);
+		}
+		attrs &= ~(1 << OVS_KEY_ATTR_SCTP);
+	}
+
 	if (attrs & (1 << OVS_KEY_ATTR_ICMP)) {
 		const struct ovs_key_icmp *icmp_key;
 
@@ -1843,6 +1894,20 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey,
 				udp_key->udp_src = output->ipv6.tp.src;
 				udp_key->udp_dst = output->ipv6.tp.dst;
 			}
+		} else if (swkey->ip.proto == IPPROTO_SCTP) {
+			struct ovs_key_sctp *sctp_key;
+
+			nla = nla_reserve(skb, OVS_KEY_ATTR_SCTP, sizeof(*sctp_key));
+			if (!nla)
+				goto nla_put_failure;
+			sctp_key = nla_data(nla);
+			if (swkey->eth.type == htons(ETH_P_IP)) {
+				sctp_key->sctp_src = swkey->ipv4.tp.src;
+				sctp_key->sctp_dst = swkey->ipv4.tp.dst;
+			} else if (swkey->eth.type == htons(ETH_P_IPV6)) {
+				sctp_key->sctp_src = swkey->ipv6.tp.src;
+				sctp_key->sctp_dst = swkey->ipv6.tp.dst;
+			}
 		} else if (swkey->eth.type == htons(ETH_P_IP) &&
 			   swkey->ip.proto == IPPROTO_ICMP) {
 			struct ovs_key_icmp *icmp_key;
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 9674e45f6969..d08dcf78dbf3 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -99,8 +99,8 @@ struct sw_flow_key {
 			} addr;
 			union {
 				struct {
-					__be16 src;		/* TCP/UDP source port. */
-					__be16 dst;		/* TCP/UDP destination port. */
+					__be16 src;		/* TCP/UDP/SCTP source port. */
+					__be16 dst;		/* TCP/UDP/SCTP destination port. */
 				} tp;
 				struct {
 					u8 sha[ETH_ALEN];	/* ARP source hardware address. */
@@ -115,8 +115,8 @@ struct sw_flow_key {
 			} addr;
 			__be32 label;			/* IPv6 flow label. */
 			struct {
-				__be16 src;		/* TCP/UDP source port. */
-				__be16 dst;		/* TCP/UDP destination port. */
+				__be16 src;		/* TCP/UDP/SCTP source port. */
+				__be16 dst;		/* TCP/UDP/SCTP destination port. */
 			} tp;
 			struct {
 				struct in6_addr target;	/* ND target address. */
-- 
cgit v1.2.3


From 02237373b1c61a09a4db329545e39cffc48910d5 Mon Sep 17 00:00:00 2001
From: Andy Zhou <azhou@nicira.com>
Date: Thu, 22 Aug 2013 12:12:57 -0700
Subject: openvswitch: Rename key_len to key_end

Key_end is a better name describing the ending boundary than key_len.
Rename those variables to make it less confusing.

Signed-off-by: Andy Zhou <azhou@nicira.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
---
 net/openvswitch/flow.c | 31 ++++++++++++++++---------------
 net/openvswitch/flow.h |  2 +-
 2 files changed, 17 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 2b4785590b56..80bcb96467a5 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -997,10 +997,11 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key)
 	return 0;
 }
 
-static u32 ovs_flow_hash(const struct sw_flow_key *key, int key_start, int key_len)
+static u32 ovs_flow_hash(const struct sw_flow_key *key, int key_start,
+			 int key_end)
 {
 	return jhash2((u32 *)((u8 *)key + key_start),
-		      DIV_ROUND_UP(key_len - key_start, sizeof(u32)), 0);
+		      DIV_ROUND_UP(key_end - key_start, sizeof(u32)), 0);
 }
 
 static int flow_key_start(const struct sw_flow_key *key)
@@ -1012,31 +1013,31 @@ static int flow_key_start(const struct sw_flow_key *key)
 }
 
 static bool __cmp_key(const struct sw_flow_key *key1,
-		const struct sw_flow_key *key2,  int key_start, int key_len)
+		const struct sw_flow_key *key2,  int key_start, int key_end)
 {
 	return !memcmp((u8 *)key1 + key_start,
-			(u8 *)key2 + key_start, (key_len - key_start));
+			(u8 *)key2 + key_start, (key_end - key_start));
 }
 
 static bool __flow_cmp_key(const struct sw_flow *flow,
-		const struct sw_flow_key *key, int key_start, int key_len)
+		const struct sw_flow_key *key, int key_start, int key_end)
 {
-	return __cmp_key(&flow->key, key, key_start, key_len);
+	return __cmp_key(&flow->key, key, key_start, key_end);
 }
 
 static bool __flow_cmp_unmasked_key(const struct sw_flow *flow,
-		  const struct sw_flow_key *key, int key_start, int key_len)
+		  const struct sw_flow_key *key, int key_start, int key_end)
 {
-	return __cmp_key(&flow->unmasked_key, key, key_start, key_len);
+	return __cmp_key(&flow->unmasked_key, key, key_start, key_end);
 }
 
 bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
-		const struct sw_flow_key *key, int key_len)
+		const struct sw_flow_key *key, int key_end)
 {
 	int key_start;
 	key_start = flow_key_start(key);
 
-	return __flow_cmp_unmasked_key(flow, key, key_start, key_len);
+	return __flow_cmp_unmasked_key(flow, key, key_start, key_end);
 
 }
 
@@ -1044,11 +1045,11 @@ struct sw_flow *ovs_flow_lookup_unmasked_key(struct flow_table *table,
 				       struct sw_flow_match *match)
 {
 	struct sw_flow_key *unmasked = match->key;
-	int key_len = match->range.end;
+	int key_end = match->range.end;
 	struct sw_flow *flow;
 
 	flow = ovs_flow_lookup(table, unmasked);
-	if (flow && (!ovs_flow_cmp_unmasked_key(flow, unmasked, key_len)))
+	if (flow && (!ovs_flow_cmp_unmasked_key(flow, unmasked, key_end)))
 		flow = NULL;
 
 	return flow;
@@ -1061,16 +1062,16 @@ static struct sw_flow *ovs_masked_flow_lookup(struct flow_table *table,
 	struct sw_flow *flow;
 	struct hlist_head *head;
 	int key_start = mask->range.start;
-	int key_len = mask->range.end;
+	int key_end = mask->range.end;
 	u32 hash;
 	struct sw_flow_key masked_key;
 
 	ovs_flow_key_mask(&masked_key, flow_key, mask);
-	hash = ovs_flow_hash(&masked_key, key_start, key_len);
+	hash = ovs_flow_hash(&masked_key, key_start, key_end);
 	head = find_bucket(table, hash);
 	hlist_for_each_entry_rcu(flow, head, hash_node[table->node_ver]) {
 		if (flow->mask == mask &&
-		    __flow_cmp_key(flow, &masked_key, key_start, key_len))
+		    __flow_cmp_key(flow, &masked_key, key_start, key_end))
 			return flow;
 	}
 	return NULL;
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index d08dcf78dbf3..e79305184b79 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -243,7 +243,7 @@ int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb,
 			   const struct ovs_key_ipv4_tunnel *output);
 
 bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
-		const struct sw_flow_key *key, int key_len);
+		const struct sw_flow_key *key, int key_end);
 
 struct sw_flow_mask {
 	int ref_count;
-- 
cgit v1.2.3


From b1dcdc68b1f4cc77f603d7507f7a14f1f4864d41 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Fri, 23 Aug 2013 16:16:33 +0200
Subject: net: tcp_probe: allow more advanced ingress filtering by mark

Currently, the tcp_probe snooper can either filter packets by a given
port (handed to the module via module parameter e.g. port=80) or lets
all TCP traffic pass (port=0, default). When a port is specified, the
port number is tested against the sk's source/destination port. Thus,
if one of them matches, the information will be further processed for
the log.

As this is quite limited, allow for more advanced filtering possibilities
which can facilitate debugging/analysis with the help of the tcp_probe
snooper. Therefore, similarly as added to BPF machine in commit 7e75f93e
("pkt_sched: ingress socket filter by mark"), add the possibility to
use skb->mark as a filter.

If the mark is not being used otherwise, this allows ingress filtering
by flow (e.g. in order to track updates from only a single flow, or a
subset of all flows for a given port) and other things such as dynamic
logging and reconfiguration without removing/re-inserting the tcp_probe
module, etc. Simple example:

  insmod net/ipv4/tcp_probe.ko fwmark=8888 full=1
  ...
  iptables -A INPUT -i eth4 -t mangle -p tcp --dport 22 \
           --sport 60952 -j MARK --set-mark 8888
  [... sampling interval ...]
  iptables -D INPUT -i eth4 -t mangle -p tcp --dport 22 \
           --sport 60952 -j MARK --set-mark 8888

The current option to filter by a given port is still being preserved. A
similar approach could be done for the sctp_probe module as a follow-up.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_probe.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 301a3effe579..622a4377b397 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -46,6 +46,10 @@ static unsigned int bufsize __read_mostly = 4096;
 MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)");
 module_param(bufsize, uint, 0);
 
+static unsigned int fwmark __read_mostly = 0;
+MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)");
+module_param(fwmark, uint, 0);
+
 static int full __read_mostly;
 MODULE_PARM_DESC(full, "Full log (1=every ack packet received,  0=only cwnd changes)");
 module_param(full, int, 0);
@@ -124,9 +128,11 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 	const struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_sock *inet = inet_sk(sk);
 
-	/* Only update if port matches */
-	if ((port == 0 || ntohs(inet->inet_dport) == port ||
-	     ntohs(inet->inet_sport) == port) &&
+	/* Only update if port or skb mark matches */
+	if (((port == 0 && fwmark == 0) ||
+	     ntohs(inet->inet_dport) == port ||
+	     ntohs(inet->inet_sport) == port ||
+	     (fwmark > 0 && skb->mark == fwmark)) &&
 	    (full || tp->snd_cwnd != tcp_probe.lastcwnd)) {
 
 		spin_lock(&tcp_probe.lock);
@@ -284,7 +290,8 @@ static __init int tcpprobe_init(void)
 	if (ret)
 		goto err1;
 
-	pr_info("probe registered (port=%d) bufsize=%u\n", port, bufsize);
+	pr_info("probe registered (port=%d/fwmark=%u) bufsize=%u\n",
+		port, fwmark, bufsize);
 	return 0;
  err1:
 	remove_proc_entry(procname, init_net.proc_net);
-- 
cgit v1.2.3


From 5828cd9a68873df1340b420371c02c47647878fb Mon Sep 17 00:00:00 2001
From: Andy Zhou <azhou@nicira.com>
Date: Tue, 27 Aug 2013 13:02:21 -0700
Subject: openvswitch: optimize flow compare and mask functions

Make sure the sw_flow_key structure and valid mask boundaries are always
machine word aligned. Optimize the flow compare and mask operations
using machine word size operations. This patch improves throughput on
average by 15% when CPU is the bottleneck of forwarding packets.

This patch is inspired by ideas and code from a patch submitted by Peter
Klausler titled "replace memcmp() with specialized comparator".
However, The original patch only optimizes for architectures
support unaligned machine word access. This patch optimizes for all
architectures.

Signed-off-by: Andy Zhou <azhou@nicira.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
---
 net/openvswitch/flow.c | 64 +++++++++++++++++++++++++++++++++-----------------
 net/openvswitch/flow.h | 19 +--------------
 2 files changed, 44 insertions(+), 39 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 80bcb96467a5..ad1aeeb4f373 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -54,8 +54,8 @@ static void update_range__(struct sw_flow_match *match,
 			  size_t offset, size_t size, bool is_mask)
 {
 	struct sw_flow_key_range *range = NULL;
-	size_t start = offset;
-	size_t end = offset + size;
+	size_t start = rounddown(offset, sizeof(long));
+	size_t end = roundup(offset + size, sizeof(long));
 
 	if (!is_mask)
 		range = &match->range;
@@ -102,6 +102,11 @@ static void update_range__(struct sw_flow_match *match,
 		}                                                           \
 	} while (0)
 
+static u16 range_n_bytes(const struct sw_flow_key_range *range)
+{
+	return range->end - range->start;
+}
+
 void ovs_match_init(struct sw_flow_match *match,
 		    struct sw_flow_key *key,
 		    struct sw_flow_mask *mask)
@@ -370,16 +375,17 @@ static bool icmp6hdr_ok(struct sk_buff *skb)
 void ovs_flow_key_mask(struct sw_flow_key *dst, const struct sw_flow_key *src,
 		       const struct sw_flow_mask *mask)
 {
-	u8 *m = (u8 *)&mask->key + mask->range.start;
-	u8 *s = (u8 *)src + mask->range.start;
-	u8 *d = (u8 *)dst + mask->range.start;
+	const long *m = (long *)((u8 *)&mask->key + mask->range.start);
+	const long *s = (long *)((u8 *)src + mask->range.start);
+	long *d = (long *)((u8 *)dst + mask->range.start);
 	int i;
 
-	memset(dst, 0, sizeof(*dst));
-	for (i = 0; i < ovs_sw_flow_mask_size_roundup(mask); i++) {
-		*d = *s & *m;
-		d++, s++, m++;
-	}
+	/* The memory outside of the 'mask->range' are not set since
+	 * further operations on 'dst' only uses contents within
+	 * 'mask->range'.
+	 */
+	for (i = 0; i < range_n_bytes(&mask->range); i += sizeof(long))
+		*d++ = *s++ & *m++;
 }
 
 #define TCP_FLAGS_OFFSET 13
@@ -1000,8 +1006,13 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key)
 static u32 ovs_flow_hash(const struct sw_flow_key *key, int key_start,
 			 int key_end)
 {
-	return jhash2((u32 *)((u8 *)key + key_start),
-		      DIV_ROUND_UP(key_end - key_start, sizeof(u32)), 0);
+	u32 *hash_key = (u32 *)((u8 *)key + key_start);
+	int hash_u32s = (key_end - key_start) >> 2;
+
+	/* Make sure number of hash bytes are multiple of u32. */
+	BUILD_BUG_ON(sizeof(long) % sizeof(u32));
+
+	return jhash2(hash_key, hash_u32s, 0);
 }
 
 static int flow_key_start(const struct sw_flow_key *key)
@@ -1009,17 +1020,25 @@ static int flow_key_start(const struct sw_flow_key *key)
 	if (key->tun_key.ipv4_dst)
 		return 0;
 	else
-		return offsetof(struct sw_flow_key, phy);
+		return rounddown(offsetof(struct sw_flow_key, phy),
+					  sizeof(long));
 }
 
 static bool __cmp_key(const struct sw_flow_key *key1,
 		const struct sw_flow_key *key2,  int key_start, int key_end)
 {
-	return !memcmp((u8 *)key1 + key_start,
-			(u8 *)key2 + key_start, (key_end - key_start));
+	const long *cp1 = (long *)((u8 *)key1 + key_start);
+	const long *cp2 = (long *)((u8 *)key2 + key_start);
+	long diffs = 0;
+	int i;
+
+	for (i = key_start; i < key_end;  i += sizeof(long))
+		diffs |= *cp1++ ^ *cp2++;
+
+	return diffs == 0;
 }
 
-static bool __flow_cmp_key(const struct sw_flow *flow,
+static bool __flow_cmp_masked_key(const struct sw_flow *flow,
 		const struct sw_flow_key *key, int key_start, int key_end)
 {
 	return __cmp_key(&flow->key, key, key_start, key_end);
@@ -1056,7 +1075,7 @@ struct sw_flow *ovs_flow_lookup_unmasked_key(struct flow_table *table,
 }
 
 static struct sw_flow *ovs_masked_flow_lookup(struct flow_table *table,
-				    const struct sw_flow_key *flow_key,
+				    const struct sw_flow_key *unmasked,
 				    struct sw_flow_mask *mask)
 {
 	struct sw_flow *flow;
@@ -1066,12 +1085,13 @@ static struct sw_flow *ovs_masked_flow_lookup(struct flow_table *table,
 	u32 hash;
 	struct sw_flow_key masked_key;
 
-	ovs_flow_key_mask(&masked_key, flow_key, mask);
+	ovs_flow_key_mask(&masked_key, unmasked, mask);
 	hash = ovs_flow_hash(&masked_key, key_start, key_end);
 	head = find_bucket(table, hash);
 	hlist_for_each_entry_rcu(flow, head, hash_node[table->node_ver]) {
 		if (flow->mask == mask &&
-		    __flow_cmp_key(flow, &masked_key, key_start, key_end))
+		    __flow_cmp_masked_key(flow, &masked_key,
+					  key_start, key_end))
 			return flow;
 	}
 	return NULL;
@@ -1961,6 +1981,8 @@ nla_put_failure:
  * Returns zero if successful or a negative error code. */
 int ovs_flow_init(void)
 {
+	BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
+
 	flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0,
 					0, NULL);
 	if (flow_cache == NULL)
@@ -2016,7 +2038,7 @@ static bool ovs_sw_flow_mask_equal(const struct sw_flow_mask *a,
 
 	return  (a->range.end == b->range.end)
 		&& (a->range.start == b->range.start)
-		&& (memcmp(a_, b_, ovs_sw_flow_mask_actual_size(a)) == 0);
+		&& (memcmp(a_, b_, range_n_bytes(&a->range)) == 0);
 }
 
 struct sw_flow_mask *ovs_sw_flow_mask_find(const struct flow_table *tbl,
@@ -2053,5 +2075,5 @@ static void ovs_sw_flow_mask_set(struct sw_flow_mask *mask,
 	u8 *m = (u8 *)&mask->key + range->start;
 
 	mask->range = *range;
-	memset(m, val, ovs_sw_flow_mask_size_roundup(mask));
+	memset(m, val, range_n_bytes(range));
 }
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index e79305184b79..b65f885ac3dc 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -125,7 +125,7 @@ struct sw_flow_key {
 			} nd;
 		} ipv6;
 	};
-};
+} __aligned(__alignof__(long));
 
 struct sw_flow {
 	struct rcu_head rcu;
@@ -149,11 +149,6 @@ struct sw_flow_key_range {
 	size_t end;
 };
 
-static inline u16 ovs_sw_flow_key_range_actual_size(const struct sw_flow_key_range *range)
-{
-	return range->end - range->start;
-}
-
 struct sw_flow_match {
 	struct sw_flow_key *key;
 	struct sw_flow_key_range range;
@@ -253,18 +248,6 @@ struct sw_flow_mask {
 	struct sw_flow_key key;
 };
 
-static inline u16
-ovs_sw_flow_mask_actual_size(const struct sw_flow_mask *mask)
-{
-	return ovs_sw_flow_key_range_actual_size(&mask->range);
-}
-
-static inline u16
-ovs_sw_flow_mask_size_roundup(const struct sw_flow_mask *mask)
-{
-	return roundup(ovs_sw_flow_mask_actual_size(mask), sizeof(u32));
-}
-
 struct sw_flow_mask *ovs_sw_flow_mask_alloc(void);
 void ovs_sw_flow_mask_add_ref(struct sw_flow_mask *);
 void ovs_sw_flow_mask_del_ref(struct sw_flow_mask *, bool deferred);
-- 
cgit v1.2.3


From affe759dbaa9e6c08b0da0a11d1933b61f199f51 Mon Sep 17 00:00:00 2001
From: Phil Oester <kernel@linuxace.com>
Date: Wed, 26 Jun 2013 17:16:28 -0400
Subject: netfilter: ip[6]t_REJECT: tcp-reset using wrong MAC source if bridged

As reported by Casper Gripenberg, in a bridged setup, using ip[6]t_REJECT
with the tcp-reset option sends out reset packets with the src MAC address
of the local bridge interface, instead of the MAC address of the intended
destination.  This causes some routers/firewalls to drop the reset packet
as it appears to be spoofed.  Fix this by bypassing ip[6]_local_out and
setting the MAC of the sender in the tcp reset packet.

This closes netfilter bugzilla #531.

Signed-off-by: Phil Oester <kernel@linuxace.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/ipt_REJECT.c  | 21 ++++++++++++++++++++-
 net/ipv6/netfilter/ip6t_REJECT.c | 20 +++++++++++++++++++-
 2 files changed, 39 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 04b18c1ac345..b969131ad1c1 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -119,7 +119,26 @@ static void send_reset(struct sk_buff *oldskb, int hook)
 
 	nf_ct_attach(nskb, oldskb);
 
-	ip_local_out(nskb);
+#ifdef CONFIG_BRIDGE_NETFILTER
+	/* If we use ip_local_out for bridged traffic, the MAC source on
+	 * the RST will be ours, instead of the destination's.  This confuses
+	 * some routers/firewalls, and they drop the packet.  So we need to
+	 * build the eth header using the original destination's MAC as the
+	 * source, and send the RST packet directly.
+	 */
+	if (oldskb->nf_bridge) {
+		struct ethhdr *oeth = eth_hdr(oldskb);
+		nskb->dev = oldskb->nf_bridge->physindev;
+		niph->tot_len = htons(nskb->len);
+		ip_send_check(niph);
+		if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol),
+				    oeth->h_source, oeth->h_dest, nskb->len) < 0)
+			goto free_nskb;
+		dev_queue_xmit(nskb);
+	} else
+#endif
+		ip_local_out(nskb);
+
 	return;
 
  free_nskb:
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
index 70f9abc0efe9..56eef30ee5f6 100644
--- a/net/ipv6/netfilter/ip6t_REJECT.c
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -169,7 +169,25 @@ static void send_reset(struct net *net, struct sk_buff *oldskb)
 
 	nf_ct_attach(nskb, oldskb);
 
-	ip6_local_out(nskb);
+#ifdef CONFIG_BRIDGE_NETFILTER
+	/* If we use ip6_local_out for bridged traffic, the MAC source on
+	 * the RST will be ours, instead of the destination's.  This confuses
+	 * some routers/firewalls, and they drop the packet.  So we need to
+	 * build the eth header using the original destination's MAC as the
+	 * source, and send the RST packet directly.
+	 */
+	if (oldskb->nf_bridge) {
+		struct ethhdr *oeth = eth_hdr(oldskb);
+		nskb->dev = oldskb->nf_bridge->physindev;
+		nskb->protocol = htons(ETH_P_IPV6);
+		ip6h->payload_len = htons(sizeof(struct tcphdr));
+		if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol),
+				    oeth->h_source, oeth->h_dest, nskb->len) < 0)
+			return;
+		dev_queue_xmit(nskb);
+	} else
+#endif
+		ip6_local_out(nskb);
 }
 
 static inline void
-- 
cgit v1.2.3


From 706f5151e349a3d8ab85237d0d6c553930376e9f Mon Sep 17 00:00:00 2001
From: Nathan Hintz <nlhintz@hotmail.com>
Date: Thu, 22 Aug 2013 22:09:12 -0700
Subject: netfilter: nf_defrag_ipv6.o included twice

'nf_defrag_ipv6' is built as a separate module; it shouldn't be
included in the 'nf_conntrack_ipv6' module as well.

Signed-off-by: Nathan Hintz <nlhintz@hotmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 2d11fcc2cf3c..ad13bf0d25f0 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -14,7 +14,7 @@ obj-$(CONFIG_NF_NAT_IPV6) += ip6table_nat.o
 nf_conntrack_ipv6-y  :=  nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o
 
 # l3 independent conntrack
-obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o nf_defrag_ipv6.o
+obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o
 
 nf_nat_ipv6-y		:= nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o
 obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o
-- 
cgit v1.2.3


From 41d73ec053d2424599c4ed8452b889374d523ade Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 27 Aug 2013 08:50:12 +0200
Subject: netfilter: nf_conntrack: make sequence number adjustments usuable
 without NAT

Split out sequence number adjustments from NAT and move them to the conntrack
core to make them usable for SYN proxying. The sequence number adjustment
information is moved to a seperate extend. The extend is added to new
conntracks when a NAT mapping is set up for a connection using a helper.

As a side effect, this saves 24 bytes per connection with NAT in the common
case that a connection does not have a helper assigned.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Tested-by: Martin Topholm <mph@one.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h                          |   9 +-
 include/net/netfilter/nf_conntrack_extend.h        |   2 +
 include/net/netfilter/nf_conntrack_seqadj.h        |  49 +++++
 include/net/netfilter/nf_nat.h                     |  10 -
 include/net/netfilter/nf_nat_helper.h              |  19 --
 include/uapi/linux/netfilter/nf_conntrack_common.h |   3 +-
 include/uapi/linux/netfilter/nfnetlink_conntrack.h |  15 +-
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c     |   7 +-
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c     |   7 +-
 net/netfilter/Makefile                             |   2 +-
 net/netfilter/nf_conntrack_core.c                  |  16 +-
 net/netfilter/nf_conntrack_netlink.c               | 115 +++++------
 net/netfilter/nf_conntrack_proto_tcp.c             |  18 +-
 net/netfilter/nf_conntrack_seqadj.c                | 218 ++++++++++++++++++++
 net/netfilter/nf_nat_core.c                        |  16 +-
 net/netfilter/nf_nat_helper.c                      | 228 +--------------------
 net/netfilter/nf_nat_sip.c                         |   3 +-
 net/netfilter/nfnetlink_queue_ct.c                 |   8 +-
 18 files changed, 369 insertions(+), 376 deletions(-)
 create mode 100644 include/net/netfilter/nf_conntrack_seqadj.h
 create mode 100644 net/netfilter/nf_conntrack_seqadj.c

(limited to 'net')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index e2cf786be22f..708fe72ab913 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -319,6 +319,7 @@ extern void nf_ct_attach(struct sk_buff *, const struct sk_buff *);
 extern void (*nf_ct_destroy)(struct nf_conntrack *) __rcu;
 
 struct nf_conn;
+enum ip_conntrack_info;
 struct nlattr;
 
 struct nfq_ct_hook {
@@ -327,14 +328,10 @@ struct nfq_ct_hook {
 	int (*parse)(const struct nlattr *attr, struct nf_conn *ct);
 	int (*attach_expect)(const struct nlattr *attr, struct nf_conn *ct,
 			     u32 portid, u32 report);
-};
-extern struct nfq_ct_hook __rcu *nfq_ct_hook;
-
-struct nfq_ct_nat_hook {
 	void (*seq_adjust)(struct sk_buff *skb, struct nf_conn *ct,
-			   u32 ctinfo, s32 off);
+			   enum ip_conntrack_info ctinfo, s32 off);
 };
-extern struct nfq_ct_nat_hook __rcu *nfq_ct_nat_hook;
+extern struct nfq_ct_hook __rcu *nfq_ct_hook;
 #else
 static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {}
 #endif
diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h
index 977bc8a46444..2a22bcbfe6e4 100644
--- a/include/net/netfilter/nf_conntrack_extend.h
+++ b/include/net/netfilter/nf_conntrack_extend.h
@@ -9,6 +9,7 @@ enum nf_ct_ext_id {
 	NF_CT_EXT_HELPER,
 #if defined(CONFIG_NF_NAT) || defined(CONFIG_NF_NAT_MODULE)
 	NF_CT_EXT_NAT,
+	NF_CT_EXT_SEQADJ,
 #endif
 	NF_CT_EXT_ACCT,
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
@@ -31,6 +32,7 @@ enum nf_ct_ext_id {
 
 #define NF_CT_EXT_HELPER_TYPE struct nf_conn_help
 #define NF_CT_EXT_NAT_TYPE struct nf_conn_nat
+#define NF_CT_EXT_SEQADJ_TYPE struct nf_conn_seqadj
 #define NF_CT_EXT_ACCT_TYPE struct nf_conn_counter
 #define NF_CT_EXT_ECACHE_TYPE struct nf_conntrack_ecache
 #define NF_CT_EXT_ZONE_TYPE struct nf_conntrack_zone
diff --git a/include/net/netfilter/nf_conntrack_seqadj.h b/include/net/netfilter/nf_conntrack_seqadj.h
new file mode 100644
index 000000000000..30bfbbed9f47
--- /dev/null
+++ b/include/net/netfilter/nf_conntrack_seqadj.h
@@ -0,0 +1,49 @@
+#ifndef _NF_CONNTRACK_SEQADJ_H
+#define _NF_CONNTRACK_SEQADJ_H
+
+#include <net/netfilter/nf_conntrack_extend.h>
+
+/**
+ * struct nf_ct_seqadj - sequence number adjustment information
+ *
+ * @correction_pos: position of the last TCP sequence number modification
+ * @offset_before: sequence number offset before last modification
+ * @offset_after: sequence number offset after last modification
+ */
+struct nf_ct_seqadj {
+	u32		correction_pos;
+	s32		offset_before;
+	s32		offset_after;
+};
+
+struct nf_conn_seqadj {
+	struct nf_ct_seqadj	seq[IP_CT_DIR_MAX];
+};
+
+static inline struct nf_conn_seqadj *nfct_seqadj(const struct nf_conn *ct)
+{
+	return nf_ct_ext_find(ct, NF_CT_EXT_SEQADJ);
+}
+
+static inline struct nf_conn_seqadj *nfct_seqadj_ext_add(struct nf_conn *ct)
+{
+	return nf_ct_ext_add(ct, NF_CT_EXT_SEQADJ, GFP_ATOMIC);
+}
+
+extern int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+			    __be32 seq, s32 off);
+extern void nf_ct_tcp_seqadj_set(struct sk_buff *skb,
+				 struct nf_conn *ct,
+				 enum ip_conntrack_info ctinfo,
+				 s32 off);
+
+extern int nf_ct_seq_adjust(struct sk_buff *skb,
+			    struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+			    unsigned int protoff);
+extern s32 nf_ct_seq_offset(const struct nf_conn *ct, enum ip_conntrack_dir,
+			    u32 seq);
+
+extern int nf_conntrack_seqadj_init(void);
+extern void nf_conntrack_seqadj_fini(void);
+
+#endif /* _NF_CONNTRACK_SEQADJ_H */
diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h
index e2441413675c..59a192420053 100644
--- a/include/net/netfilter/nf_nat.h
+++ b/include/net/netfilter/nf_nat.h
@@ -13,15 +13,6 @@ enum nf_nat_manip_type {
 #define HOOK2MANIP(hooknum) ((hooknum) != NF_INET_POST_ROUTING && \
 			     (hooknum) != NF_INET_LOCAL_IN)
 
-/* NAT sequence number modifications */
-struct nf_nat_seq {
-	/* position of the last TCP sequence number modification (if any) */
-	u_int32_t correction_pos;
-
-	/* sequence number offset before and after last modification */
-	int32_t offset_before, offset_after;
-};
-
 #include <linux/list.h>
 #include <linux/netfilter/nf_conntrack_pptp.h>
 #include <net/netfilter/nf_conntrack_extend.h>
@@ -39,7 +30,6 @@ struct nf_conn;
 /* The structure embedded in the conntrack structure. */
 struct nf_conn_nat {
 	struct hlist_node bysource;
-	struct nf_nat_seq seq[IP_CT_DIR_MAX];
 	struct nf_conn *ct;
 	union nf_conntrack_nat_help help;
 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
diff --git a/include/net/netfilter/nf_nat_helper.h b/include/net/netfilter/nf_nat_helper.h
index 194c34794923..404324d1d0c4 100644
--- a/include/net/netfilter/nf_nat_helper.h
+++ b/include/net/netfilter/nf_nat_helper.h
@@ -39,28 +39,9 @@ extern int nf_nat_mangle_udp_packet(struct sk_buff *skb,
 				    const char *rep_buffer,
 				    unsigned int rep_len);
 
-extern void nf_nat_set_seq_adjust(struct nf_conn *ct,
-				  enum ip_conntrack_info ctinfo,
-				  __be32 seq, s32 off);
-extern int nf_nat_seq_adjust(struct sk_buff *skb,
-			     struct nf_conn *ct,
-			     enum ip_conntrack_info ctinfo,
-			     unsigned int protoff);
-extern int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb,
-				     struct nf_conn *ct,
-				     enum ip_conntrack_info ctinfo,
-				     unsigned int protoff);
-
 /* Setup NAT on this expected conntrack so it follows master, but goes
  * to port ct->master->saved_proto. */
 extern void nf_nat_follow_master(struct nf_conn *ct,
 				 struct nf_conntrack_expect *this);
 
-extern s32 nf_nat_get_offset(const struct nf_conn *ct,
-			     enum ip_conntrack_dir dir,
-			     u32 seq);
-
-extern void nf_nat_tcp_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
-				  u32 dir, s32 off);
-
 #endif
diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h
index d69483fb3825..8dd803818ebe 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
@@ -99,7 +99,8 @@ enum ip_conntrack_events {
 	IPCT_PROTOINFO,		/* protocol information has changed */
 	IPCT_HELPER,		/* new helper has been set */
 	IPCT_MARK,		/* new mark has been set */
-	IPCT_NATSEQADJ,		/* NAT is doing sequence adjustment */
+	IPCT_SEQADJ,		/* sequence adjustment has changed */
+	IPCT_NATSEQADJ = IPCT_SEQADJ,
 	IPCT_SECMARK,		/* new security mark has been set */
 	IPCT_LABEL,		/* new connlabel has been set */
 };
diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
index 08fabc6c93f3..acad6c52a652 100644
--- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
@@ -42,8 +42,10 @@ enum ctattr_type {
 	CTA_ID,
 	CTA_NAT_DST,
 	CTA_TUPLE_MASTER,
-	CTA_NAT_SEQ_ADJ_ORIG,
-	CTA_NAT_SEQ_ADJ_REPLY,
+	CTA_SEQ_ADJ_ORIG,
+	CTA_NAT_SEQ_ADJ_ORIG	= CTA_SEQ_ADJ_ORIG,
+	CTA_SEQ_ADJ_REPLY,
+	CTA_NAT_SEQ_ADJ_REPLY	= CTA_SEQ_ADJ_REPLY,
 	CTA_SECMARK,		/* obsolete */
 	CTA_ZONE,
 	CTA_SECCTX,
@@ -165,6 +167,15 @@ enum ctattr_protonat {
 };
 #define CTA_PROTONAT_MAX (__CTA_PROTONAT_MAX - 1)
 
+enum ctattr_seqadj {
+	CTA_SEQADJ_UNSPEC,
+	CTA_SEQADJ_CORRECTION_POS,
+	CTA_SEQADJ_OFFSET_BEFORE,
+	CTA_SEQADJ_OFFSET_AFTER,
+	__CTA_SEQADJ_MAX
+};
+#define CTA_SEQADJ_MAX (__CTA_SEQADJ_MAX - 1)
+
 enum ctattr_natseq {
 	CTA_NAT_SEQ_UNSPEC,
 	CTA_NAT_SEQ_CORRECTION_POS,
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 0a2e0e3e95ba..86f5b34a4ed1 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -25,6 +25,7 @@
 #include <net/netfilter/nf_conntrack_l3proto.h>
 #include <net/netfilter/nf_conntrack_zones.h>
 #include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
 #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
 #include <net/netfilter/nf_nat_helper.h>
 #include <net/netfilter/ipv4/nf_defrag_ipv4.h>
@@ -136,11 +137,7 @@ static unsigned int ipv4_confirm(unsigned int hooknum,
 	/* adjust seqs for loopback traffic only in outgoing direction */
 	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
 	    !nf_is_loopback_packet(skb)) {
-		typeof(nf_nat_seq_adjust_hook) seq_adjust;
-
-		seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook);
-		if (!seq_adjust ||
-		    !seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
+		if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
 			NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
 			return NF_DROP;
 		}
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index c9b6a6e6a1e8..d6e4dd8b58df 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -28,6 +28,7 @@
 #include <net/netfilter/nf_conntrack_l3proto.h>
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
 #include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
 #include <net/netfilter/nf_nat_helper.h>
 #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
@@ -158,11 +159,7 @@ static unsigned int ipv6_confirm(unsigned int hooknum,
 	/* adjust seqs for loopback traffic only in outgoing direction */
 	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
 	    !nf_is_loopback_packet(skb)) {
-		typeof(nf_nat_seq_adjust_hook) seq_adjust;
-
-		seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook);
-		if (!seq_adjust ||
-		    !seq_adjust(skb, ct, ctinfo, protoff)) {
+		if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
 			NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
 			return NF_DROP;
 		}
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index ebfa7dc747cd..89a9c1658f5e 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -1,6 +1,6 @@
 netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o
 
-nf_conntrack-y	:= nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o
+nf_conntrack-y	:= nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o
 nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o
 nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o
 nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index da6f1787a102..00a7a94d4132 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -39,6 +39,7 @@
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_expect.h>
 #include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_conntrack_extend.h>
 #include <net/netfilter/nf_conntrack_acct.h>
@@ -1326,6 +1327,7 @@ void nf_conntrack_cleanup_end(void)
 	nf_ct_extend_unregister(&nf_ct_zone_extend);
 #endif
 	nf_conntrack_proto_fini();
+	nf_conntrack_seqadj_fini();
 	nf_conntrack_labels_fini();
 	nf_conntrack_helper_fini();
 	nf_conntrack_timeout_fini();
@@ -1531,6 +1533,10 @@ int nf_conntrack_init_start(void)
 	if (ret < 0)
 		goto err_labels;
 
+	ret = nf_conntrack_seqadj_init();
+	if (ret < 0)
+		goto err_seqadj;
+
 #ifdef CONFIG_NF_CONNTRACK_ZONES
 	ret = nf_ct_extend_register(&nf_ct_zone_extend);
 	if (ret < 0)
@@ -1555,6 +1561,8 @@ err_proto:
 	nf_ct_extend_unregister(&nf_ct_zone_extend);
 err_extend:
 #endif
+	nf_conntrack_seqadj_fini();
+err_seqadj:
 	nf_conntrack_labels_fini();
 err_labels:
 	nf_conntrack_helper_fini();
@@ -1577,9 +1585,6 @@ void nf_conntrack_init_end(void)
 	/* For use by REJECT target */
 	RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach);
 	RCU_INIT_POINTER(nf_ct_destroy, destroy_conntrack);
-
-	/* Howto get NAT offsets */
-	RCU_INIT_POINTER(nf_ct_nat_offset, NULL);
 }
 
 /*
@@ -1666,8 +1671,3 @@ err_slabname:
 err_stat:
 	return ret;
 }
-
-s32 (*nf_ct_nat_offset)(const struct nf_conn *ct,
-			enum ip_conntrack_dir dir,
-			u32 seq);
-EXPORT_SYMBOL_GPL(nf_ct_nat_offset);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index fa61fea63234..7c55745ececf 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -37,6 +37,7 @@
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_conntrack_expect.h>
 #include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
 #include <net/netfilter/nf_conntrack_l3proto.h>
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_tuple.h>
@@ -381,9 +382,8 @@ nla_put_failure:
 	return -1;
 }
 
-#ifdef CONFIG_NF_NAT_NEEDED
 static int
-dump_nat_seq_adj(struct sk_buff *skb, const struct nf_nat_seq *natseq, int type)
+dump_ct_seq_adj(struct sk_buff *skb, const struct nf_ct_seqadj *seq, int type)
 {
 	struct nlattr *nest_parms;
 
@@ -391,12 +391,12 @@ dump_nat_seq_adj(struct sk_buff *skb, const struct nf_nat_seq *natseq, int type)
 	if (!nest_parms)
 		goto nla_put_failure;
 
-	if (nla_put_be32(skb, CTA_NAT_SEQ_CORRECTION_POS,
-			 htonl(natseq->correction_pos)) ||
-	    nla_put_be32(skb, CTA_NAT_SEQ_OFFSET_BEFORE,
-			 htonl(natseq->offset_before)) ||
-	    nla_put_be32(skb, CTA_NAT_SEQ_OFFSET_AFTER,
-			 htonl(natseq->offset_after)))
+	if (nla_put_be32(skb, CTA_SEQADJ_CORRECTION_POS,
+			 htonl(seq->correction_pos)) ||
+	    nla_put_be32(skb, CTA_SEQADJ_OFFSET_BEFORE,
+			 htonl(seq->offset_before)) ||
+	    nla_put_be32(skb, CTA_SEQADJ_OFFSET_AFTER,
+			 htonl(seq->offset_after)))
 		goto nla_put_failure;
 
 	nla_nest_end(skb, nest_parms);
@@ -408,27 +408,24 @@ nla_put_failure:
 }
 
 static inline int
-ctnetlink_dump_nat_seq_adj(struct sk_buff *skb, const struct nf_conn *ct)
+ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, const struct nf_conn *ct)
 {
-	struct nf_nat_seq *natseq;
-	struct nf_conn_nat *nat = nfct_nat(ct);
+	struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
+	struct nf_ct_seqadj *seq;
 
-	if (!(ct->status & IPS_SEQ_ADJUST) || !nat)
+	if (!(ct->status & IPS_SEQ_ADJUST) || !seqadj)
 		return 0;
 
-	natseq = &nat->seq[IP_CT_DIR_ORIGINAL];
-	if (dump_nat_seq_adj(skb, natseq, CTA_NAT_SEQ_ADJ_ORIG) == -1)
+	seq = &seqadj->seq[IP_CT_DIR_ORIGINAL];
+	if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_ORIG) == -1)
 		return -1;
 
-	natseq = &nat->seq[IP_CT_DIR_REPLY];
-	if (dump_nat_seq_adj(skb, natseq, CTA_NAT_SEQ_ADJ_REPLY) == -1)
+	seq = &seqadj->seq[IP_CT_DIR_REPLY];
+	if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_REPLY) == -1)
 		return -1;
 
 	return 0;
 }
-#else
-#define ctnetlink_dump_nat_seq_adj(a, b) (0)
-#endif
 
 static inline int
 ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct)
@@ -502,7 +499,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
 	    ctnetlink_dump_id(skb, ct) < 0 ||
 	    ctnetlink_dump_use(skb, ct) < 0 ||
 	    ctnetlink_dump_master(skb, ct) < 0 ||
-	    ctnetlink_dump_nat_seq_adj(skb, ct) < 0)
+	    ctnetlink_dump_ct_seq_adj(skb, ct) < 0)
 		goto nla_put_failure;
 
 	nlmsg_end(skb, nlh);
@@ -707,8 +704,8 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
 		    ctnetlink_dump_master(skb, ct) < 0)
 			goto nla_put_failure;
 
-		if (events & (1 << IPCT_NATSEQADJ) &&
-		    ctnetlink_dump_nat_seq_adj(skb, ct) < 0)
+		if (events & (1 << IPCT_SEQADJ) &&
+		    ctnetlink_dump_ct_seq_adj(skb, ct) < 0)
 			goto nla_put_failure;
 	}
 
@@ -1439,66 +1436,65 @@ ctnetlink_change_protoinfo(struct nf_conn *ct, const struct nlattr * const cda[]
 	return err;
 }
 
-#ifdef CONFIG_NF_NAT_NEEDED
-static const struct nla_policy nat_seq_policy[CTA_NAT_SEQ_MAX+1] = {
-	[CTA_NAT_SEQ_CORRECTION_POS]	= { .type = NLA_U32 },
-	[CTA_NAT_SEQ_OFFSET_BEFORE]	= { .type = NLA_U32 },
-	[CTA_NAT_SEQ_OFFSET_AFTER]	= { .type = NLA_U32 },
+static const struct nla_policy seqadj_policy[CTA_SEQADJ_MAX+1] = {
+	[CTA_SEQADJ_CORRECTION_POS]	= { .type = NLA_U32 },
+	[CTA_SEQADJ_OFFSET_BEFORE]	= { .type = NLA_U32 },
+	[CTA_SEQADJ_OFFSET_AFTER]	= { .type = NLA_U32 },
 };
 
 static inline int
-change_nat_seq_adj(struct nf_nat_seq *natseq, const struct nlattr * const attr)
+change_seq_adj(struct nf_ct_seqadj *seq, const struct nlattr * const attr)
 {
 	int err;
-	struct nlattr *cda[CTA_NAT_SEQ_MAX+1];
+	struct nlattr *cda[CTA_SEQADJ_MAX+1];
 
-	err = nla_parse_nested(cda, CTA_NAT_SEQ_MAX, attr, nat_seq_policy);
+	err = nla_parse_nested(cda, CTA_SEQADJ_MAX, attr, seqadj_policy);
 	if (err < 0)
 		return err;
 
-	if (!cda[CTA_NAT_SEQ_CORRECTION_POS])
+	if (!cda[CTA_SEQADJ_CORRECTION_POS])
 		return -EINVAL;
 
-	natseq->correction_pos =
-		ntohl(nla_get_be32(cda[CTA_NAT_SEQ_CORRECTION_POS]));
+	seq->correction_pos =
+		ntohl(nla_get_be32(cda[CTA_SEQADJ_CORRECTION_POS]));
 
-	if (!cda[CTA_NAT_SEQ_OFFSET_BEFORE])
+	if (!cda[CTA_SEQADJ_OFFSET_BEFORE])
 		return -EINVAL;
 
-	natseq->offset_before =
-		ntohl(nla_get_be32(cda[CTA_NAT_SEQ_OFFSET_BEFORE]));
+	seq->offset_before =
+		ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_BEFORE]));
 
-	if (!cda[CTA_NAT_SEQ_OFFSET_AFTER])
+	if (!cda[CTA_SEQADJ_OFFSET_AFTER])
 		return -EINVAL;
 
-	natseq->offset_after =
-		ntohl(nla_get_be32(cda[CTA_NAT_SEQ_OFFSET_AFTER]));
+	seq->offset_after =
+		ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_AFTER]));
 
 	return 0;
 }
 
 static int
-ctnetlink_change_nat_seq_adj(struct nf_conn *ct,
-			     const struct nlattr * const cda[])
+ctnetlink_change_seq_adj(struct nf_conn *ct,
+			 const struct nlattr * const cda[])
 {
+	struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
 	int ret = 0;
-	struct nf_conn_nat *nat = nfct_nat(ct);
 
-	if (!nat)
+	if (!seqadj)
 		return 0;
 
-	if (cda[CTA_NAT_SEQ_ADJ_ORIG]) {
-		ret = change_nat_seq_adj(&nat->seq[IP_CT_DIR_ORIGINAL],
-					 cda[CTA_NAT_SEQ_ADJ_ORIG]);
+	if (cda[CTA_SEQ_ADJ_ORIG]) {
+		ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_ORIGINAL],
+				     cda[CTA_SEQ_ADJ_ORIG]);
 		if (ret < 0)
 			return ret;
 
 		ct->status |= IPS_SEQ_ADJUST;
 	}
 
-	if (cda[CTA_NAT_SEQ_ADJ_REPLY]) {
-		ret = change_nat_seq_adj(&nat->seq[IP_CT_DIR_REPLY],
-					 cda[CTA_NAT_SEQ_ADJ_REPLY]);
+	if (cda[CTA_SEQ_ADJ_REPLY]) {
+		ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_REPLY],
+				     cda[CTA_SEQ_ADJ_REPLY]);
 		if (ret < 0)
 			return ret;
 
@@ -1507,7 +1503,6 @@ ctnetlink_change_nat_seq_adj(struct nf_conn *ct,
 
 	return 0;
 }
-#endif
 
 static int
 ctnetlink_attach_labels(struct nf_conn *ct, const struct nlattr * const cda[])
@@ -1573,13 +1568,12 @@ ctnetlink_change_conntrack(struct nf_conn *ct,
 		ct->mark = ntohl(nla_get_be32(cda[CTA_MARK]));
 #endif
 
-#ifdef CONFIG_NF_NAT_NEEDED
-	if (cda[CTA_NAT_SEQ_ADJ_ORIG] || cda[CTA_NAT_SEQ_ADJ_REPLY]) {
-		err = ctnetlink_change_nat_seq_adj(ct, cda);
+	if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) {
+		err = ctnetlink_change_seq_adj(ct, cda);
 		if (err < 0)
 			return err;
 	}
-#endif
+
 	if (cda[CTA_LABELS]) {
 		err = ctnetlink_attach_labels(ct, cda);
 		if (err < 0)
@@ -1684,13 +1678,11 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,
 			goto err2;
 	}
 
-#ifdef CONFIG_NF_NAT_NEEDED
-	if (cda[CTA_NAT_SEQ_ADJ_ORIG] || cda[CTA_NAT_SEQ_ADJ_REPLY]) {
-		err = ctnetlink_change_nat_seq_adj(ct, cda);
+	if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) {
+		err = ctnetlink_change_seq_adj(ct, cda);
 		if (err < 0)
 			goto err2;
 	}
-#endif
 
 	memset(&ct->proto, 0, sizeof(ct->proto));
 	if (cda[CTA_PROTOINFO]) {
@@ -1804,7 +1796,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
 						      (1 << IPCT_ASSURED) |
 						      (1 << IPCT_HELPER) |
 						      (1 << IPCT_PROTOINFO) |
-						      (1 << IPCT_NATSEQADJ) |
+						      (1 << IPCT_SEQADJ) |
 						      (1 << IPCT_MARK) | events,
 						      ct, NETLINK_CB(skb).portid,
 						      nlmsg_report(nlh));
@@ -1827,7 +1819,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
 						      (1 << IPCT_HELPER) |
 						      (1 << IPCT_LABEL) |
 						      (1 << IPCT_PROTOINFO) |
-						      (1 << IPCT_NATSEQADJ) |
+						      (1 << IPCT_SEQADJ) |
 						      (1 << IPCT_MARK),
 						      ct, NETLINK_CB(skb).portid,
 						      nlmsg_report(nlh));
@@ -2082,7 +2074,7 @@ ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct)
 		goto nla_put_failure;
 
 	if ((ct->status & IPS_SEQ_ADJUST) &&
-	    ctnetlink_dump_nat_seq_adj(skb, ct) < 0)
+	    ctnetlink_dump_ct_seq_adj(skb, ct) < 0)
 		goto nla_put_failure;
 
 #ifdef CONFIG_NF_CONNTRACK_MARK
@@ -2211,6 +2203,7 @@ static struct nfq_ct_hook ctnetlink_nfqueue_hook = {
 	.build		= ctnetlink_nfqueue_build,
 	.parse		= ctnetlink_nfqueue_parse,
 	.attach_expect	= ctnetlink_nfqueue_attach_expect,
+	.seq_adjust	= nf_ct_tcp_seqadj_set,
 };
 #endif /* CONFIG_NETFILTER_NETLINK_QUEUE_CT */
 
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index d224e001f14f..984a8d1a3359 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -27,6 +27,7 @@
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
 #include <net/netfilter/nf_log.h>
 #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
 #include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
@@ -495,21 +496,6 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
 	}
 }
 
-#ifdef CONFIG_NF_NAT_NEEDED
-static inline s32 nat_offset(const struct nf_conn *ct,
-			     enum ip_conntrack_dir dir,
-			     u32 seq)
-{
-	typeof(nf_ct_nat_offset) get_offset = rcu_dereference(nf_ct_nat_offset);
-
-	return get_offset != NULL ? get_offset(ct, dir, seq) : 0;
-}
-#define NAT_OFFSET(ct, dir, seq) \
-	(nat_offset(ct, dir, seq))
-#else
-#define NAT_OFFSET(ct, dir, seq)	0
-#endif
-
 static bool tcp_in_window(const struct nf_conn *ct,
 			  struct ip_ct_tcp *state,
 			  enum ip_conntrack_dir dir,
@@ -540,7 +526,7 @@ static bool tcp_in_window(const struct nf_conn *ct,
 		tcp_sack(skb, dataoff, tcph, &sack);
 
 	/* Take into account NAT sequence number mangling */
-	receiver_offset = NAT_OFFSET(ct, !dir, ack - 1);
+	receiver_offset = nf_ct_seq_offset(ct, !dir, ack - 1);
 	ack -= receiver_offset;
 	sack -= receiver_offset;
 
diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c
new file mode 100644
index 000000000000..483eb9ce3216
--- /dev/null
+++ b/net/netfilter/nf_conntrack_seqadj.c
@@ -0,0 +1,218 @@
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <net/tcp.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+
+int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+		     __be32 seq, s32 off)
+{
+	struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	struct nf_ct_seqadj *this_way;
+
+	if (off == 0)
+		return 0;
+
+	set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
+
+	spin_lock_bh(&ct->lock);
+	this_way = &seqadj->seq[dir];
+	if (this_way->offset_before == this_way->offset_after ||
+	    before(this_way->correction_pos, seq)) {
+		this_way->correction_pos = seq;
+		this_way->offset_before	 = this_way->offset_after;
+		this_way->offset_after	+= off;
+	}
+	spin_unlock_bh(&ct->lock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_ct_seqadj_set);
+
+void nf_ct_tcp_seqadj_set(struct sk_buff *skb,
+			  struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+			  s32 off)
+{
+	const struct tcphdr *th;
+
+	if (nf_ct_protonum(ct) != IPPROTO_TCP)
+		return;
+
+	th = (struct tcphdr *)(skb_network_header(skb) + ip_hdrlen(skb));
+	nf_ct_seqadj_set(ct, ctinfo, th->seq, off);
+}
+EXPORT_SYMBOL_GPL(nf_ct_tcp_seqadj_set);
+
+/* Adjust one found SACK option including checksum correction */
+static void nf_ct_sack_block_adjust(struct sk_buff *skb,
+				    struct tcphdr *tcph,
+				    unsigned int sackoff,
+				    unsigned int sackend,
+				    struct nf_ct_seqadj *seq)
+{
+	while (sackoff < sackend) {
+		struct tcp_sack_block_wire *sack;
+		__be32 new_start_seq, new_end_seq;
+
+		sack = (void *)skb->data + sackoff;
+		if (after(ntohl(sack->start_seq) - seq->offset_before,
+			  seq->correction_pos))
+			new_start_seq = htonl(ntohl(sack->start_seq) -
+					seq->offset_after);
+		else
+			new_start_seq = htonl(ntohl(sack->start_seq) -
+					seq->offset_before);
+
+		if (after(ntohl(sack->end_seq) - seq->offset_before,
+			  seq->correction_pos))
+			new_end_seq = htonl(ntohl(sack->end_seq) -
+				      seq->offset_after);
+		else
+			new_end_seq = htonl(ntohl(sack->end_seq) -
+				      seq->offset_before);
+
+		pr_debug("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n",
+			 ntohl(sack->start_seq), new_start_seq,
+			 ntohl(sack->end_seq), new_end_seq);
+
+		inet_proto_csum_replace4(&tcph->check, skb,
+					 sack->start_seq, new_start_seq, 0);
+		inet_proto_csum_replace4(&tcph->check, skb,
+					 sack->end_seq, new_end_seq, 0);
+		sack->start_seq = new_start_seq;
+		sack->end_seq = new_end_seq;
+		sackoff += sizeof(*sack);
+	}
+}
+
+/* TCP SACK sequence number adjustment */
+static unsigned int nf_ct_sack_adjust(struct sk_buff *skb,
+				      unsigned int protoff,
+				      struct tcphdr *tcph,
+				      struct nf_conn *ct,
+				      enum ip_conntrack_info ctinfo)
+{
+	unsigned int dir, optoff, optend;
+	struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
+
+	optoff = protoff + sizeof(struct tcphdr);
+	optend = protoff + tcph->doff * 4;
+
+	if (!skb_make_writable(skb, optend))
+		return 0;
+
+	dir = CTINFO2DIR(ctinfo);
+
+	while (optoff < optend) {
+		/* Usually: option, length. */
+		unsigned char *op = skb->data + optoff;
+
+		switch (op[0]) {
+		case TCPOPT_EOL:
+			return 1;
+		case TCPOPT_NOP:
+			optoff++;
+			continue;
+		default:
+			/* no partial options */
+			if (optoff + 1 == optend ||
+			    optoff + op[1] > optend ||
+			    op[1] < 2)
+				return 0;
+			if (op[0] == TCPOPT_SACK &&
+			    op[1] >= 2+TCPOLEN_SACK_PERBLOCK &&
+			    ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0)
+				nf_ct_sack_block_adjust(skb, tcph, optoff + 2,
+							optoff+op[1],
+							&seqadj->seq[!dir]);
+			optoff += op[1];
+		}
+	}
+	return 1;
+}
+
+/* TCP sequence number adjustment.  Returns 1 on success, 0 on failure */
+int nf_ct_seq_adjust(struct sk_buff *skb,
+		     struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+		     unsigned int protoff)
+{
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	struct tcphdr *tcph;
+	__be32 newseq, newack;
+	s32 seqoff, ackoff;
+	struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
+	struct nf_ct_seqadj *this_way, *other_way;
+	int res;
+
+	this_way  = &seqadj->seq[dir];
+	other_way = &seqadj->seq[!dir];
+
+	if (!skb_make_writable(skb, protoff + sizeof(*tcph)))
+		return 0;
+
+	tcph = (void *)skb->data + protoff;
+	spin_lock_bh(&ct->lock);
+	if (after(ntohl(tcph->seq), this_way->correction_pos))
+		seqoff = this_way->offset_after;
+	else
+		seqoff = this_way->offset_before;
+
+	if (after(ntohl(tcph->ack_seq) - other_way->offset_before,
+		  other_way->correction_pos))
+		ackoff = other_way->offset_after;
+	else
+		ackoff = other_way->offset_before;
+
+	newseq = htonl(ntohl(tcph->seq) + seqoff);
+	newack = htonl(ntohl(tcph->ack_seq) - ackoff);
+
+	inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0);
+	inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0);
+
+	pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n",
+		 ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
+		 ntohl(newack));
+
+	tcph->seq = newseq;
+	tcph->ack_seq = newack;
+
+	res = nf_ct_sack_adjust(skb, protoff, tcph, ct, ctinfo);
+	spin_unlock_bh(&ct->lock);
+
+	return res;
+}
+EXPORT_SYMBOL_GPL(nf_ct_seq_adjust);
+
+s32 nf_ct_seq_offset(const struct nf_conn *ct,
+		     enum ip_conntrack_dir dir,
+		     u32 seq)
+{
+	struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
+	struct nf_ct_seqadj *this_way;
+
+	if (!seqadj)
+		return 0;
+
+	this_way = &seqadj->seq[dir];
+	return after(seq, this_way->correction_pos) ?
+		 this_way->offset_after : this_way->offset_before;
+}
+EXPORT_SYMBOL_GPL(nf_ct_seq_offset);
+
+static struct nf_ct_ext_type nf_ct_seqadj_extend __read_mostly = {
+	.len	= sizeof(struct nf_conn_seqadj),
+	.align	= __alignof__(struct nf_conn_seqadj),
+	.id	= NF_CT_EXT_SEQADJ,
+};
+
+int nf_conntrack_seqadj_init(void)
+{
+	return nf_ct_extend_register(&nf_ct_seqadj_extend);
+}
+
+void nf_conntrack_seqadj_fini(void)
+{
+	nf_ct_extend_unregister(&nf_ct_seqadj_extend);
+}
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 6ff808375b5e..6f0f4f7f68a5 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -25,6 +25,7 @@
 #include <net/netfilter/nf_nat_core.h>
 #include <net/netfilter/nf_nat_helper.h>
 #include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
 #include <net/netfilter/nf_conntrack_l3proto.h>
 #include <net/netfilter/nf_conntrack_zones.h>
 #include <linux/netfilter/nf_nat.h>
@@ -402,6 +403,9 @@ nf_nat_setup_info(struct nf_conn *ct,
 			ct->status |= IPS_SRC_NAT;
 		else
 			ct->status |= IPS_DST_NAT;
+
+		if (nfct_help(ct))
+			nfct_seqadj_ext_add(ct);
 	}
 
 	if (maniptype == NF_NAT_MANIP_SRC) {
@@ -764,10 +768,6 @@ static struct nf_ct_helper_expectfn follow_master_nat = {
 	.expectfn	= nf_nat_follow_master,
 };
 
-static struct nfq_ct_nat_hook nfq_ct_nat = {
-	.seq_adjust	= nf_nat_tcp_seq_adjust,
-};
-
 static int __init nf_nat_init(void)
 {
 	int ret;
@@ -787,14 +787,9 @@ static int __init nf_nat_init(void)
 	/* Initialize fake conntrack so that NAT will skip it */
 	nf_ct_untracked_status_or(IPS_NAT_DONE_MASK);
 
-	BUG_ON(nf_nat_seq_adjust_hook != NULL);
-	RCU_INIT_POINTER(nf_nat_seq_adjust_hook, nf_nat_seq_adjust);
 	BUG_ON(nfnetlink_parse_nat_setup_hook != NULL);
 	RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook,
 			   nfnetlink_parse_nat_setup);
-	BUG_ON(nf_ct_nat_offset != NULL);
-	RCU_INIT_POINTER(nf_ct_nat_offset, nf_nat_get_offset);
-	RCU_INIT_POINTER(nfq_ct_nat_hook, &nfq_ct_nat);
 #ifdef CONFIG_XFRM
 	BUG_ON(nf_nat_decode_session_hook != NULL);
 	RCU_INIT_POINTER(nf_nat_decode_session_hook, __nf_nat_decode_session);
@@ -813,10 +808,7 @@ static void __exit nf_nat_cleanup(void)
 	unregister_pernet_subsys(&nf_nat_net_ops);
 	nf_ct_extend_unregister(&nat_extend);
 	nf_ct_helper_expectfn_unregister(&follow_master_nat);
-	RCU_INIT_POINTER(nf_nat_seq_adjust_hook, NULL);
 	RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL);
-	RCU_INIT_POINTER(nf_ct_nat_offset, NULL);
-	RCU_INIT_POINTER(nfq_ct_nat_hook, NULL);
 #ifdef CONFIG_XFRM
 	RCU_INIT_POINTER(nf_nat_decode_session_hook, NULL);
 #endif
diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
index 46b9baa845a6..2840abb5bb99 100644
--- a/net/netfilter/nf_nat_helper.c
+++ b/net/netfilter/nf_nat_helper.c
@@ -20,67 +20,13 @@
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_ecache.h>
 #include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
 #include <net/netfilter/nf_nat.h>
 #include <net/netfilter/nf_nat_l3proto.h>
 #include <net/netfilter/nf_nat_l4proto.h>
 #include <net/netfilter/nf_nat_core.h>
 #include <net/netfilter/nf_nat_helper.h>
 
-#define DUMP_OFFSET(x) \
-	pr_debug("offset_before=%d, offset_after=%d, correction_pos=%u\n", \
-		 x->offset_before, x->offset_after, x->correction_pos);
-
-/* Setup TCP sequence correction given this change at this sequence */
-static inline void
-adjust_tcp_sequence(u32 seq,
-		    int sizediff,
-		    struct nf_conn *ct,
-		    enum ip_conntrack_info ctinfo)
-{
-	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-	struct nf_conn_nat *nat = nfct_nat(ct);
-	struct nf_nat_seq *this_way = &nat->seq[dir];
-
-	pr_debug("adjust_tcp_sequence: seq = %u, sizediff = %d\n",
-		 seq, sizediff);
-
-	pr_debug("adjust_tcp_sequence: Seq_offset before: ");
-	DUMP_OFFSET(this_way);
-
-	spin_lock_bh(&ct->lock);
-
-	/* SYN adjust. If it's uninitialized, or this is after last
-	 * correction, record it: we don't handle more than one
-	 * adjustment in the window, but do deal with common case of a
-	 * retransmit */
-	if (this_way->offset_before == this_way->offset_after ||
-	    before(this_way->correction_pos, seq)) {
-		this_way->correction_pos = seq;
-		this_way->offset_before = this_way->offset_after;
-		this_way->offset_after += sizediff;
-	}
-	spin_unlock_bh(&ct->lock);
-
-	pr_debug("adjust_tcp_sequence: Seq_offset after: ");
-	DUMP_OFFSET(this_way);
-}
-
-/* Get the offset value, for conntrack. Caller must have the conntrack locked */
-s32 nf_nat_get_offset(const struct nf_conn *ct,
-		      enum ip_conntrack_dir dir,
-		      u32 seq)
-{
-	struct nf_conn_nat *nat = nfct_nat(ct);
-	struct nf_nat_seq *this_way;
-
-	if (!nat)
-		return 0;
-
-	this_way = &nat->seq[dir];
-	return after(seq, this_way->correction_pos)
-		 ? this_way->offset_after : this_way->offset_before;
-}
-
 /* Frobs data inside this packet, which is linear. */
 static void mangle_contents(struct sk_buff *skb,
 			    unsigned int dataoff,
@@ -135,30 +81,6 @@ static int enlarge_skb(struct sk_buff *skb, unsigned int extra)
 	return 1;
 }
 
-void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
-			   __be32 seq, s32 off)
-{
-	if (!off)
-		return;
-	set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
-	adjust_tcp_sequence(ntohl(seq), off, ct, ctinfo);
-	nf_conntrack_event_cache(IPCT_NATSEQADJ, ct);
-}
-EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust);
-
-void nf_nat_tcp_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
-			   u32 ctinfo, int off)
-{
-	const struct tcphdr *th;
-
-	if (nf_ct_protonum(ct) != IPPROTO_TCP)
-		return;
-
-	th = (struct tcphdr *)(skb_network_header(skb)+ ip_hdrlen(skb));
-	nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off);
-}
-EXPORT_SYMBOL_GPL(nf_nat_tcp_seq_adjust);
-
 /* Generic function for mangling variable-length address changes inside
  * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
  * command in FTP).
@@ -203,8 +125,8 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
 			     datalen, oldlen);
 
 	if (adjust && rep_len != match_len)
-		nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq,
-				      (int)rep_len - (int)match_len);
+		nf_ct_seqadj_set(ct, ctinfo, tcph->seq,
+				 (int)rep_len - (int)match_len);
 
 	return 1;
 }
@@ -264,150 +186,6 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
 }
 EXPORT_SYMBOL(nf_nat_mangle_udp_packet);
 
-/* Adjust one found SACK option including checksum correction */
-static void
-sack_adjust(struct sk_buff *skb,
-	    struct tcphdr *tcph,
-	    unsigned int sackoff,
-	    unsigned int sackend,
-	    struct nf_nat_seq *natseq)
-{
-	while (sackoff < sackend) {
-		struct tcp_sack_block_wire *sack;
-		__be32 new_start_seq, new_end_seq;
-
-		sack = (void *)skb->data + sackoff;
-		if (after(ntohl(sack->start_seq) - natseq->offset_before,
-			  natseq->correction_pos))
-			new_start_seq = htonl(ntohl(sack->start_seq)
-					- natseq->offset_after);
-		else
-			new_start_seq = htonl(ntohl(sack->start_seq)
-					- natseq->offset_before);
-
-		if (after(ntohl(sack->end_seq) - natseq->offset_before,
-			  natseq->correction_pos))
-			new_end_seq = htonl(ntohl(sack->end_seq)
-				      - natseq->offset_after);
-		else
-			new_end_seq = htonl(ntohl(sack->end_seq)
-				      - natseq->offset_before);
-
-		pr_debug("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n",
-			 ntohl(sack->start_seq), new_start_seq,
-			 ntohl(sack->end_seq), new_end_seq);
-
-		inet_proto_csum_replace4(&tcph->check, skb,
-					 sack->start_seq, new_start_seq, 0);
-		inet_proto_csum_replace4(&tcph->check, skb,
-					 sack->end_seq, new_end_seq, 0);
-		sack->start_seq = new_start_seq;
-		sack->end_seq = new_end_seq;
-		sackoff += sizeof(*sack);
-	}
-}
-
-/* TCP SACK sequence number adjustment */
-static inline unsigned int
-nf_nat_sack_adjust(struct sk_buff *skb,
-		   unsigned int protoff,
-		   struct tcphdr *tcph,
-		   struct nf_conn *ct,
-		   enum ip_conntrack_info ctinfo)
-{
-	unsigned int dir, optoff, optend;
-	struct nf_conn_nat *nat = nfct_nat(ct);
-
-	optoff = protoff + sizeof(struct tcphdr);
-	optend = protoff + tcph->doff * 4;
-
-	if (!skb_make_writable(skb, optend))
-		return 0;
-
-	dir = CTINFO2DIR(ctinfo);
-
-	while (optoff < optend) {
-		/* Usually: option, length. */
-		unsigned char *op = skb->data + optoff;
-
-		switch (op[0]) {
-		case TCPOPT_EOL:
-			return 1;
-		case TCPOPT_NOP:
-			optoff++;
-			continue;
-		default:
-			/* no partial options */
-			if (optoff + 1 == optend ||
-			    optoff + op[1] > optend ||
-			    op[1] < 2)
-				return 0;
-			if (op[0] == TCPOPT_SACK &&
-			    op[1] >= 2+TCPOLEN_SACK_PERBLOCK &&
-			    ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0)
-				sack_adjust(skb, tcph, optoff+2,
-					    optoff+op[1], &nat->seq[!dir]);
-			optoff += op[1];
-		}
-	}
-	return 1;
-}
-
-/* TCP sequence number adjustment.  Returns 1 on success, 0 on failure */
-int
-nf_nat_seq_adjust(struct sk_buff *skb,
-		  struct nf_conn *ct,
-		  enum ip_conntrack_info ctinfo,
-		  unsigned int protoff)
-{
-	struct tcphdr *tcph;
-	int dir;
-	__be32 newseq, newack;
-	s32 seqoff, ackoff;
-	struct nf_conn_nat *nat = nfct_nat(ct);
-	struct nf_nat_seq *this_way, *other_way;
-	int res;
-
-	dir = CTINFO2DIR(ctinfo);
-
-	this_way = &nat->seq[dir];
-	other_way = &nat->seq[!dir];
-
-	if (!skb_make_writable(skb, protoff + sizeof(*tcph)))
-		return 0;
-
-	tcph = (void *)skb->data + protoff;
-	spin_lock_bh(&ct->lock);
-	if (after(ntohl(tcph->seq), this_way->correction_pos))
-		seqoff = this_way->offset_after;
-	else
-		seqoff = this_way->offset_before;
-
-	if (after(ntohl(tcph->ack_seq) - other_way->offset_before,
-		  other_way->correction_pos))
-		ackoff = other_way->offset_after;
-	else
-		ackoff = other_way->offset_before;
-
-	newseq = htonl(ntohl(tcph->seq) + seqoff);
-	newack = htonl(ntohl(tcph->ack_seq) - ackoff);
-
-	inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0);
-	inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0);
-
-	pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n",
-		 ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
-		 ntohl(newack));
-
-	tcph->seq = newseq;
-	tcph->ack_seq = newack;
-
-	res = nf_nat_sack_adjust(skb, protoff, tcph, ct, ctinfo);
-	spin_unlock_bh(&ct->lock);
-
-	return res;
-}
-
 /* Setup NAT on this expected conntrack so it follows master. */
 /* If we fail to get a free NAT slot, we'll get dropped on confirm */
 void nf_nat_follow_master(struct nf_conn *ct,
diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c
index dac11f73868e..f9790405b7ff 100644
--- a/net/netfilter/nf_nat_sip.c
+++ b/net/netfilter/nf_nat_sip.c
@@ -20,6 +20,7 @@
 #include <net/netfilter/nf_nat_helper.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
 #include <linux/netfilter/nf_conntrack_sip.h>
 
 MODULE_LICENSE("GPL");
@@ -308,7 +309,7 @@ static void nf_nat_sip_seq_adjust(struct sk_buff *skb, unsigned int protoff,
 		return;
 
 	th = (struct tcphdr *)(skb->data + protoff);
-	nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off);
+	nf_ct_seqadj_set(ct, ctinfo, th->seq, off);
 }
 
 /* Handles expected signalling connections and media streams */
diff --git a/net/netfilter/nfnetlink_queue_ct.c b/net/netfilter/nfnetlink_queue_ct.c
index be893039966d..96cac50e0d12 100644
--- a/net/netfilter/nfnetlink_queue_ct.c
+++ b/net/netfilter/nfnetlink_queue_ct.c
@@ -87,14 +87,14 @@ nla_put_failure:
 void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
 			 enum ip_conntrack_info ctinfo, int diff)
 {
-	struct nfq_ct_nat_hook *nfq_nat_ct;
+	struct nfq_ct_hook *nfq_ct;
 
-	nfq_nat_ct = rcu_dereference(nfq_ct_nat_hook);
-	if (nfq_nat_ct == NULL)
+	nfq_ct = rcu_dereference(nfq_ct_hook);
+	if (nfq_ct == NULL)
 		return;
 
 	if ((ct->status & IPS_NAT_MASK) && diff)
-		nfq_nat_ct->seq_adjust(skb, ct, ctinfo, diff);
+		nfq_ct->seq_adjust(skb, ct, ctinfo, diff);
 }
 
 int nfqnl_attach_expect(struct nf_conn *ct, const struct nlattr *attr,
-- 
cgit v1.2.3


From 0198230b7705eb2386e53778d944e307eef0cc71 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 27 Aug 2013 08:50:13 +0200
Subject: net: syncookies: export cookie_v4_init_sequence/cookie_v4_check

Extract the local TCP stack independant parts of tcp_v4_init_sequence()
and cookie_v4_check() and export them for use by the upcoming SYNPROXY
target.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Acked-by: David S. Miller <davem@davemloft.net>
Tested-by: Martin Topholm <mph@one.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/tcp.h     |  4 ++++
 net/ipv4/syncookies.c | 29 ++++++++++++++++++-----------
 2 files changed, 22 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 09cb5c11ceea..8e2b63678a83 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -476,9 +476,13 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb);
 
 /* From syncookies.c */
 extern __u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS];
+extern int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
+			     u32 cookie);
 extern struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, 
 				    struct ip_options *opt);
 #ifdef CONFIG_SYN_COOKIES
+extern u32 __cookie_v4_init_sequence(const struct iphdr *iph,
+				     const struct tcphdr *th, u16 *mssp);
 extern __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, 
 				     __u16 *mss);
 #else
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index b05c96e7af8b..14a15c49129d 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -160,26 +160,33 @@ static __u16 const msstab[] = {
  * Generate a syncookie.  mssp points to the mss, which is returned
  * rounded down to the value encoded in the cookie.
  */
-__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
+u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
+			      u16 *mssp)
 {
-	const struct iphdr *iph = ip_hdr(skb);
-	const struct tcphdr *th = tcp_hdr(skb);
 	int mssind;
 	const __u16 mss = *mssp;
 
-	tcp_synq_overflow(sk);
-
 	for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--)
 		if (mss >= msstab[mssind])
 			break;
 	*mssp = msstab[mssind];
 
-	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
-
 	return secure_tcp_syn_cookie(iph->saddr, iph->daddr,
 				     th->source, th->dest, ntohl(th->seq),
 				     jiffies / (HZ * 60), mssind);
 }
+EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
+
+__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	const struct tcphdr *th = tcp_hdr(skb);
+
+	tcp_synq_overflow(sk);
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
+
+	return __cookie_v4_init_sequence(iph, th, mssp);
+}
 
 /*
  * This (misnamed) value is the age of syncookie which is permitted.
@@ -192,10 +199,9 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
  * Check if a ack sequence number is a valid syncookie.
  * Return the decoded mss if it is, or 0 if not.
  */
-static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
+int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
+		      u32 cookie)
 {
-	const struct iphdr *iph = ip_hdr(skb);
-	const struct tcphdr *th = tcp_hdr(skb);
 	__u32 seq = ntohl(th->seq) - 1;
 	__u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr,
 					    th->source, th->dest, seq,
@@ -204,6 +210,7 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
 
 	return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;
 }
+EXPORT_SYMBOL_GPL(__cookie_v4_check);
 
 static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
 					   struct request_sock *req,
@@ -284,7 +291,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 		goto out;
 
 	if (tcp_synq_no_recent_overflow(sk) ||
-	    (mss = cookie_check(skb, cookie)) == 0) {
+	    (mss = __cookie_v4_check(ip_hdr(skb), th, cookie)) == 0) {
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED);
 		goto out;
 	}
-- 
cgit v1.2.3


From 48b1de4c110a7afa4b85862f6c75af817db26fad Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 27 Aug 2013 08:50:14 +0200
Subject: netfilter: add SYNPROXY core/target

Add a SYNPROXY for netfilter. The code is split into two parts, the synproxy
core with common functions and an address family specific target.

The SYNPROXY receives the connection request from the client, responds with
a SYN/ACK containing a SYN cookie and announcing a zero window and checks
whether the final ACK from the client contains a valid cookie.

It then establishes a connection to the original destination and, if
successful, sends a window update to the client with the window size
announced by the server.

Support for timestamps, SACK, window scaling and MSS options can be
statically configured as target parameters if the features of the server
are known. If timestamps are used, the timestamp value sent back to
the client in the SYN/ACK will be different from the real timestamp of
the server. In order to now break PAWS, the timestamps are translated in
the direction server->client.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Tested-by: Martin Topholm <mph@one.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_extend.h   |   6 +-
 include/net/netfilter/nf_conntrack_seqadj.h   |   2 +
 include/net/netfilter/nf_conntrack_synproxy.h |  77 +++++
 include/uapi/linux/netfilter/xt_SYNPROXY.h    |  16 +
 net/ipv4/netfilter/Kconfig                    |  13 +
 net/ipv4/netfilter/Makefile                   |   1 +
 net/ipv4/netfilter/ipt_SYNPROXY.c             | 472 ++++++++++++++++++++++++++
 net/netfilter/Kconfig                         |   3 +
 net/netfilter/Makefile                        |   3 +
 net/netfilter/nf_conntrack_core.c             |   6 +
 net/netfilter/nf_conntrack_proto_tcp.c        |  16 +
 net/netfilter/nf_conntrack_seqadj.c           |  20 ++
 net/netfilter/nf_synproxy_core.c              | 432 +++++++++++++++++++++++
 13 files changed, 1066 insertions(+), 1 deletion(-)
 create mode 100644 include/net/netfilter/nf_conntrack_synproxy.h
 create mode 100644 include/uapi/linux/netfilter/xt_SYNPROXY.h
 create mode 100644 net/ipv4/netfilter/ipt_SYNPROXY.c
 create mode 100644 net/netfilter/nf_synproxy_core.c

(limited to 'net')

diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h
index 2a22bcbfe6e4..ff95434e50ca 100644
--- a/include/net/netfilter/nf_conntrack_extend.h
+++ b/include/net/netfilter/nf_conntrack_extend.h
@@ -9,8 +9,8 @@ enum nf_ct_ext_id {
 	NF_CT_EXT_HELPER,
 #if defined(CONFIG_NF_NAT) || defined(CONFIG_NF_NAT_MODULE)
 	NF_CT_EXT_NAT,
-	NF_CT_EXT_SEQADJ,
 #endif
+	NF_CT_EXT_SEQADJ,
 	NF_CT_EXT_ACCT,
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
 	NF_CT_EXT_ECACHE,
@@ -26,6 +26,9 @@ enum nf_ct_ext_id {
 #endif
 #ifdef CONFIG_NF_CONNTRACK_LABELS
 	NF_CT_EXT_LABELS,
+#endif
+#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
+	NF_CT_EXT_SYNPROXY,
 #endif
 	NF_CT_EXT_NUM,
 };
@@ -39,6 +42,7 @@ enum nf_ct_ext_id {
 #define NF_CT_EXT_TSTAMP_TYPE struct nf_conn_tstamp
 #define NF_CT_EXT_TIMEOUT_TYPE struct nf_conn_timeout
 #define NF_CT_EXT_LABELS_TYPE struct nf_conn_labels
+#define NF_CT_EXT_SYNPROXY_TYPE struct nf_conn_synproxy
 
 /* Extensions: optional stuff which isn't permanently in struct. */
 struct nf_ct_ext {
diff --git a/include/net/netfilter/nf_conntrack_seqadj.h b/include/net/netfilter/nf_conntrack_seqadj.h
index 30bfbbed9f47..f6177a5fe0ca 100644
--- a/include/net/netfilter/nf_conntrack_seqadj.h
+++ b/include/net/netfilter/nf_conntrack_seqadj.h
@@ -30,6 +30,8 @@ static inline struct nf_conn_seqadj *nfct_seqadj_ext_add(struct nf_conn *ct)
 	return nf_ct_ext_add(ct, NF_CT_EXT_SEQADJ, GFP_ATOMIC);
 }
 
+extern int nf_ct_seqadj_init(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+			     s32 off);
 extern int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
 			    __be32 seq, s32 off);
 extern void nf_ct_tcp_seqadj_set(struct sk_buff *skb,
diff --git a/include/net/netfilter/nf_conntrack_synproxy.h b/include/net/netfilter/nf_conntrack_synproxy.h
new file mode 100644
index 000000000000..806f54a290d6
--- /dev/null
+++ b/include/net/netfilter/nf_conntrack_synproxy.h
@@ -0,0 +1,77 @@
+#ifndef _NF_CONNTRACK_SYNPROXY_H
+#define _NF_CONNTRACK_SYNPROXY_H
+
+#include <net/netns/generic.h>
+
+struct nf_conn_synproxy {
+	u32	isn;
+	u32	its;
+	u32	tsoff;
+};
+
+static inline struct nf_conn_synproxy *nfct_synproxy(const struct nf_conn *ct)
+{
+#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
+	return nf_ct_ext_find(ct, NF_CT_EXT_SYNPROXY);
+#else
+	return NULL;
+#endif
+}
+
+static inline struct nf_conn_synproxy *nfct_synproxy_ext_add(struct nf_conn *ct)
+{
+#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
+	return nf_ct_ext_add(ct, NF_CT_EXT_SYNPROXY, GFP_ATOMIC);
+#else
+	return NULL;
+#endif
+}
+
+struct synproxy_stats {
+	unsigned int			syn_received;
+	unsigned int			cookie_invalid;
+	unsigned int			cookie_valid;
+	unsigned int			cookie_retrans;
+	unsigned int			conn_reopened;
+};
+
+struct synproxy_net {
+	struct nf_conn			*tmpl;
+	struct synproxy_stats __percpu	*stats;
+};
+
+extern int synproxy_net_id;
+static inline struct synproxy_net *synproxy_pernet(struct net *net)
+{
+	return net_generic(net, synproxy_net_id);
+}
+
+struct synproxy_options {
+	u8				options;
+	u8				wscale;
+	u16				mss;
+	u32				tsval;
+	u32				tsecr;
+};
+
+struct tcphdr;
+struct xt_synproxy_info;
+extern void synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
+				   const struct tcphdr *th,
+				   struct synproxy_options *opts);
+extern unsigned int synproxy_options_size(const struct synproxy_options *opts);
+extern void synproxy_build_options(struct tcphdr *th,
+				   const struct synproxy_options *opts);
+
+extern void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info,
+					   struct synproxy_options *opts);
+extern void synproxy_check_timestamp_cookie(struct synproxy_options *opts);
+
+extern unsigned int synproxy_tstamp_adjust(struct sk_buff *skb,
+					   unsigned int protoff,
+					   struct tcphdr *th,
+					   struct nf_conn *ct,
+					   enum ip_conntrack_info ctinfo,
+					   const struct nf_conn_synproxy *synproxy);
+
+#endif /* _NF_CONNTRACK_SYNPROXY_H */
diff --git a/include/uapi/linux/netfilter/xt_SYNPROXY.h b/include/uapi/linux/netfilter/xt_SYNPROXY.h
new file mode 100644
index 000000000000..2d59fbaa93c6
--- /dev/null
+++ b/include/uapi/linux/netfilter/xt_SYNPROXY.h
@@ -0,0 +1,16 @@
+#ifndef _XT_SYNPROXY_H
+#define _XT_SYNPROXY_H
+
+#define XT_SYNPROXY_OPT_MSS		0x01
+#define XT_SYNPROXY_OPT_WSCALE		0x02
+#define XT_SYNPROXY_OPT_SACK_PERM	0x04
+#define XT_SYNPROXY_OPT_TIMESTAMP	0x08
+#define XT_SYNPROXY_OPT_ECN		0x10
+
+struct xt_synproxy_info {
+	__u8	options;
+	__u8	wscale;
+	__u16	mss;
+};
+
+#endif /* _XT_SYNPROXY_H */
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 4e9028017428..1657e39b291f 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -110,6 +110,19 @@ config IP_NF_TARGET_REJECT
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config IP_NF_TARGET_SYNPROXY
+	tristate "SYNPROXY target support"
+	depends on NF_CONNTRACK && NETFILTER_ADVANCED
+	select NETFILTER_SYNPROXY
+	select SYN_COOKIES
+	help
+	  The SYNPROXY target allows you to intercept TCP connections and
+	  establish them using syncookies before they are passed on to the
+	  server. This allows to avoid conntrack and server resource usage
+	  during SYN-flood attacks.
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
 config IP_NF_TARGET_ULOG
 	tristate "ULOG target support (obsolete)"
 	default m if NETFILTER_ADVANCED=n
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 007b128eecc9..3622b248b6dd 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -46,6 +46,7 @@ obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
 obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
 obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
 obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
+obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o
 obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
 
 # generic ARP tables
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
new file mode 100644
index 000000000000..94371db6aecc
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -0,0 +1,472 @@
+/*
+ * Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_SYNPROXY.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
+
+static struct iphdr *
+synproxy_build_ip(struct sk_buff *skb, u32 saddr, u32 daddr)
+{
+	struct iphdr *iph;
+
+	skb_reset_network_header(skb);
+	iph = (struct iphdr *)skb_put(skb, sizeof(*iph));
+	iph->version	= 4;
+	iph->ihl	= sizeof(*iph) / 4;
+	iph->tos	= 0;
+	iph->id		= 0;
+	iph->frag_off	= htons(IP_DF);
+	iph->ttl	= sysctl_ip_default_ttl;
+	iph->protocol	= IPPROTO_TCP;
+	iph->check	= 0;
+	iph->saddr	= saddr;
+	iph->daddr	= daddr;
+
+	return iph;
+}
+
+static void
+synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb,
+		  struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
+		  struct iphdr *niph, struct tcphdr *nth,
+		  unsigned int tcp_hdr_size)
+{
+	nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0);
+	nskb->ip_summed   = CHECKSUM_PARTIAL;
+	nskb->csum_start  = (unsigned char *)nth - nskb->head;
+	nskb->csum_offset = offsetof(struct tcphdr, check);
+
+	skb_dst_set_noref(nskb, skb_dst(skb));
+	nskb->protocol = htons(ETH_P_IP);
+	if (ip_route_me_harder(nskb, RTN_UNSPEC))
+		goto free_nskb;
+
+	if (nfct) {
+		nskb->nfct = nfct;
+		nskb->nfctinfo = ctinfo;
+		nf_conntrack_get(nfct);
+	}
+
+	ip_local_out(nskb);
+	return;
+
+free_nskb:
+	kfree_skb(nskb);
+}
+
+static void
+synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th,
+			    const struct synproxy_options *opts)
+{
+	struct sk_buff *nskb;
+	struct iphdr *iph, *niph;
+	struct tcphdr *nth;
+	unsigned int tcp_hdr_size;
+	u16 mss = opts->mss;
+
+	iph = ip_hdr(skb);
+
+	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+			 GFP_ATOMIC);
+	if (nskb == NULL)
+		return;
+	skb_reserve(nskb, MAX_TCP_HEADER);
+
+	niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr);
+
+	skb_reset_transport_header(nskb);
+	nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+	nth->source	= th->dest;
+	nth->dest	= th->source;
+	nth->seq	= htonl(__cookie_v4_init_sequence(iph, th, &mss));
+	nth->ack_seq	= htonl(ntohl(th->seq) + 1);
+	tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK;
+	if (opts->options & XT_SYNPROXY_OPT_ECN)
+		tcp_flag_word(nth) |= TCP_FLAG_ECE;
+	nth->doff	= tcp_hdr_size / 4;
+	nth->window	= 0;
+	nth->check	= 0;
+	nth->urg_ptr	= 0;
+
+	synproxy_build_options(nth, opts);
+
+	synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
+			  niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_server_syn(const struct synproxy_net *snet,
+			 const struct sk_buff *skb, const struct tcphdr *th,
+			 const struct synproxy_options *opts, u32 recv_seq)
+{
+	struct sk_buff *nskb;
+	struct iphdr *iph, *niph;
+	struct tcphdr *nth;
+	unsigned int tcp_hdr_size;
+
+	iph = ip_hdr(skb);
+
+	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+			 GFP_ATOMIC);
+	if (nskb == NULL)
+		return;
+	skb_reserve(nskb, MAX_TCP_HEADER);
+
+	niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr);
+
+	skb_reset_transport_header(nskb);
+	nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+	nth->source	= th->source;
+	nth->dest	= th->dest;
+	nth->seq	= htonl(recv_seq - 1);
+	/* ack_seq is used to relay our ISN to the synproxy hook to initialize
+	 * sequence number translation once a connection tracking entry exists.
+	 */
+	nth->ack_seq	= htonl(ntohl(th->ack_seq) - 1);
+	tcp_flag_word(nth) = TCP_FLAG_SYN;
+	if (opts->options & XT_SYNPROXY_OPT_ECN)
+		tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR;
+	nth->doff	= tcp_hdr_size / 4;
+	nth->window	= th->window;
+	nth->check	= 0;
+	nth->urg_ptr	= 0;
+
+	synproxy_build_options(nth, opts);
+
+	synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
+			  niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_server_ack(const struct synproxy_net *snet,
+			 const struct ip_ct_tcp *state,
+			 const struct sk_buff *skb, const struct tcphdr *th,
+			 const struct synproxy_options *opts)
+{
+	struct sk_buff *nskb;
+	struct iphdr *iph, *niph;
+	struct tcphdr *nth;
+	unsigned int tcp_hdr_size;
+
+	iph = ip_hdr(skb);
+
+	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+			 GFP_ATOMIC);
+	if (nskb == NULL)
+		return;
+	skb_reserve(nskb, MAX_TCP_HEADER);
+
+	niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr);
+
+	skb_reset_transport_header(nskb);
+	nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+	nth->source	= th->dest;
+	nth->dest	= th->source;
+	nth->seq	= htonl(ntohl(th->ack_seq));
+	nth->ack_seq	= htonl(ntohl(th->seq) + 1);
+	tcp_flag_word(nth) = TCP_FLAG_ACK;
+	nth->doff	= tcp_hdr_size / 4;
+	nth->window	= htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin);
+	nth->check	= 0;
+	nth->urg_ptr	= 0;
+
+	synproxy_build_options(nth, opts);
+
+	synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_client_ack(const struct synproxy_net *snet,
+			 const struct sk_buff *skb, const struct tcphdr *th,
+			 const struct synproxy_options *opts)
+{
+	struct sk_buff *nskb;
+	struct iphdr *iph, *niph;
+	struct tcphdr *nth;
+	unsigned int tcp_hdr_size;
+
+	iph = ip_hdr(skb);
+
+	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+			 GFP_ATOMIC);
+	if (nskb == NULL)
+		return;
+	skb_reserve(nskb, MAX_TCP_HEADER);
+
+	niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr);
+
+	skb_reset_transport_header(nskb);
+	nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+	nth->source	= th->source;
+	nth->dest	= th->dest;
+	nth->seq	= htonl(ntohl(th->seq) + 1);
+	nth->ack_seq	= th->ack_seq;
+	tcp_flag_word(nth) = TCP_FLAG_ACK;
+	nth->doff	= tcp_hdr_size / 4;
+	nth->window	= ntohs(htons(th->window) >> opts->wscale);
+	nth->check	= 0;
+	nth->urg_ptr	= 0;
+
+	synproxy_build_options(nth, opts);
+
+	synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+}
+
+static bool
+synproxy_recv_client_ack(const struct synproxy_net *snet,
+			 const struct sk_buff *skb, const struct tcphdr *th,
+			 struct synproxy_options *opts, u32 recv_seq)
+{
+	int mss;
+
+	mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1);
+	if (mss == 0) {
+		this_cpu_inc(snet->stats->cookie_invalid);
+		return false;
+	}
+
+	this_cpu_inc(snet->stats->cookie_valid);
+	opts->mss = mss;
+
+	if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
+		synproxy_check_timestamp_cookie(opts);
+
+	synproxy_send_server_syn(snet, skb, th, opts, recv_seq);
+	return true;
+}
+
+static unsigned int
+synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_synproxy_info *info = par->targinfo;
+	struct synproxy_net *snet = synproxy_pernet(dev_net(par->in));
+	struct synproxy_options opts = {};
+	struct tcphdr *th, _th;
+
+	if (nf_ip_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP))
+		return NF_DROP;
+
+	th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th);
+	if (th == NULL)
+		return NF_DROP;
+
+	synproxy_parse_options(skb, par->thoff, th, &opts);
+
+	if (th->syn && !th->ack) {
+		/* Initial SYN from client */
+		this_cpu_inc(snet->stats->syn_received);
+
+		if (th->ece && th->cwr)
+			opts.options |= XT_SYNPROXY_OPT_ECN;
+
+		opts.options &= info->options;
+		if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
+			synproxy_init_timestamp_cookie(info, &opts);
+		else
+			opts.options &= ~(XT_SYNPROXY_OPT_WSCALE |
+					  XT_SYNPROXY_OPT_SACK_PERM |
+					  XT_SYNPROXY_OPT_ECN);
+
+		synproxy_send_client_synack(skb, th, &opts);
+	} else if (th->ack && !(th->fin || th->rst))
+		/* ACK from client */
+		synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq));
+
+	return NF_DROP;
+}
+
+static unsigned int ipv4_synproxy_hook(unsigned int hooknum,
+				       struct sk_buff *skb,
+				       const struct net_device *in,
+				       const struct net_device *out,
+				       int (*okfn)(struct sk_buff *))
+{
+	struct synproxy_net *snet = synproxy_pernet(dev_net(in ? : out));
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+	struct nf_conn_synproxy *synproxy;
+	struct synproxy_options opts = {};
+	const struct ip_ct_tcp *state;
+	struct tcphdr *th, _th;
+	unsigned int thoff;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct == NULL)
+		return NF_ACCEPT;
+
+	synproxy = nfct_synproxy(ct);
+	if (synproxy == NULL)
+		return NF_ACCEPT;
+
+	if (nf_is_loopback_packet(skb))
+		return NF_ACCEPT;
+
+	thoff = ip_hdrlen(skb);
+	th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
+	if (th == NULL)
+		return NF_DROP;
+
+	state = &ct->proto.tcp;
+	switch (state->state) {
+	case TCP_CONNTRACK_CLOSE:
+		if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+			nf_ct_seqadj_init(ct, ctinfo, synproxy->isn -
+						      ntohl(th->seq) + 1);
+			break;
+		}
+
+		if (!th->syn || th->ack ||
+		    CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+			break;
+
+		/* Reopened connection - reset the sequence number and timestamp
+		 * adjustments, they will get initialized once the connection is
+		 * reestablished.
+		 */
+		nf_ct_seqadj_init(ct, ctinfo, 0);
+		synproxy->tsoff = 0;
+		this_cpu_inc(snet->stats->conn_reopened);
+
+		/* fall through */
+	case TCP_CONNTRACK_SYN_SENT:
+		synproxy_parse_options(skb, thoff, th, &opts);
+
+		if (!th->syn && th->ack &&
+		    CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
+			/* Keep-Alives are sent with SEG.SEQ = SND.NXT-1,
+			 * therefore we need to add 1 to make the SYN sequence
+			 * number match the one of first SYN.
+			 */
+			if (synproxy_recv_client_ack(snet, skb, th, &opts,
+						     ntohl(th->seq) + 1))
+				this_cpu_inc(snet->stats->cookie_retrans);
+
+			return NF_DROP;
+		}
+
+		synproxy->isn = ntohl(th->ack_seq);
+		if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
+			synproxy->its = opts.tsecr;
+		break;
+	case TCP_CONNTRACK_SYN_RECV:
+		if (!th->syn || !th->ack)
+			break;
+
+		synproxy_parse_options(skb, thoff, th, &opts);
+		if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
+			synproxy->tsoff = opts.tsval - synproxy->its;
+
+		opts.options &= ~(XT_SYNPROXY_OPT_MSS |
+				  XT_SYNPROXY_OPT_WSCALE |
+				  XT_SYNPROXY_OPT_SACK_PERM);
+
+		swap(opts.tsval, opts.tsecr);
+		synproxy_send_server_ack(snet, state, skb, th, &opts);
+
+		nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
+
+		swap(opts.tsval, opts.tsecr);
+		synproxy_send_client_ack(snet, skb, th, &opts);
+
+		consume_skb(skb);
+		return NF_STOLEN;
+	default:
+		break;
+	}
+
+	synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy);
+	return NF_ACCEPT;
+}
+
+static int synproxy_tg4_check(const struct xt_tgchk_param *par)
+{
+	const struct ipt_entry *e = par->entryinfo;
+
+	if (e->ip.proto != IPPROTO_TCP ||
+	    e->ip.invflags & XT_INV_PROTO)
+		return -EINVAL;
+
+	return nf_ct_l3proto_try_module_get(par->family);
+}
+
+static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par)
+{
+	nf_ct_l3proto_module_put(par->family);
+}
+
+static struct xt_target synproxy_tg4_reg __read_mostly = {
+	.name		= "SYNPROXY",
+	.family		= NFPROTO_IPV4,
+	.target		= synproxy_tg4,
+	.targetsize	= sizeof(struct xt_synproxy_info),
+	.checkentry	= synproxy_tg4_check,
+	.destroy	= synproxy_tg4_destroy,
+	.me		= THIS_MODULE,
+};
+
+static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = {
+	{
+		.hook		= ipv4_synproxy_hook,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+	},
+	{
+		.hook		= ipv4_synproxy_hook,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_POST_ROUTING,
+		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+	},
+};
+
+static int __init synproxy_tg4_init(void)
+{
+	int err;
+
+	err = nf_register_hooks(ipv4_synproxy_ops,
+				ARRAY_SIZE(ipv4_synproxy_ops));
+	if (err < 0)
+		goto err1;
+
+	err = xt_register_target(&synproxy_tg4_reg);
+	if (err < 0)
+		goto err2;
+
+	return 0;
+
+err2:
+	nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops));
+err1:
+	return err;
+}
+
+static void __exit synproxy_tg4_exit(void)
+{
+	xt_unregister_target(&synproxy_tg4_reg);
+	nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops));
+}
+
+module_init(synproxy_tg4_init);
+module_exit(synproxy_tg4_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index c45fc1a60e0d..62a171ab204f 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -408,6 +408,9 @@ config NF_NAT_TFTP
 	depends on NF_CONNTRACK && NF_NAT
 	default NF_NAT && NF_CONNTRACK_TFTP
 
+config NETFILTER_SYNPROXY
+	tristate
+
 endif # NF_CONNTRACK
 
 config NETFILTER_XTABLES
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 89a9c1658f5e..c3a0a12907f6 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -61,6 +61,9 @@ obj-$(CONFIG_NF_NAT_IRC) += nf_nat_irc.o
 obj-$(CONFIG_NF_NAT_SIP) += nf_nat_sip.o
 obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o
 
+# SYNPROXY
+obj-$(CONFIG_NETFILTER_SYNPROXY) += nf_synproxy_core.o
+
 # generic X tables 
 obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
 
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 00a7a94d4132..5d892febd64c 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -48,6 +48,7 @@
 #include <net/netfilter/nf_conntrack_timestamp.h>
 #include <net/netfilter/nf_conntrack_timeout.h>
 #include <net/netfilter/nf_conntrack_labels.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
 #include <net/netfilter/nf_nat.h>
 #include <net/netfilter/nf_nat_core.h>
 #include <net/netfilter/nf_nat_helper.h>
@@ -799,6 +800,11 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 	if (IS_ERR(ct))
 		return (struct nf_conntrack_tuple_hash *)ct;
 
+	if (tmpl && nfct_synproxy(tmpl)) {
+		nfct_seqadj_ext_add(ct);
+		nfct_synproxy_ext_add(ct);
+	}
+
 	timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
 	if (timeout_ext)
 		timeouts = NF_CT_TIMEOUT_EXT_DATA(timeout_ext);
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 984a8d1a3359..44d1ea32570a 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -28,6 +28,7 @@
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_ecache.h>
 #include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
 #include <net/netfilter/nf_log.h>
 #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
 #include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
@@ -946,6 +947,21 @@ static int tcp_packet(struct nf_conn *ct,
 				  "state %s ", tcp_conntrack_names[old_state]);
 		return NF_ACCEPT;
 	case TCP_CONNTRACK_MAX:
+		/* Special case for SYN proxy: when the SYN to the server or
+		 * the SYN/ACK from the server is lost, the client may transmit
+		 * a keep-alive packet while in SYN_SENT state. This needs to
+		 * be associated with the original conntrack entry in order to
+		 * generate a new SYN with the correct sequence number.
+		 */
+		if (nfct_synproxy(ct) && old_state == TCP_CONNTRACK_SYN_SENT &&
+		    index == TCP_ACK_SET && dir == IP_CT_DIR_ORIGINAL &&
+		    ct->proto.tcp.last_dir == IP_CT_DIR_ORIGINAL &&
+		    ct->proto.tcp.seen[dir].td_end - 1 == ntohl(th->seq)) {
+			pr_debug("nf_ct_tcp: SYN proxy client keep alive\n");
+			spin_unlock_bh(&ct->lock);
+			return NF_ACCEPT;
+		}
+
 		/* Invalid packet */
 		pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
 			 dir, get_conntrack_index(th), old_state);
diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c
index 483eb9ce3216..5f9bfd060dea 100644
--- a/net/netfilter/nf_conntrack_seqadj.c
+++ b/net/netfilter/nf_conntrack_seqadj.c
@@ -6,6 +6,26 @@
 #include <net/netfilter/nf_conntrack_extend.h>
 #include <net/netfilter/nf_conntrack_seqadj.h>
 
+int nf_ct_seqadj_init(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+		      s32 off)
+{
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	struct nf_conn_seqadj *seqadj;
+	struct nf_ct_seqadj *this_way;
+
+	if (off == 0)
+		return 0;
+
+	set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
+
+	seqadj = nfct_seqadj(ct);
+	this_way = &seqadj->seq[dir];
+	this_way->offset_before	 = off;
+	this_way->offset_after	 = off;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_ct_seqadj_init);
+
 int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
 		     __be32 seq, s32 off)
 {
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
new file mode 100644
index 000000000000..d23dc791aca7
--- /dev/null
+++ b/net/netfilter/nf_synproxy_core.c
@@ -0,0 +1,432 @@
+/*
+ * Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <asm/unaligned.h>
+#include <net/tcp.h>
+#include <net/netns/generic.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_tcpudp.h>
+#include <linux/netfilter/xt_SYNPROXY.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
+
+int synproxy_net_id;
+EXPORT_SYMBOL_GPL(synproxy_net_id);
+
+void
+synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
+		       const struct tcphdr *th, struct synproxy_options *opts)
+{
+	int length = (th->doff * 4) - sizeof(*th);
+	u8 buf[40], *ptr;
+
+	ptr = skb_header_pointer(skb, doff + sizeof(*th), length, buf);
+	BUG_ON(ptr == NULL);
+
+	opts->options = 0;
+	while (length > 0) {
+		int opcode = *ptr++;
+		int opsize;
+
+		switch (opcode) {
+		case TCPOPT_EOL:
+			return;
+		case TCPOPT_NOP:
+			length--;
+			continue;
+		default:
+			opsize = *ptr++;
+			if (opsize < 2)
+				return;
+			if (opsize > length)
+				return;
+
+			switch (opcode) {
+			case TCPOPT_MSS:
+				if (opsize == TCPOLEN_MSS) {
+					opts->mss = get_unaligned_be16(ptr);
+					opts->options |= XT_SYNPROXY_OPT_MSS;
+				}
+				break;
+			case TCPOPT_WINDOW:
+				if (opsize == TCPOLEN_WINDOW) {
+					opts->wscale = *ptr;
+					if (opts->wscale > 14)
+						opts->wscale = 14;
+					opts->options |= XT_SYNPROXY_OPT_WSCALE;
+				}
+				break;
+			case TCPOPT_TIMESTAMP:
+				if (opsize == TCPOLEN_TIMESTAMP) {
+					opts->tsval = get_unaligned_be32(ptr);
+					opts->tsecr = get_unaligned_be32(ptr + 4);
+					opts->options |= XT_SYNPROXY_OPT_TIMESTAMP;
+				}
+				break;
+			case TCPOPT_SACK_PERM:
+				if (opsize == TCPOLEN_SACK_PERM)
+					opts->options |= XT_SYNPROXY_OPT_SACK_PERM;
+				break;
+			}
+
+			ptr += opsize - 2;
+			length -= opsize;
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(synproxy_parse_options);
+
+unsigned int synproxy_options_size(const struct synproxy_options *opts)
+{
+	unsigned int size = 0;
+
+	if (opts->options & XT_SYNPROXY_OPT_MSS)
+		size += TCPOLEN_MSS_ALIGNED;
+	if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
+		size += TCPOLEN_TSTAMP_ALIGNED;
+	else if (opts->options & XT_SYNPROXY_OPT_SACK_PERM)
+		size += TCPOLEN_SACKPERM_ALIGNED;
+	if (opts->options & XT_SYNPROXY_OPT_WSCALE)
+		size += TCPOLEN_WSCALE_ALIGNED;
+
+	return size;
+}
+EXPORT_SYMBOL_GPL(synproxy_options_size);
+
+void
+synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts)
+{
+	__be32 *ptr = (__be32 *)(th + 1);
+	u8 options = opts->options;
+
+	if (options & XT_SYNPROXY_OPT_MSS)
+		*ptr++ = htonl((TCPOPT_MSS << 24) |
+			       (TCPOLEN_MSS << 16) |
+			       opts->mss);
+
+	if (options & XT_SYNPROXY_OPT_TIMESTAMP) {
+		if (options & XT_SYNPROXY_OPT_SACK_PERM)
+			*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
+				       (TCPOLEN_SACK_PERM << 16) |
+				       (TCPOPT_TIMESTAMP << 8) |
+				       TCPOLEN_TIMESTAMP);
+		else
+			*ptr++ = htonl((TCPOPT_NOP << 24) |
+				       (TCPOPT_NOP << 16) |
+				       (TCPOPT_TIMESTAMP << 8) |
+				       TCPOLEN_TIMESTAMP);
+
+		*ptr++ = htonl(opts->tsval);
+		*ptr++ = htonl(opts->tsecr);
+	} else if (options & XT_SYNPROXY_OPT_SACK_PERM)
+		*ptr++ = htonl((TCPOPT_NOP << 24) |
+			       (TCPOPT_NOP << 16) |
+			       (TCPOPT_SACK_PERM << 8) |
+			       TCPOLEN_SACK_PERM);
+
+	if (options & XT_SYNPROXY_OPT_WSCALE)
+		*ptr++ = htonl((TCPOPT_NOP << 24) |
+			       (TCPOPT_WINDOW << 16) |
+			       (TCPOLEN_WINDOW << 8) |
+			       opts->wscale);
+}
+EXPORT_SYMBOL_GPL(synproxy_build_options);
+
+void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info,
+				    struct synproxy_options *opts)
+{
+	opts->tsecr = opts->tsval;
+	opts->tsval = tcp_time_stamp & ~0x3f;
+
+	if (opts->options & XT_SYNPROXY_OPT_WSCALE)
+		opts->tsval |= info->wscale;
+	else
+		opts->tsval |= 0xf;
+
+	if (opts->options & XT_SYNPROXY_OPT_SACK_PERM)
+		opts->tsval |= 1 << 4;
+
+	if (opts->options & XT_SYNPROXY_OPT_ECN)
+		opts->tsval |= 1 << 5;
+}
+EXPORT_SYMBOL_GPL(synproxy_init_timestamp_cookie);
+
+void synproxy_check_timestamp_cookie(struct synproxy_options *opts)
+{
+	opts->wscale = opts->tsecr & 0xf;
+	if (opts->wscale != 0xf)
+		opts->options |= XT_SYNPROXY_OPT_WSCALE;
+
+	opts->options |= opts->tsecr & (1 << 4) ? XT_SYNPROXY_OPT_SACK_PERM : 0;
+
+	opts->options |= opts->tsecr & (1 << 5) ? XT_SYNPROXY_OPT_ECN : 0;
+}
+EXPORT_SYMBOL_GPL(synproxy_check_timestamp_cookie);
+
+unsigned int synproxy_tstamp_adjust(struct sk_buff *skb,
+				    unsigned int protoff,
+				    struct tcphdr *th,
+				    struct nf_conn *ct,
+				    enum ip_conntrack_info ctinfo,
+				    const struct nf_conn_synproxy *synproxy)
+{
+	unsigned int optoff, optend;
+	u32 *ptr, old;
+
+	if (synproxy->tsoff == 0)
+		return 1;
+
+	optoff = protoff + sizeof(struct tcphdr);
+	optend = protoff + th->doff * 4;
+
+	if (!skb_make_writable(skb, optend))
+		return 0;
+
+	while (optoff < optend) {
+		unsigned char *op = skb->data + optoff;
+
+		switch (op[0]) {
+		case TCPOPT_EOL:
+			return 1;
+		case TCPOPT_NOP:
+			optoff++;
+			continue;
+		default:
+			if (optoff + 1 == optend ||
+			    optoff + op[1] > optend ||
+			    op[1] < 2)
+				return 0;
+			if (op[0] == TCPOPT_TIMESTAMP &&
+			    op[1] == TCPOLEN_TIMESTAMP) {
+				if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) {
+					ptr = (u32 *)&op[2];
+					old = *ptr;
+					*ptr = htonl(ntohl(*ptr) -
+						     synproxy->tsoff);
+				} else {
+					ptr = (u32 *)&op[6];
+					old = *ptr;
+					*ptr = htonl(ntohl(*ptr) +
+						     synproxy->tsoff);
+				}
+				inet_proto_csum_replace4(&th->check, skb,
+							 old, *ptr, 0);
+				return 1;
+			}
+			optoff += op[1];
+		}
+	}
+	return 1;
+}
+EXPORT_SYMBOL_GPL(synproxy_tstamp_adjust);
+
+static struct nf_ct_ext_type nf_ct_synproxy_extend __read_mostly = {
+	.len		= sizeof(struct nf_conn_synproxy),
+	.align		= __alignof__(struct nf_conn_synproxy),
+	.id		= NF_CT_EXT_SYNPROXY,
+};
+
+#ifdef CONFIG_PROC_FS
+static void *synproxy_cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq));
+	int cpu;
+
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	for (cpu = *pos - 1; cpu < nr_cpu_ids; cpu++) {
+		if (!cpu_possible(cpu))
+			continue;
+		*pos = cpu + 1;
+		return per_cpu_ptr(snet->stats, cpu);
+	}
+
+	return NULL;
+}
+
+static void *synproxy_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq));
+	int cpu;
+
+	for (cpu = *pos; cpu < nr_cpu_ids; cpu++) {
+		if (!cpu_possible(cpu))
+			continue;
+		*pos = cpu + 1;
+		return per_cpu_ptr(snet->stats, cpu);
+	}
+
+	return NULL;
+}
+
+static void synproxy_cpu_seq_stop(struct seq_file *seq, void *v)
+{
+	return;
+}
+
+static int synproxy_cpu_seq_show(struct seq_file *seq, void *v)
+{
+	struct synproxy_stats *stats = v;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq, "entries\t\tsyn_received\t"
+				"cookie_invalid\tcookie_valid\t"
+				"cookie_retrans\tconn_reopened\n");
+		return 0;
+	}
+
+	seq_printf(seq, "%08x\t%08x\t%08x\t%08x\t%08x\t%08x\n", 0,
+		   stats->syn_received,
+		   stats->cookie_invalid,
+		   stats->cookie_valid,
+		   stats->cookie_retrans,
+		   stats->conn_reopened);
+
+	return 0;
+}
+
+static const struct seq_operations synproxy_cpu_seq_ops = {
+	.start		= synproxy_cpu_seq_start,
+	.next		= synproxy_cpu_seq_next,
+	.stop		= synproxy_cpu_seq_stop,
+	.show		= synproxy_cpu_seq_show,
+};
+
+static int synproxy_cpu_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &synproxy_cpu_seq_ops,
+			    sizeof(struct seq_net_private));
+}
+
+static const struct file_operations synproxy_cpu_seq_fops = {
+	.owner		= THIS_MODULE,
+	.open		= synproxy_cpu_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_net,
+};
+
+static int __net_init synproxy_proc_init(struct net *net)
+{
+	if (!proc_create("synproxy", S_IRUGO, net->proc_net_stat,
+			 &synproxy_cpu_seq_fops))
+		return -ENOMEM;
+	return 0;
+}
+
+static void __net_exit synproxy_proc_exit(struct net *net)
+{
+	remove_proc_entry("synproxy", net->proc_net_stat);
+}
+#else
+static int __net_init synproxy_proc_init(struct net *net)
+{
+	return 0;
+}
+
+static void __net_exit synproxy_proc_exit(struct net *net)
+{
+	return;
+}
+#endif /* CONFIG_PROC_FS */
+
+static int __net_init synproxy_net_init(struct net *net)
+{
+	struct synproxy_net *snet = synproxy_pernet(net);
+	struct nf_conntrack_tuple t;
+	struct nf_conn *ct;
+	int err = -ENOMEM;
+
+	memset(&t, 0, sizeof(t));
+	ct = nf_conntrack_alloc(net, 0, &t, &t, GFP_KERNEL);
+	if (IS_ERR(ct)) {
+		err = PTR_ERR(ct);
+		goto err1;
+	}
+
+	__set_bit(IPS_TEMPLATE_BIT, &ct->status);
+	__set_bit(IPS_CONFIRMED_BIT, &ct->status);
+	if (!nfct_seqadj_ext_add(ct))
+		goto err2;
+	if (!nfct_synproxy_ext_add(ct))
+		goto err2;
+
+	snet->tmpl = ct;
+
+	snet->stats = alloc_percpu(struct synproxy_stats);
+	if (snet->stats == NULL)
+		goto err2;
+
+	err = synproxy_proc_init(net);
+	if (err < 0)
+		goto err3;
+
+	return 0;
+
+err3:
+	free_percpu(snet->stats);
+err2:
+	nf_conntrack_free(ct);
+err1:
+	return err;
+}
+
+static void __net_exit synproxy_net_exit(struct net *net)
+{
+	struct synproxy_net *snet = synproxy_pernet(net);
+
+	nf_conntrack_free(snet->tmpl);
+	synproxy_proc_exit(net);
+	free_percpu(snet->stats);
+}
+
+static struct pernet_operations synproxy_net_ops = {
+	.init		= synproxy_net_init,
+	.exit		= synproxy_net_exit,
+	.id		= &synproxy_net_id,
+	.size		= sizeof(struct synproxy_net),
+};
+
+static int __init synproxy_core_init(void)
+{
+	int err;
+
+	err = nf_ct_extend_register(&nf_ct_synproxy_extend);
+	if (err < 0)
+		goto err1;
+
+	err = register_pernet_subsys(&synproxy_net_ops);
+	if (err < 0)
+		goto err2;
+
+	return 0;
+
+err2:
+	nf_ct_extend_unregister(&nf_ct_synproxy_extend);
+err1:
+	return err;
+}
+
+static void __exit synproxy_core_exit(void)
+{
+	unregister_pernet_subsys(&synproxy_net_ops);
+	nf_ct_extend_unregister(&nf_ct_synproxy_extend);
+}
+
+module_init(synproxy_core_init);
+module_exit(synproxy_core_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-- 
cgit v1.2.3


From 81eb6a1487718a89621a0e0be7fafd0cd7c429a4 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 27 Aug 2013 08:50:15 +0200
Subject: net: syncookies: export cookie_v6_init_sequence/cookie_v6_check

Extract the local TCP stack independant parts of tcp_v6_init_sequence()
and cookie_v6_check() and export them for use by the upcoming IPv6 SYNPROXY
target.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Acked-by: David S. Miller <davem@davemloft.net>
Tested-by: Martin Topholm <mph@one.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/tcp.h     |  4 ++++
 net/ipv6/syncookies.c | 25 ++++++++++++++++---------
 2 files changed, 20 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 8e2b63678a83..dd5e16f66f84 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -499,8 +499,12 @@ extern bool cookie_check_timestamp(struct tcp_options_received *opt,
 				struct net *net, bool *ecn_ok);
 
 /* From net/ipv6/syncookies.c */
+extern int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th,
+			     u32 cookie);
 extern struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb);
 #ifdef CONFIG_SYN_COOKIES
+extern u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph,
+				     const struct tcphdr *th, u16 *mssp);
 extern __u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb,
 				     __u16 *mss);
 #else
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index d5dda20bd717..bf63ac8a49b9 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -112,32 +112,38 @@ static __u32 check_tcp_syn_cookie(__u32 cookie, const struct in6_addr *saddr,
 		& COOKIEMASK;
 }
 
-__u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb, __u16 *mssp)
+u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph,
+			      const struct tcphdr *th, __u16 *mssp)
 {
-	const struct ipv6hdr *iph = ipv6_hdr(skb);
-	const struct tcphdr *th = tcp_hdr(skb);
 	int mssind;
 	const __u16 mss = *mssp;
 
-	tcp_synq_overflow(sk);
-
 	for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--)
 		if (mss >= msstab[mssind])
 			break;
 
 	*mssp = msstab[mssind];
 
-	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
-
 	return secure_tcp_syn_cookie(&iph->saddr, &iph->daddr, th->source,
 				     th->dest, ntohl(th->seq),
 				     jiffies / (HZ * 60), mssind);
 }
+EXPORT_SYMBOL_GPL(__cookie_v6_init_sequence);
 
-static inline int cookie_check(const struct sk_buff *skb, __u32 cookie)
+__u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb, __u16 *mssp)
 {
 	const struct ipv6hdr *iph = ipv6_hdr(skb);
 	const struct tcphdr *th = tcp_hdr(skb);
+
+	tcp_synq_overflow(sk);
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
+
+	return __cookie_v6_init_sequence(iph, th, mssp);
+}
+
+int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th,
+		      __u32 cookie)
+{
 	__u32 seq = ntohl(th->seq) - 1;
 	__u32 mssind = check_tcp_syn_cookie(cookie, &iph->saddr, &iph->daddr,
 					    th->source, th->dest, seq,
@@ -145,6 +151,7 @@ static inline int cookie_check(const struct sk_buff *skb, __u32 cookie)
 
 	return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;
 }
+EXPORT_SYMBOL_GPL(__cookie_v6_check);
 
 struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 {
@@ -167,7 +174,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 		goto out;
 
 	if (tcp_synq_no_recent_overflow(sk) ||
-		(mss = cookie_check(skb, cookie)) == 0) {
+		(mss = __cookie_v6_check(ipv6_hdr(skb), th, cookie)) == 0) {
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED);
 		goto out;
 	}
-- 
cgit v1.2.3


From 4ad362282cb45bbc831a182e45637da8c5bd7aa1 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 27 Aug 2013 08:50:16 +0200
Subject: netfilter: add IPv6 SYNPROXY target

Add an IPv6 version of the SYNPROXY target. The main differences to the
IPv4 version is routing and IP header construction.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Tested-by: Martin Topholm <mph@one.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/Kconfig         |  13 +
 net/ipv6/netfilter/Makefile        |   1 +
 net/ipv6/netfilter/ip6t_SYNPROXY.c | 495 +++++++++++++++++++++++++++++++++++++
 3 files changed, 509 insertions(+)
 create mode 100644 net/ipv6/netfilter/ip6t_SYNPROXY.c

(limited to 'net')

diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 4433ab40e7de..a7f842b29b67 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -153,6 +153,19 @@ config IP6_NF_TARGET_REJECT
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config IP6_NF_TARGET_SYNPROXY
+	tristate "SYNPROXY target support"
+	depends on NF_CONNTRACK && NETFILTER_ADVANCED
+	select NETFILTER_SYNPROXY
+	select SYN_COOKIES
+	help
+	  The SYNPROXY target allows you to intercept TCP connections and
+	  establish them using syncookies before they are passed on to the
+	  server. This allows to avoid conntrack and server resource usage
+	  during SYN-flood attacks.
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
 config IP6_NF_MANGLE
 	tristate "Packet mangling"
 	default m if NETFILTER_ADVANCED=n
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index ad13bf0d25f0..2b53738f798c 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -37,3 +37,4 @@ obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o
 obj-$(CONFIG_IP6_NF_TARGET_MASQUERADE) += ip6t_MASQUERADE.o
 obj-$(CONFIG_IP6_NF_TARGET_NPT) += ip6t_NPT.o
 obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o
+obj-$(CONFIG_IP6_NF_TARGET_SYNPROXY) += ip6t_SYNPROXY.o
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
new file mode 100644
index 000000000000..4270a9b145e5
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -0,0 +1,495 @@
+/*
+ * Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/ip6_checksum.h>
+#include <net/ip6_route.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_SYNPROXY.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
+
+static struct ipv6hdr *
+synproxy_build_ip(struct sk_buff *skb, const struct in6_addr *saddr,
+				       const struct in6_addr *daddr)
+{
+	struct ipv6hdr *iph;
+
+	skb_reset_network_header(skb);
+	iph = (struct ipv6hdr *)skb_put(skb, sizeof(*iph));
+	ip6_flow_hdr(iph, 0, 0);
+	iph->hop_limit	= 64;	//XXX
+	iph->nexthdr	= IPPROTO_TCP;
+	iph->saddr	= *saddr;
+	iph->daddr	= *daddr;
+
+	return iph;
+}
+
+static void
+synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb,
+		  struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
+		  struct ipv6hdr *niph, struct tcphdr *nth,
+		  unsigned int tcp_hdr_size)
+{
+	struct net *net = nf_ct_net((struct nf_conn *)nfct);
+	struct dst_entry *dst;
+	struct flowi6 fl6;
+
+	nth->check = ~tcp_v6_check(tcp_hdr_size, &niph->saddr, &niph->daddr, 0);
+	nskb->ip_summed   = CHECKSUM_PARTIAL;
+	nskb->csum_start  = (unsigned char *)nth - nskb->head;
+	nskb->csum_offset = offsetof(struct tcphdr, check);
+
+	memset(&fl6, 0, sizeof(fl6));
+	fl6.flowi6_proto = IPPROTO_TCP;
+	fl6.saddr = niph->saddr;
+	fl6.daddr = niph->daddr;
+	fl6.fl6_sport = nth->source;
+	fl6.fl6_dport = nth->dest;
+	security_skb_classify_flow((struct sk_buff *)skb, flowi6_to_flowi(&fl6));
+	dst = ip6_route_output(net, NULL, &fl6);
+	if (dst == NULL || dst->error) {
+		dst_release(dst);
+		goto free_nskb;
+	}
+	dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+	if (IS_ERR(dst))
+		goto free_nskb;
+
+	skb_dst_set(nskb, dst);
+
+	if (nfct) {
+		nskb->nfct = nfct;
+		nskb->nfctinfo = ctinfo;
+		nf_conntrack_get(nfct);
+	}
+
+	ip6_local_out(nskb);
+	return;
+
+free_nskb:
+	kfree_skb(nskb);
+}
+
+static void
+synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th,
+			    const struct synproxy_options *opts)
+{
+	struct sk_buff *nskb;
+	struct ipv6hdr *iph, *niph;
+	struct tcphdr *nth;
+	unsigned int tcp_hdr_size;
+	u16 mss = opts->mss;
+
+	iph = ipv6_hdr(skb);
+
+	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+			 GFP_ATOMIC);
+	if (nskb == NULL)
+		return;
+	skb_reserve(nskb, MAX_TCP_HEADER);
+
+	niph = synproxy_build_ip(nskb, &iph->daddr, &iph->saddr);
+
+	skb_reset_transport_header(nskb);
+	nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+	nth->source	= th->dest;
+	nth->dest	= th->source;
+	nth->seq	= htonl(__cookie_v6_init_sequence(iph, th, &mss));
+	nth->ack_seq	= htonl(ntohl(th->seq) + 1);
+	tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK;
+	if (opts->options & XT_SYNPROXY_OPT_ECN)
+		tcp_flag_word(nth) |= TCP_FLAG_ECE;
+	nth->doff	= tcp_hdr_size / 4;
+	nth->window	= 0;
+	nth->check	= 0;
+	nth->urg_ptr	= 0;
+
+	synproxy_build_options(nth, opts);
+
+	synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
+			  niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_server_syn(const struct synproxy_net *snet,
+			 const struct sk_buff *skb, const struct tcphdr *th,
+			 const struct synproxy_options *opts, u32 recv_seq)
+{
+	struct sk_buff *nskb;
+	struct ipv6hdr *iph, *niph;
+	struct tcphdr *nth;
+	unsigned int tcp_hdr_size;
+
+	iph = ipv6_hdr(skb);
+
+	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+			 GFP_ATOMIC);
+	if (nskb == NULL)
+		return;
+	skb_reserve(nskb, MAX_TCP_HEADER);
+
+	niph = synproxy_build_ip(nskb, &iph->saddr, &iph->daddr);
+
+	skb_reset_transport_header(nskb);
+	nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+	nth->source	= th->source;
+	nth->dest	= th->dest;
+	nth->seq	= htonl(recv_seq - 1);
+	/* ack_seq is used to relay our ISN to the synproxy hook to initialize
+	 * sequence number translation once a connection tracking entry exists.
+	 */
+	nth->ack_seq	= htonl(ntohl(th->ack_seq) - 1);
+	tcp_flag_word(nth) = TCP_FLAG_SYN;
+	if (opts->options & XT_SYNPROXY_OPT_ECN)
+		tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR;
+	nth->doff	= tcp_hdr_size / 4;
+	nth->window	= th->window;
+	nth->check	= 0;
+	nth->urg_ptr	= 0;
+
+	synproxy_build_options(nth, opts);
+
+	synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
+			  niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_server_ack(const struct synproxy_net *snet,
+			 const struct ip_ct_tcp *state,
+			 const struct sk_buff *skb, const struct tcphdr *th,
+			 const struct synproxy_options *opts)
+{
+	struct sk_buff *nskb;
+	struct ipv6hdr *iph, *niph;
+	struct tcphdr *nth;
+	unsigned int tcp_hdr_size;
+
+	iph = ipv6_hdr(skb);
+
+	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+			 GFP_ATOMIC);
+	if (nskb == NULL)
+		return;
+	skb_reserve(nskb, MAX_TCP_HEADER);
+
+	niph = synproxy_build_ip(nskb, &iph->daddr, &iph->saddr);
+
+	skb_reset_transport_header(nskb);
+	nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+	nth->source	= th->dest;
+	nth->dest	= th->source;
+	nth->seq	= htonl(ntohl(th->ack_seq));
+	nth->ack_seq	= htonl(ntohl(th->seq) + 1);
+	tcp_flag_word(nth) = TCP_FLAG_ACK;
+	nth->doff	= tcp_hdr_size / 4;
+	nth->window	= htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin);
+	nth->check	= 0;
+	nth->urg_ptr	= 0;
+
+	synproxy_build_options(nth, opts);
+
+	synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_client_ack(const struct synproxy_net *snet,
+			 const struct sk_buff *skb, const struct tcphdr *th,
+			 const struct synproxy_options *opts)
+{
+	struct sk_buff *nskb;
+	struct ipv6hdr *iph, *niph;
+	struct tcphdr *nth;
+	unsigned int tcp_hdr_size;
+
+	iph = ipv6_hdr(skb);
+
+	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+			 GFP_ATOMIC);
+	if (nskb == NULL)
+		return;
+	skb_reserve(nskb, MAX_TCP_HEADER);
+
+	niph = synproxy_build_ip(nskb, &iph->saddr, &iph->daddr);
+
+	skb_reset_transport_header(nskb);
+	nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+	nth->source	= th->source;
+	nth->dest	= th->dest;
+	nth->seq	= htonl(ntohl(th->seq) + 1);
+	nth->ack_seq	= th->ack_seq;
+	tcp_flag_word(nth) = TCP_FLAG_ACK;
+	nth->doff	= tcp_hdr_size / 4;
+	nth->window	= ntohs(htons(th->window) >> opts->wscale);
+	nth->check	= 0;
+	nth->urg_ptr	= 0;
+
+	synproxy_build_options(nth, opts);
+
+	synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+}
+
+static bool
+synproxy_recv_client_ack(const struct synproxy_net *snet,
+			 const struct sk_buff *skb, const struct tcphdr *th,
+			 struct synproxy_options *opts, u32 recv_seq)
+{
+	int mss;
+
+	mss = __cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1);
+	if (mss == 0) {
+		this_cpu_inc(snet->stats->cookie_invalid);
+		return false;
+	}
+
+	this_cpu_inc(snet->stats->cookie_valid);
+	opts->mss = mss;
+
+	if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
+		synproxy_check_timestamp_cookie(opts);
+
+	synproxy_send_server_syn(snet, skb, th, opts, recv_seq);
+	return true;
+}
+
+static unsigned int
+synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_synproxy_info *info = par->targinfo;
+	struct synproxy_net *snet = synproxy_pernet(dev_net(par->in));
+	struct synproxy_options opts = {};
+	struct tcphdr *th, _th;
+
+	if (nf_ip6_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP))
+		return NF_DROP;
+
+	th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th);
+	if (th == NULL)
+		return NF_DROP;
+
+	synproxy_parse_options(skb, par->thoff, th, &opts);
+
+	if (th->syn) {
+		/* Initial SYN from client */
+		this_cpu_inc(snet->stats->syn_received);
+
+		if (th->ece && th->cwr)
+			opts.options |= XT_SYNPROXY_OPT_ECN;
+
+		opts.options &= info->options;
+		if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
+			synproxy_init_timestamp_cookie(info, &opts);
+		else
+			opts.options &= ~(XT_SYNPROXY_OPT_WSCALE |
+					  XT_SYNPROXY_OPT_SACK_PERM |
+					  XT_SYNPROXY_OPT_ECN);
+
+		synproxy_send_client_synack(skb, th, &opts);
+	} else if (th->ack && !(th->fin || th->rst))
+		/* ACK from client */
+		synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq));
+
+	return NF_DROP;
+}
+
+static unsigned int ipv6_synproxy_hook(unsigned int hooknum,
+				       struct sk_buff *skb,
+				       const struct net_device *in,
+				       const struct net_device *out,
+				       int (*okfn)(struct sk_buff *))
+{
+	struct synproxy_net *snet = synproxy_pernet(dev_net(in ? : out));
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+	struct nf_conn_synproxy *synproxy;
+	struct synproxy_options opts = {};
+	const struct ip_ct_tcp *state;
+	struct tcphdr *th, _th;
+	__be16 frag_off;
+	u8 nexthdr;
+	int thoff;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct == NULL)
+		return NF_ACCEPT;
+
+	synproxy = nfct_synproxy(ct);
+	if (synproxy == NULL)
+		return NF_ACCEPT;
+
+	if (nf_is_loopback_packet(skb))
+		return NF_ACCEPT;
+
+	nexthdr = ipv6_hdr(skb)->nexthdr;
+	thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
+				 &frag_off);
+	if (thoff < 0)
+		return NF_ACCEPT;
+
+	th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
+	if (th == NULL)
+		return NF_DROP;
+
+	state = &ct->proto.tcp;
+	switch (state->state) {
+	case TCP_CONNTRACK_CLOSE:
+		if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+			nf_ct_seqadj_init(ct, ctinfo, synproxy->isn -
+						      ntohl(th->seq) + 1);
+			break;
+		}
+
+		if (!th->syn || th->ack ||
+		    CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+			break;
+
+		/* Reopened connection - reset the sequence number and timestamp
+		 * adjustments, they will get initialized once the connection is
+		 * reestablished.
+		 */
+		nf_ct_seqadj_init(ct, ctinfo, 0);
+		synproxy->tsoff = 0;
+		this_cpu_inc(snet->stats->conn_reopened);
+
+		/* fall through */
+	case TCP_CONNTRACK_SYN_SENT:
+		synproxy_parse_options(skb, thoff, th, &opts);
+
+		if (!th->syn && th->ack &&
+		    CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
+			/* Keep-Alives are sent with SEG.SEQ = SND.NXT-1,
+			 * therefore we need to add 1 to make the SYN sequence
+			 * number match the one of first SYN.
+			 */
+			if (synproxy_recv_client_ack(snet, skb, th, &opts,
+						     ntohl(th->seq) + 1))
+				this_cpu_inc(snet->stats->cookie_retrans);
+
+			return NF_DROP;
+		}
+
+		synproxy->isn = ntohl(th->ack_seq);
+		if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
+			synproxy->its = opts.tsecr;
+		break;
+	case TCP_CONNTRACK_SYN_RECV:
+		if (!th->syn || !th->ack)
+			break;
+
+		synproxy_parse_options(skb, thoff, th, &opts);
+		if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
+			synproxy->tsoff = opts.tsval - synproxy->its;
+
+		opts.options &= ~(XT_SYNPROXY_OPT_MSS |
+				  XT_SYNPROXY_OPT_WSCALE |
+				  XT_SYNPROXY_OPT_SACK_PERM);
+
+		swap(opts.tsval, opts.tsecr);
+		synproxy_send_server_ack(snet, state, skb, th, &opts);
+
+		nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
+
+		swap(opts.tsval, opts.tsecr);
+		synproxy_send_client_ack(snet, skb, th, &opts);
+
+		consume_skb(skb);
+		return NF_STOLEN;
+	default:
+		break;
+	}
+
+	synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy);
+	return NF_ACCEPT;
+}
+
+static int synproxy_tg6_check(const struct xt_tgchk_param *par)
+{
+	const struct ip6t_entry *e = par->entryinfo;
+
+	if (!(e->ipv6.flags & IP6T_F_PROTO) ||
+	    e->ipv6.proto != IPPROTO_TCP ||
+	    e->ipv6.invflags & XT_INV_PROTO)
+		return -EINVAL;
+
+	return nf_ct_l3proto_try_module_get(par->family);
+}
+
+static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par)
+{
+	nf_ct_l3proto_module_put(par->family);
+}
+
+static struct xt_target synproxy_tg6_reg __read_mostly = {
+	.name		= "SYNPROXY",
+	.family		= NFPROTO_IPV6,
+	.target		= synproxy_tg6,
+	.targetsize	= sizeof(struct xt_synproxy_info),
+	.checkentry	= synproxy_tg6_check,
+	.destroy	= synproxy_tg6_destroy,
+	.me		= THIS_MODULE,
+};
+
+static struct nf_hook_ops ipv6_synproxy_ops[] __read_mostly = {
+	{
+		.hook		= ipv6_synproxy_hook,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+	},
+	{
+		.hook		= ipv6_synproxy_hook,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_POST_ROUTING,
+		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+	},
+};
+
+static int __init synproxy_tg6_init(void)
+{
+	int err;
+
+	err = nf_register_hooks(ipv6_synproxy_ops,
+				ARRAY_SIZE(ipv6_synproxy_ops));
+	if (err < 0)
+		goto err1;
+
+	err = xt_register_target(&synproxy_tg6_reg);
+	if (err < 0)
+		goto err2;
+
+	return 0;
+
+err2:
+	nf_unregister_hooks(ipv6_synproxy_ops, ARRAY_SIZE(ipv6_synproxy_ops));
+err1:
+	return err;
+}
+
+static void __exit synproxy_tg6_exit(void)
+{
+	xt_unregister_target(&synproxy_tg6_reg);
+	nf_unregister_hooks(ipv6_synproxy_ops, ARRAY_SIZE(ipv6_synproxy_ops));
+}
+
+module_init(synproxy_tg6_init);
+module_exit(synproxy_tg6_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-- 
cgit v1.2.3


From b7e092c05b308674c642ed7fb754d555f0ebba81 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 27 Aug 2013 11:47:26 +0200
Subject: netfilter: ctnetlink: fix uninitialized variable

net/netfilter/nf_conntrack_netlink.c: In function 'ctnetlink_nfqueue_attach_expect':
'helper' may be used uninitialized in this function

It was only initialized in if CTA_EXPECT_HELP_NAME attribute was
present, it must be NULL otherwise.

Problem added recently in bd077937
(netfilter: nfnetlink_queue: allow to attach expectations to conntracks).

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_netlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 7c55745ececf..eea936b70d15 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -2162,7 +2162,7 @@ ctnetlink_nfqueue_attach_expect(const struct nlattr *attr, struct nf_conn *ct,
 {
 	struct nlattr *cda[CTA_EXPECT_MAX+1];
 	struct nf_conntrack_tuple tuple, mask;
-	struct nf_conntrack_helper *helper;
+	struct nf_conntrack_helper *helper = NULL;
 	struct nf_conntrack_expect *exp;
 	int err;
 
-- 
cgit v1.2.3


From c54f38c9aa22ff65ca9f4c1bdbf2a11d017205f3 Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <simon@open-mesh.com>
Date: Mon, 29 Jul 2013 17:56:44 +0200
Subject: batman-adv: set skb priority according to content

The skb priority field may help the wireless driver to choose the right
queue (e.g. WMM queues). This should be set in batman-adv, as this
information is only available here.

This patch adds support for IPv4/IPv6 DS fields and VLAN PCP. Note that
only VLAN PCP is used if a VLAN header is present. Also initially set
TC_PRIO_CONTROL only for self-generated packets, and keep the priority
set by higher layers.

Signed-off-by: Simon Wunderlich <simon@open-mesh.com>
Signed-off-by: Marek Lindner <lindner_marek@yahoo.de>
Signed-off-by: Antonio Quartulli <ordex@autistici.org>
---
 net/batman-adv/bat_iv_ogm.c        |  1 +
 net/batman-adv/icmp_socket.c       |  1 +
 net/batman-adv/main.c              | 58 ++++++++++++++++++++++++++++++++++++++
 net/batman-adv/main.h              |  1 +
 net/batman-adv/routing.c           | 20 ++++++++++++-
 net/batman-adv/send.c              |  1 -
 net/batman-adv/soft-interface.c    |  2 ++
 net/batman-adv/translation-table.c |  5 ++++
 net/batman-adv/unicast.c           |  2 ++
 net/batman-adv/vis.c               |  2 ++
 10 files changed, 91 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 62da5278014a..9886a2f804fc 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -478,6 +478,7 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff,
 		kfree(forw_packet_aggr);
 		goto out;
 	}
+	forw_packet_aggr->skb->priority = TC_PRIO_CONTROL;
 	skb_reserve(forw_packet_aggr->skb, ETH_HLEN);
 
 	skb_buff = skb_put(forw_packet_aggr->skb, packet_len);
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index b27508b8085c..5a99bb4b6b82 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -183,6 +183,7 @@ static ssize_t batadv_socket_write(struct file *file, const char __user *buff,
 		goto out;
 	}
 
+	skb->priority = TC_PRIO_CONTROL;
 	skb_reserve(skb, ETH_HLEN);
 	icmp_packet = (struct batadv_icmp_packet_rr *)skb_put(skb, packet_len);
 
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 08125f3f6064..c72d1bcdcf49 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -19,6 +19,10 @@
 
 #include <linux/crc32c.h>
 #include <linux/highmem.h>
+#include <linux/if_vlan.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/dsfield.h>
 #include "main.h"
 #include "sysfs.h"
 #include "debugfs.h"
@@ -249,6 +253,60 @@ out:
 	return primary_if;
 }
 
+/**
+ * batadv_skb_set_priority - sets skb priority according to packet content
+ * @skb: the packet to be sent
+ * @offset: offset to the packet content
+ *
+ * This function sets a value between 256 and 263 (802.1d priority), which
+ * can be interpreted by the cfg80211 or other drivers.
+ */
+void batadv_skb_set_priority(struct sk_buff *skb, int offset)
+{
+	struct iphdr ip_hdr_tmp, *ip_hdr;
+	struct ipv6hdr ip6_hdr_tmp, *ip6_hdr;
+	struct ethhdr ethhdr_tmp, *ethhdr;
+	struct vlan_ethhdr *vhdr, vhdr_tmp;
+	u32 prio;
+
+	/* already set, do nothing */
+	if (skb->priority >= 256 && skb->priority <= 263)
+		return;
+
+	ethhdr = skb_header_pointer(skb, offset, sizeof(*ethhdr), &ethhdr_tmp);
+	if (!ethhdr)
+		return;
+
+	switch (ethhdr->h_proto) {
+	case htons(ETH_P_8021Q):
+		vhdr = skb_header_pointer(skb, offset + sizeof(*vhdr),
+					  sizeof(*vhdr), &vhdr_tmp);
+		if (!vhdr)
+			return;
+		prio = ntohs(vhdr->h_vlan_TCI) & VLAN_PRIO_MASK;
+		prio = prio >> VLAN_PRIO_SHIFT;
+		break;
+	case htons(ETH_P_IP):
+		ip_hdr = skb_header_pointer(skb, offset + sizeof(*ethhdr),
+					    sizeof(*ip_hdr), &ip_hdr_tmp);
+		if (!ip_hdr)
+			return;
+		prio = (ipv4_get_dsfield(ip_hdr) & 0xfc) >> 5;
+		break;
+	case htons(ETH_P_IPV6):
+		ip6_hdr = skb_header_pointer(skb, offset + sizeof(*ethhdr),
+					     sizeof(*ip6_hdr), &ip6_hdr_tmp);
+		if (!ip6_hdr)
+			return;
+		prio = (ipv6_get_dsfield(ip6_hdr) & 0xfc) >> 5;
+		break;
+	default:
+		return;
+	}
+
+	skb->priority = prio + 256;
+}
+
 static int batadv_recv_unhandled_packet(struct sk_buff *skb,
 					struct batadv_hard_iface *recv_if)
 {
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 5e9aebb7d56b..a7bca78a6251 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -184,6 +184,7 @@ void batadv_mesh_free(struct net_device *soft_iface);
 int batadv_is_my_mac(struct batadv_priv *bat_priv, const uint8_t *addr);
 struct batadv_hard_iface *
 batadv_seq_print_text_primary_if_get(struct seq_file *seq);
+void batadv_skb_set_priority(struct sk_buff *skb, int offset);
 int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev,
 			   struct packet_type *ptype,
 			   struct net_device *orig_dev);
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 2f0bd3ffe6e8..0439395d7ba5 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -775,7 +775,7 @@ static int batadv_route_unicast_packet(struct sk_buff *skb,
 	struct batadv_neigh_node *neigh_node = NULL;
 	struct batadv_unicast_packet *unicast_packet;
 	struct ethhdr *ethhdr = eth_hdr(skb);
-	int res, ret = NET_RX_DROP;
+	int res, hdr_len, ret = NET_RX_DROP;
 	struct sk_buff *new_skb;
 
 	unicast_packet = (struct batadv_unicast_packet *)skb->data;
@@ -835,6 +835,22 @@ static int batadv_route_unicast_packet(struct sk_buff *skb,
 	/* decrement ttl */
 	unicast_packet->header.ttl--;
 
+	switch (unicast_packet->header.packet_type) {
+	case BATADV_UNICAST_4ADDR:
+		hdr_len = sizeof(struct batadv_unicast_4addr_packet);
+		break;
+	case BATADV_UNICAST:
+		hdr_len = sizeof(struct batadv_unicast_packet);
+		break;
+	default:
+		/* other packet types not supported - yet */
+		hdr_len = -1;
+		break;
+	}
+
+	if (hdr_len > 0)
+		batadv_skb_set_priority(skb, hdr_len);
+
 	res = batadv_send_skb_to_orig(skb, orig_node, recv_if);
 
 	/* translate transmit result into receive result */
@@ -1193,6 +1209,8 @@ int batadv_recv_bcast_packet(struct sk_buff *skb,
 	if (batadv_bla_check_bcast_duplist(bat_priv, skb))
 		goto out;
 
+	batadv_skb_set_priority(skb, sizeof(struct batadv_bcast_packet));
+
 	/* rebroadcast packet */
 	batadv_add_bcast_packet_to_list(bat_priv, skb, 1);
 
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index e9ff8d801201..0266edd0fa7f 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -67,7 +67,6 @@ int batadv_send_skb_packet(struct sk_buff *skb,
 	ethhdr->h_proto = __constant_htons(ETH_P_BATMAN);
 
 	skb_set_network_header(skb, ETH_HLEN);
-	skb->priority = TC_PRIO_CONTROL;
 	skb->protocol = __constant_htons(ETH_P_BATMAN);
 
 	skb->dev = hard_iface->net_dev;
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 0f04e1c302b4..4493913f0d5c 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -229,6 +229,8 @@ static int batadv_interface_tx(struct sk_buff *skb,
 		 */
 	}
 
+	batadv_skb_set_priority(skb, 0);
+
 	/* ethernet packet should be broadcasted */
 	if (do_bcast) {
 		primary_if = batadv_primary_if_get_selected(bat_priv);
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 429aeef3d8b2..34510f38708f 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -1626,6 +1626,7 @@ batadv_tt_response_fill_table(uint16_t tt_len, uint8_t ttvn,
 	if (!skb)
 		goto out;
 
+	skb->priority = TC_PRIO_CONTROL;
 	skb_reserve(skb, ETH_HLEN);
 	tt_response = (struct batadv_tt_query_packet *)skb_put(skb, len);
 	tt_response->ttvn = ttvn;
@@ -1691,6 +1692,7 @@ static int batadv_send_tt_request(struct batadv_priv *bat_priv,
 	if (!skb)
 		goto out;
 
+	skb->priority = TC_PRIO_CONTROL;
 	skb_reserve(skb, ETH_HLEN);
 
 	tt_req_len = sizeof(*tt_request);
@@ -1788,6 +1790,7 @@ batadv_send_other_tt_response(struct batadv_priv *bat_priv,
 		if (!skb)
 			goto unlock;
 
+		skb->priority = TC_PRIO_CONTROL;
 		skb_reserve(skb, ETH_HLEN);
 		packet_pos = skb_put(skb, len);
 		tt_response = (struct batadv_tt_query_packet *)packet_pos;
@@ -1906,6 +1909,7 @@ batadv_send_my_tt_response(struct batadv_priv *bat_priv,
 		if (!skb)
 			goto unlock;
 
+		skb->priority = TC_PRIO_CONTROL;
 		skb_reserve(skb, ETH_HLEN);
 		packet_pos = skb_put(skb, len);
 		tt_response = (struct batadv_tt_query_packet *)packet_pos;
@@ -2240,6 +2244,7 @@ static void batadv_send_roam_adv(struct batadv_priv *bat_priv, uint8_t *client,
 	if (!skb)
 		goto out;
 
+	skb->priority = TC_PRIO_CONTROL;
 	skb_reserve(skb, ETH_HLEN);
 
 	roam_adv_packet = (struct batadv_roam_adv_packet *)skb_put(skb, len);
diff --git a/net/batman-adv/unicast.c b/net/batman-adv/unicast.c
index 857e1b8349ee..48b31d33ce6b 100644
--- a/net/batman-adv/unicast.c
+++ b/net/batman-adv/unicast.c
@@ -242,6 +242,8 @@ int batadv_frag_send_skb(struct sk_buff *skb, struct batadv_priv *bat_priv,
 	frag_skb = dev_alloc_skb(data_len - (data_len / 2) + ucf_hdr_len);
 	if (!frag_skb)
 		goto dropped;
+
+	skb->priority = TC_PRIO_CONTROL;
 	skb_reserve(frag_skb, ucf_hdr_len);
 
 	unicast_packet = (struct batadv_unicast_packet *)skb->data;
diff --git a/net/batman-adv/vis.c b/net/batman-adv/vis.c
index 4983340f1943..d8ea31a58457 100644
--- a/net/batman-adv/vis.c
+++ b/net/batman-adv/vis.c
@@ -397,6 +397,7 @@ batadv_add_packet(struct batadv_priv *bat_priv,
 		kfree(info);
 		return NULL;
 	}
+	info->skb_packet->priority = TC_PRIO_CONTROL;
 	skb_reserve(info->skb_packet, ETH_HLEN);
 	packet = (struct batadv_vis_packet *)skb_put(info->skb_packet, len);
 
@@ -861,6 +862,7 @@ int batadv_vis_init(struct batadv_priv *bat_priv)
 	if (!bat_priv->vis.my_info->skb_packet)
 		goto free_info;
 
+	bat_priv->vis.my_info->skb_packet->priority = TC_PRIO_CONTROL;
 	skb_reserve(bat_priv->vis.my_info->skb_packet, ETH_HLEN);
 	tmp_skb = bat_priv->vis.my_info->skb_packet;
 	packet = (struct batadv_vis_packet *)skb_put(tmp_skb, sizeof(*packet));
-- 
cgit v1.2.3


From 791c2a2d3f1848b56b14706059d9972dade5ec1e Mon Sep 17 00:00:00 2001
From: Antonio Quartulli <ordex@autistici.org>
Date: Sat, 17 Aug 2013 12:44:44 +0200
Subject: batman-adv: move enum definition at the top of the file

Signed-off-by: Antonio Quartulli <ordex@autistici.org>
---
 net/batman-adv/bat_iv_ogm.c | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 9886a2f804fc..0a8a80cd4bf1 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -28,6 +28,22 @@
 #include "bat_algo.h"
 #include "network-coding.h"
 
+
+/**
+ * batadv_dup_status - duplicate status
+ * @BATADV_NO_DUP: the packet is a duplicate
+ * @BATADV_ORIG_DUP: OGM is a duplicate in the originator (but not for the
+ *  neighbor)
+ * @BATADV_NEIGH_DUP: OGM is a duplicate for the neighbor
+ * @BATADV_PROTECTED: originator is currently protected (after reboot)
+ */
+enum batadv_dup_status {
+	BATADV_NO_DUP = 0,
+	BATADV_ORIG_DUP,
+	BATADV_NEIGH_DUP,
+	BATADV_PROTECTED,
+};
+
 /**
  * batadv_ring_buffer_set - update the ring buffer with the given value
  * @lq_recv: pointer to the ring buffer
@@ -71,21 +87,6 @@ static uint8_t batadv_ring_buffer_avg(const uint8_t lq_recv[])
 	return (uint8_t)(sum / count);
 }
 
-/*
- * batadv_dup_status - duplicate status
- * @BATADV_NO_DUP: the packet is a duplicate
- * @BATADV_ORIG_DUP: OGM is a duplicate in the originator (but not for the
- *  neighbor)
- * @BATADV_NEIGH_DUP: OGM is a duplicate for the neighbor
- * @BATADV_PROTECTED: originator is currently protected (after reboot)
- */
-enum batadv_dup_status {
-	BATADV_NO_DUP = 0,
-	BATADV_ORIG_DUP,
-	BATADV_NEIGH_DUP,
-	BATADV_PROTECTED,
-};
-
 static struct batadv_neigh_node *
 batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface,
 			const uint8_t *neigh_addr,
-- 
cgit v1.2.3


From c00a072d3fec446d763e68b50f6f6cfd9e2957e2 Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Date: Sun, 21 Jul 2013 23:03:15 +0200
Subject: batman-adv: Start new development cycle

Signed-off-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: Antonio Quartulli <ordex@autistici.org>
---
 net/batman-adv/main.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index a7bca78a6251..5d00f23b4621 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -26,7 +26,7 @@
 #define BATADV_DRIVER_DEVICE "batman-adv"
 
 #ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2013.3.0"
+#define BATADV_SOURCE_VERSION "2013.4.0"
 #endif
 
 /* B.A.T.M.A.N. parameters */
-- 
cgit v1.2.3


From c6eaa3f067d6f9fa55fd9645e93ed79411bebdaf Mon Sep 17 00:00:00 2001
From: Antonio Quartulli <antonio@open-mesh.com>
Date: Sat, 13 Jul 2013 00:06:00 +0200
Subject: batman-adv: send GW_DEL event when the gw client mode is deselected

Whenever the GW client mode is deselected, a DEL event has
to be sent in order to tell userspace that the current
gateway has been lost. Send the uevent on state change only
if a gateway was currently selected.

Reported-by: Marek Lindner <lindner_marek@yahoo.de>
Signed-off-by: Antonio Quartulli <antonio@open-mesh.com>
Signed-off-by: Marek Lindner <lindner_marek@yahoo.de>
---
 net/batman-adv/gateway_client.c | 27 +++++++++++++++++++++++++++
 net/batman-adv/gateway_client.h |  1 +
 net/batman-adv/sysfs.c          |  4 ++++
 3 files changed, 32 insertions(+)

(limited to 'net')

diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 7614af31daff..1ce4b8763ef2 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -190,6 +190,33 @@ next:
 	return curr_gw;
 }
 
+/**
+ * batadv_gw_check_client_stop - check if client mode has been switched off
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * This function assumes the caller has checked that the gw state *is actually
+ * changing*. This function is not supposed to be called when there is no state
+ * change.
+ */
+void batadv_gw_check_client_stop(struct batadv_priv *bat_priv)
+{
+	struct batadv_gw_node *curr_gw;
+
+	if (atomic_read(&bat_priv->gw_mode) != BATADV_GW_MODE_CLIENT)
+		return;
+
+	curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
+	if (!curr_gw)
+		return;
+
+	/* if batman-adv is switching the gw client mode off and a gateway was
+	 * already selected, send a DEL uevent
+	 */
+	batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_DEL, NULL);
+
+	batadv_gw_node_free_ref(curr_gw);
+}
+
 void batadv_gw_election(struct batadv_priv *bat_priv)
 {
 	struct batadv_gw_node *curr_gw = NULL, *next_gw = NULL;
diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h
index 1037d75da51f..ceef4ebe8bcd 100644
--- a/net/batman-adv/gateway_client.h
+++ b/net/batman-adv/gateway_client.h
@@ -20,6 +20,7 @@
 #ifndef _NET_BATMAN_ADV_GATEWAY_CLIENT_H_
 #define _NET_BATMAN_ADV_GATEWAY_CLIENT_H_
 
+void batadv_gw_check_client_stop(struct batadv_priv *bat_priv);
 void batadv_gw_deselect(struct batadv_priv *bat_priv);
 void batadv_gw_election(struct batadv_priv *bat_priv);
 struct batadv_orig_node *
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index 929e304dacb2..4114b961bc2c 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -385,6 +385,10 @@ static ssize_t batadv_store_gw_mode(struct kobject *kobj,
 		    curr_gw_mode_str, buff);
 
 	batadv_gw_deselect(bat_priv);
+	/* always call batadv_gw_check_client_stop() before changing the gateway
+	 * state
+	 */
+	batadv_gw_check_client_stop(bat_priv);
 	atomic_set(&bat_priv->gw_mode, (unsigned int)gw_mode_tmp);
 	return count;
 }
-- 
cgit v1.2.3


From fd094808a06e290432fc13c09aae808aea34d2ca Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Tue, 27 Aug 2013 12:03:53 +0100
Subject: bridge: inherit slave devices needed_headroom

Some slave devices may have set a dev->needed_headroom value which is
different than the default one, most likely in order to prepend a
hardware descriptor in front of the Ethernet frame to send. Whenever a
new slave is added to a bridge, ensure that we update the
needed_headroom value accordingly to account for the slave
needed_headroom value.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_if.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index aa6c9a8ba32a..c41d5fbb91d0 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -383,6 +383,9 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 
 	netdev_update_features(br->dev);
 
+	if (br->dev->needed_headroom < dev->needed_headroom)
+		br->dev->needed_headroom = dev->needed_headroom;
+
 	spin_lock_bh(&br->lock);
 	changed_addr = br_stp_recalculate_bridge_id(br);
 
-- 
cgit v1.2.3


From b800c3b966bcf004bd8592293a49ed5cb7ea67a9 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Tue, 27 Aug 2013 01:36:51 +0200
Subject: ipv6: drop fragmented ndisc packets by default (RFC 6980)

This patch implements RFC6980: Drop fragmented ndisc packets by
default. If a fragmented ndisc packet is received the user is informed
that it is possible to disable the check.

Cc: Fernando Gont <fernando@gont.com.ar>
Cc: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  6 ++++++
 include/linux/ipv6.h                   |  1 +
 include/uapi/linux/ipv6.h              |  1 +
 net/ipv6/addrconf.c                    | 10 ++++++++++
 net/ipv6/ndisc.c                       | 17 +++++++++++++++++
 5 files changed, 35 insertions(+)

(limited to 'net')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index debfe857d8f9..a2be556032c9 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1349,6 +1349,12 @@ mldv2_unsolicited_report_interval - INTEGER
 	MLDv2 report retransmit will take place.
 	Default: 1000 (1 second)
 
+suppress_frag_ndisc - INTEGER
+	Control RFC 6980 (Security Implications of IPv6 Fragmentation
+	with IPv6 Neighbor Discovery) behavior:
+	1 - (default) discard fragmented neighbor discovery packets
+	0 - allow fragmented neighbor discovery packets
+
 icmp/*:
 ratelimit - INTEGER
 	Limit the maximal rates for sending ICMPv6 packets.
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 9ac5047062c8..28ea38439313 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -50,6 +50,7 @@ struct ipv6_devconf {
 	__s32		accept_dad;
 	__s32		force_tllao;
 	__s32           ndisc_notify;
+	__s32		suppress_frag_ndisc;
 	void		*sysctl;
 };
 
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index d07ac6903e59..593b0e32d956 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -162,6 +162,7 @@ enum {
 	DEVCONF_NDISC_NOTIFY,
 	DEVCONF_MLDV1_UNSOLICITED_REPORT_INTERVAL,
 	DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL,
+	DEVCONF_SUPPRESS_FRAG_NDISC,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 2d6d1793bbfe..a7183fc9bbc2 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -204,6 +204,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 	.accept_source_route	= 0,	/* we do not accept RH0 by default. */
 	.disable_ipv6		= 0,
 	.accept_dad		= 1,
+	.suppress_frag_ndisc	= 1,
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -241,6 +242,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	.accept_source_route	= 0,	/* we do not accept RH0 by default. */
 	.disable_ipv6		= 0,
 	.accept_dad		= 1,
+	.suppress_frag_ndisc	= 1,
 };
 
 /* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */
@@ -4188,6 +4190,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_ACCEPT_DAD] = cnf->accept_dad;
 	array[DEVCONF_FORCE_TLLAO] = cnf->force_tllao;
 	array[DEVCONF_NDISC_NOTIFY] = cnf->ndisc_notify;
+	array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -5001,6 +5004,13 @@ static struct addrconf_sysctl_table
 			.mode           = 0644,
 			.proc_handler   = proc_dointvec
 		},
+		{
+			.procname	= "suppress_frag_ndisc",
+			.data		= &ipv6_devconf.suppress_frag_ndisc,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec
+		},
 		{
 			/* sentinel */
 		}
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 04d31c2fbef1..41720feeaa64 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1519,10 +1519,27 @@ static void pndisc_redo(struct sk_buff *skb)
 	kfree_skb(skb);
 }
 
+static bool ndisc_suppress_frag_ndisc(struct sk_buff *skb)
+{
+	struct inet6_dev *idev = __in6_dev_get(skb->dev);
+
+	if (!idev)
+		return true;
+	if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED &&
+	    idev->cnf.suppress_frag_ndisc) {
+		net_warn_ratelimited("Received fragmented ndisc packet. Carefully consider disabling suppress_frag_ndisc.\n");
+		return true;
+	}
+	return false;
+}
+
 int ndisc_rcv(struct sk_buff *skb)
 {
 	struct nd_msg *msg;
 
+	if (ndisc_suppress_frag_ndisc(skb))
+		return 0;
+
 	if (skb_linearize(skb))
 		return 0;
 
-- 
cgit v1.2.3


From 95bd09eb27507691520d39ee1044d6ad831c1168 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 27 Aug 2013 05:46:32 -0700
Subject: tcp: TSO packets automatic sizing

After hearing many people over past years complaining against TSO being
bursty or even buggy, we are proud to present automatic sizing of TSO
packets.

One part of the problem is that tcp_tso_should_defer() uses an heuristic
relying on upcoming ACKS instead of a timer, but more generally, having
big TSO packets makes little sense for low rates, as it tends to create
micro bursts on the network, and general consensus is to reduce the
buffering amount.

This patch introduces a per socket sk_pacing_rate, that approximates
the current sending rate, and allows us to size the TSO packets so
that we try to send one packet every ms.

This field could be set by other transports.

Patch has no impact for high speed flows, where having large TSO packets
makes sense to reach line rate.

For other flows, this helps better packet scheduling and ACK clocking.

This patch increases performance of TCP flows in lossy environments.

A new sysctl (tcp_min_tso_segs) is added, to specify the
minimal size of a TSO packet (default being 2).

A follow-up patch will provide a new packet scheduler (FQ), using
sk_pacing_rate as an input to perform optional per flow pacing.

This explains why we chose to set sk_pacing_rate to twice the current
rate, allowing 'slow start' ramp up.

sk_pacing_rate = 2 * cwnd * mss / srtt

v2: Neal Cardwell reported a suspect deferring of last two segments on
initial write of 10 MSS, I had to change tcp_tso_should_defer() to take
into account tp->xmit_size_goal_segs

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Van Jacobson <vanj@google.com>
Cc: Tom Herbert <therbert@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  9 +++++++++
 include/net/sock.h                     |  2 ++
 include/net/tcp.h                      |  1 +
 net/ipv4/sysctl_net_ipv4.c             | 10 ++++++++++
 net/ipv4/tcp.c                         | 28 +++++++++++++++++++++++-----
 net/ipv4/tcp_input.c                   | 32 +++++++++++++++++++++++++++++++-
 net/ipv4/tcp_output.c                  |  2 +-
 7 files changed, 77 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index a2be556032c9..1cb3aeb4baff 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -482,6 +482,15 @@ tcp_syn_retries - INTEGER
 tcp_timestamps - BOOLEAN
 	Enable timestamps as defined in RFC1323.
 
+tcp_min_tso_segs - INTEGER
+	Minimal number of segments per TSO frame.
+	Since linux-3.12, TCP does an automatic sizing of TSO frames,
+	depending on flow rate, instead of filling 64Kbytes packets.
+	For specific usages, it's possible to force TCP to build big
+	TSO frames. Note that TCP stack might split too big TSO packets
+	if available window is too small.
+	Default: 2
+
 tcp_tso_win_divisor - INTEGER
 	This allows control over what percentage of the congestion window
 	can be consumed by a single TSO frame.
diff --git a/include/net/sock.h b/include/net/sock.h
index e4bbcbfd07ea..6ba2e7b0e2b1 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -232,6 +232,7 @@ struct cg_proto;
   *	@sk_napi_id: id of the last napi context to receive data for sk
   *	@sk_ll_usec: usecs to busypoll when there is no data
   *	@sk_allocation: allocation mode
+  *	@sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)
   *	@sk_sndbuf: size of send buffer in bytes
   *	@sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
   *		   %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
@@ -361,6 +362,7 @@ struct sock {
 	kmemcheck_bitfield_end(flags);
 	int			sk_wmem_queued;
 	gfp_t			sk_allocation;
+	u32			sk_pacing_rate; /* bytes per second */
 	netdev_features_t	sk_route_caps;
 	netdev_features_t	sk_route_nocaps;
 	int			sk_gso_type;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index dd5e16f66f84..6a6a88db462d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -281,6 +281,7 @@ extern int sysctl_tcp_early_retrans;
 extern int sysctl_tcp_limit_output_bytes;
 extern int sysctl_tcp_challenge_ack_limit;
 extern unsigned int sysctl_tcp_notsent_lowat;
+extern int sysctl_tcp_min_tso_segs;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 8ed7c32ae28e..540279f4c531 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -29,6 +29,7 @@
 static int zero;
 static int one = 1;
 static int four = 4;
+static int gso_max_segs = GSO_MAX_SEGS;
 static int tcp_retr1_max = 255;
 static int ip_local_port_range_min[] = { 1, 1 };
 static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -760,6 +761,15 @@ static struct ctl_table ipv4_table[] = {
 		.extra1		= &zero,
 		.extra2		= &four,
 	},
+	{
+		.procname	= "tcp_min_tso_segs",
+		.data		= &sysctl_tcp_min_tso_segs,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &gso_max_segs,
+	},
 	{
 		.procname	= "udp_mem",
 		.data		= &sysctl_udp_mem,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4e42c03859f4..fdf74090a001 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -283,6 +283,8 @@
 
 int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
 
+int sysctl_tcp_min_tso_segs __read_mostly = 2;
+
 struct percpu_counter tcp_orphan_count;
 EXPORT_SYMBOL_GPL(tcp_orphan_count);
 
@@ -785,12 +787,28 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
 	xmit_size_goal = mss_now;
 
 	if (large_allowed && sk_can_gso(sk)) {
-		xmit_size_goal = ((sk->sk_gso_max_size - 1) -
-				  inet_csk(sk)->icsk_af_ops->net_header_len -
-				  inet_csk(sk)->icsk_ext_hdr_len -
-				  tp->tcp_header_len);
+		u32 gso_size, hlen;
+
+		/* Maybe we should/could use sk->sk_prot->max_header here ? */
+		hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
+		       inet_csk(sk)->icsk_ext_hdr_len +
+		       tp->tcp_header_len;
+
+		/* Goal is to send at least one packet per ms,
+		 * not one big TSO packet every 100 ms.
+		 * This preserves ACK clocking and is consistent
+		 * with tcp_tso_should_defer() heuristic.
+		 */
+		gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC);
+		gso_size = max_t(u32, gso_size,
+				 sysctl_tcp_min_tso_segs * mss_now);
+
+		xmit_size_goal = min_t(u32, gso_size,
+				       sk->sk_gso_max_size - 1 - hlen);
 
-		/* TSQ : try to have two TSO segments in flight */
+		/* TSQ : try to have at least two segments in flight
+		 * (one in NIC TX ring, another in Qdisc)
+		 */
 		xmit_size_goal = min_t(u32, xmit_size_goal,
 				       sysctl_tcp_limit_output_bytes >> 1);
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ec492eae0cd7..1a84fffe6993 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -688,6 +688,34 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
 	}
 }
 
+/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
+ * Note: TCP stack does not yet implement pacing.
+ * FQ packet scheduler can be used to implement cheap but effective
+ * TCP pacing, to smooth the burst on large writes when packets
+ * in flight is significantly lower than cwnd (or rwin)
+ */
+static void tcp_update_pacing_rate(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	u64 rate;
+
+	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
+	rate = (u64)tp->mss_cache * 2 * (HZ << 3);
+
+	rate *= max(tp->snd_cwnd, tp->packets_out);
+
+	/* Correction for small srtt : minimum srtt being 8 (1 jiffy << 3),
+	 * be conservative and assume srtt = 1 (125 us instead of 1.25 ms)
+	 * We probably need usec resolution in the future.
+	 * Note: This also takes care of possible srtt=0 case,
+	 * when tcp_rtt_estimator() was not yet called.
+	 */
+	if (tp->srtt > 8 + 2)
+		do_div(rate, tp->srtt);
+
+	sk->sk_pacing_rate = min_t(u64, rate, ~0U);
+}
+
 /* Calculate rto without backoff.  This is the second half of Van Jacobson's
  * routine referred to above.
  */
@@ -3278,7 +3306,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	u32 ack_seq = TCP_SKB_CB(skb)->seq;
 	u32 ack = TCP_SKB_CB(skb)->ack_seq;
 	bool is_dupack = false;
-	u32 prior_in_flight;
+	u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt;
 	u32 prior_fackets;
 	int prior_packets = tp->packets_out;
 	const int prior_unsacked = tp->packets_out - tp->sacked_out;
@@ -3383,6 +3411,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS)
 		tcp_schedule_loss_probe(sk);
+	if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd)
+		tcp_update_pacing_rate(sk);
 	return 1;
 
 no_queue:
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 884efff5b531..e63ae4c9691d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1631,7 +1631,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
 
 	/* If a full-sized TSO skb can be sent, do it. */
 	if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
-			   sk->sk_gso_max_segs * tp->mss_cache))
+			   tp->xmit_size_goal_segs * tp->mss_cache))
 		goto send_now;
 
 	/* Middle in queue won't get any more data, full sendable already? */
-- 
cgit v1.2.3


From 7613f5fe11c518c16b6b50dabb4964052766b73b Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Tue, 27 Aug 2013 16:53:52 +0200
Subject: net: sctp: sctp_verify_init: clean up mandatory checks and add
 comment

Add a comment related to RFC4960 explaning why we do not check for initial
TSN, and while at it, remove yoda notation checks and clean up code from
checks of mandatory conditions. That's probably just really minor, but makes
reviewing easier.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sm_make_chunk.c | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 01e97836ca6c..d244a23ab8d3 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -2240,25 +2240,23 @@ int sctp_verify_init(struct net *net, const struct sctp_association *asoc,
 		     struct sctp_chunk **errp)
 {
 	union sctp_params param;
-	int has_cookie = 0;
+	bool has_cookie = false;
 	int result;
 
-	/* Verify stream values are non-zero. */
-	if ((0 == peer_init->init_hdr.num_outbound_streams) ||
-	    (0 == peer_init->init_hdr.num_inbound_streams) ||
-	    (0 == peer_init->init_hdr.init_tag) ||
-	    (SCTP_DEFAULT_MINWINDOW > ntohl(peer_init->init_hdr.a_rwnd))) {
-
+	/* Check for missing mandatory parameters. Note: Initial TSN is
+	 * also mandatory, but is not checked here since the valid range
+	 * is 0..2**32-1. RFC4960, section 3.3.3.
+	 */
+	if (peer_init->init_hdr.num_outbound_streams == 0 ||
+	    peer_init->init_hdr.num_inbound_streams == 0 ||
+	    peer_init->init_hdr.init_tag == 0 ||
+	    ntohl(peer_init->init_hdr.a_rwnd) < SCTP_DEFAULT_MINWINDOW)
 		return sctp_process_inv_mandatory(asoc, chunk, errp);
-	}
 
-	/* Check for missing mandatory parameters.  */
 	sctp_walk_params(param, peer_init, init_hdr.params) {
-
-		if (SCTP_PARAM_STATE_COOKIE == param.p->type)
-			has_cookie = 1;
-
-	} /* for (loop through all parameters) */
+		if (param.p->type == SCTP_PARAM_STATE_COOKIE)
+			has_cookie = true;
+	}
 
 	/* There is a possibility that a parameter length was bad and
 	 * in that case we would have stoped walking the parameters.
-- 
cgit v1.2.3


From aa9d85605f5ab070b64842b3eba797cf81698ae1 Mon Sep 17 00:00:00 2001
From: Veaceslav Falico <vfalico@redhat.com>
Date: Wed, 28 Aug 2013 23:25:04 +0200
Subject: net: rename netdev_upper to netdev_adjacent

Rename the structure to reflect the upcoming addition of lower_dev_list.

CC: "David S. Miller" <davem@davemloft.net>
CC: Eric Dumazet <edumazet@google.com>
CC: Jiri Pirko <jiri@resnulli.us>
CC: Alexander Duyck <alexander.h.duyck@intel.com>
CC: Cong Wang <amwang@redhat.com>
Signed-off-by: Veaceslav Falico <vfalico@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 1ed2b66a10a6..5072e2c1a072 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4367,7 +4367,7 @@ softnet_break:
 	goto out;
 }
 
-struct netdev_upper {
+struct netdev_adjacent {
 	struct net_device *dev;
 	bool master;
 	struct list_head list;
@@ -4378,7 +4378,7 @@ struct netdev_upper {
 static void __append_search_uppers(struct list_head *search_list,
 				   struct net_device *dev)
 {
-	struct netdev_upper *upper;
+	struct netdev_adjacent *upper;
 
 	list_for_each_entry(upper, &dev->upper_dev_list, list) {
 		/* check if this upper is not already in search list */
@@ -4391,8 +4391,8 @@ static bool __netdev_search_upper_dev(struct net_device *dev,
 				      struct net_device *upper_dev)
 {
 	LIST_HEAD(search_list);
-	struct netdev_upper *upper;
-	struct netdev_upper *tmp;
+	struct netdev_adjacent *upper;
+	struct netdev_adjacent *tmp;
 	bool ret = false;
 
 	__append_search_uppers(&search_list, dev);
@@ -4408,10 +4408,10 @@ static bool __netdev_search_upper_dev(struct net_device *dev,
 	return ret;
 }
 
-static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
+static struct netdev_adjacent *__netdev_find_upper(struct net_device *dev,
 						struct net_device *upper_dev)
 {
-	struct netdev_upper *upper;
+	struct netdev_adjacent *upper;
 
 	list_for_each_entry(upper, &dev->upper_dev_list, list) {
 		if (upper->dev == upper_dev)
@@ -4462,7 +4462,7 @@ EXPORT_SYMBOL(netdev_has_any_upper_dev);
  */
 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
 {
-	struct netdev_upper *upper;
+	struct netdev_adjacent *upper;
 
 	ASSERT_RTNL();
 
@@ -4470,7 +4470,7 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
 		return NULL;
 
 	upper = list_first_entry(&dev->upper_dev_list,
-				 struct netdev_upper, list);
+				 struct netdev_adjacent, list);
 	if (likely(upper->master))
 		return upper->dev;
 	return NULL;
@@ -4486,10 +4486,10 @@ EXPORT_SYMBOL(netdev_master_upper_dev_get);
  */
 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
 {
-	struct netdev_upper *upper;
+	struct netdev_adjacent *upper;
 
 	upper = list_first_or_null_rcu(&dev->upper_dev_list,
-				       struct netdev_upper, list);
+				       struct netdev_adjacent, list);
 	if (upper && likely(upper->master))
 		return upper->dev;
 	return NULL;
@@ -4499,7 +4499,7 @@ EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
 static int __netdev_upper_dev_link(struct net_device *dev,
 				   struct net_device *upper_dev, bool master)
 {
-	struct netdev_upper *upper;
+	struct netdev_adjacent *upper;
 
 	ASSERT_RTNL();
 
@@ -4580,7 +4580,7 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link);
 void netdev_upper_dev_unlink(struct net_device *dev,
 			     struct net_device *upper_dev)
 {
-	struct netdev_upper *upper;
+	struct netdev_adjacent *upper;
 
 	ASSERT_RTNL();
 
-- 
cgit v1.2.3


From 5d261913ca3daf6c2d21d38924235667b3d07c40 Mon Sep 17 00:00:00 2001
From: Veaceslav Falico <vfalico@redhat.com>
Date: Wed, 28 Aug 2013 23:25:05 +0200
Subject: net: add lower_dev_list to net_device and make a full mesh

This patch adds lower_dev_list list_head to net_device, which is the same
as upper_dev_list, only for lower devices, and begins to use it in the same
way as the upper list.

It also changes the way the whole adjacent device lists work - now they
contain *all* of upper/lower devices, not only the first level. The first
level devices are distinguished by the bool neighbour field in
netdev_adjacent, also added by this patch.

There are cases when a device can be added several times to the adjacent
list, the simplest would be:

     /---- eth0.10 ---\
eth0-		       --- bond0
     \---- eth0.20 ---/

where both bond0 and eth0 'see' each other in the adjacent lists two times.
To avoid duplication of netdev_adjacent structures ref_nr is being kept as
the number of times the device was added to the list.

The 'full view' is achieved by adding, on link creation, all of the
upper_dev's upper_dev_list devices as upper devices to all of the
lower_dev's lower_dev_list devices (and to the lower_dev itself), and vice
versa. On unlink they are removed using the same logic.

I've tested it with thousands vlans/bonds/bridges, everything works ok and
no observable lags even on a huge number of interfaces.

Memory footprint for 128 devices interconnected with each other via both
upper and lower (which is impossible, but for the comparison) lists would be:

128*128*2*sizeof(netdev_adjacent) = 1.5MB

but in the real world we usualy have at most several devices with slaves
and a lot of vlans, so the footprint will be much lower.

CC: "David S. Miller" <davem@davemloft.net>
CC: Eric Dumazet <edumazet@google.com>
CC: Jiri Pirko <jiri@resnulli.us>
CC: Alexander Duyck <alexander.h.duyck@intel.com>
CC: Cong Wang <amwang@redhat.com>
Signed-off-by: Veaceslav Falico <vfalico@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |   1 +
 net/core/dev.c            | 285 +++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 259 insertions(+), 27 deletions(-)

(limited to 'net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 077363dcd860..5ccf5b73c378 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1125,6 +1125,7 @@ struct net_device {
 	struct list_head	napi_list;
 	struct list_head	unreg_list;
 	struct list_head	upper_dev_list; /* List of upper devices */
+	struct list_head	lower_dev_list;
 
 
 	/* currently active device features */
diff --git a/net/core/dev.c b/net/core/dev.c
index 5072e2c1a072..2aa914eee057 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4369,7 +4369,16 @@ softnet_break:
 
 struct netdev_adjacent {
 	struct net_device *dev;
+
+	/* upper master flag, there can only be one master device per list */
 	bool master;
+
+	/* indicates that this dev is our first-level lower/upper device */
+	bool neighbour;
+
+	/* counter for the number of times this device was added to us */
+	u16 ref_nr;
+
 	struct list_head list;
 	struct rcu_head rcu;
 	struct list_head search_list;
@@ -4408,18 +4417,34 @@ static bool __netdev_search_upper_dev(struct net_device *dev,
 	return ret;
 }
 
-static struct netdev_adjacent *__netdev_find_upper(struct net_device *dev,
-						struct net_device *upper_dev)
+static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
+						 struct net_device *adj_dev,
+						 bool upper)
 {
-	struct netdev_adjacent *upper;
+	struct netdev_adjacent *adj;
+	struct list_head *dev_list;
 
-	list_for_each_entry(upper, &dev->upper_dev_list, list) {
-		if (upper->dev == upper_dev)
-			return upper;
+	dev_list = upper ? &dev->upper_dev_list : &dev->lower_dev_list;
+
+	list_for_each_entry(adj, dev_list, list) {
+		if (adj->dev == adj_dev)
+			return adj;
 	}
 	return NULL;
 }
 
+static inline struct netdev_adjacent *__netdev_find_upper(struct net_device *dev,
+							  struct net_device *udev)
+{
+	return __netdev_find_adj(dev, udev, true);
+}
+
+static inline struct netdev_adjacent *__netdev_find_lower(struct net_device *dev,
+							  struct net_device *ldev)
+{
+	return __netdev_find_adj(dev, ldev, false);
+}
+
 /**
  * netdev_has_upper_dev - Check if device is linked to an upper device
  * @dev: device
@@ -4496,10 +4521,149 @@ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
 }
 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
 
+static int __netdev_adjacent_dev_insert(struct net_device *dev,
+					struct net_device *adj_dev,
+					bool neighbour, bool master,
+					bool upper)
+{
+	struct netdev_adjacent *adj;
+
+	adj = __netdev_find_adj(dev, adj_dev, upper);
+
+	if (adj) {
+		BUG_ON(neighbour);
+		adj->ref_nr++;
+		return 0;
+	}
+
+	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
+	if (!adj)
+		return -ENOMEM;
+
+	adj->dev = adj_dev;
+	adj->master = master;
+	adj->neighbour = neighbour;
+	adj->ref_nr = 1;
+	INIT_LIST_HEAD(&adj->search_list);
+
+	dev_hold(adj_dev);
+	pr_debug("dev_hold for %s, because of %s link added from %s to %s\n",
+		 adj_dev->name, upper ? "upper" : "lower", dev->name,
+		 adj_dev->name);
+
+	if (!upper) {
+		list_add_tail_rcu(&adj->list, &dev->lower_dev_list);
+		return 0;
+	}
+
+	/* Ensure that master upper link is always the first item in list. */
+	if (master)
+		list_add_rcu(&adj->list, &dev->upper_dev_list);
+	else
+		list_add_tail_rcu(&adj->list, &dev->upper_dev_list);
+
+	return 0;
+}
+
+static inline int __netdev_upper_dev_insert(struct net_device *dev,
+					    struct net_device *udev,
+					    bool master, bool neighbour)
+{
+	return __netdev_adjacent_dev_insert(dev, udev, neighbour, master,
+					    true);
+}
+
+static inline int __netdev_lower_dev_insert(struct net_device *dev,
+					    struct net_device *ldev,
+					    bool neighbour)
+{
+	return __netdev_adjacent_dev_insert(dev, ldev, neighbour, false,
+					    false);
+}
+
+void __netdev_adjacent_dev_remove(struct net_device *dev,
+				  struct net_device *adj_dev, bool upper)
+{
+	struct netdev_adjacent *adj;
+
+	if (upper)
+		adj = __netdev_find_upper(dev, adj_dev);
+	else
+		adj = __netdev_find_lower(dev, adj_dev);
+
+	if (!adj)
+		BUG();
+
+	if (adj->ref_nr > 1) {
+		adj->ref_nr--;
+		return;
+	}
+
+	list_del_rcu(&adj->list);
+	pr_debug("dev_put for %s, because of %s link removed from %s to %s\n",
+		 adj_dev->name, upper ? "upper" : "lower", dev->name,
+		 adj_dev->name);
+	dev_put(adj_dev);
+	kfree_rcu(adj, rcu);
+}
+
+static inline void __netdev_upper_dev_remove(struct net_device *dev,
+					     struct net_device *udev)
+{
+	return __netdev_adjacent_dev_remove(dev, udev, true);
+}
+
+static inline void __netdev_lower_dev_remove(struct net_device *dev,
+					     struct net_device *ldev)
+{
+	return __netdev_adjacent_dev_remove(dev, ldev, false);
+}
+
+int __netdev_adjacent_dev_insert_link(struct net_device *dev,
+				      struct net_device *upper_dev,
+				      bool master, bool neighbour)
+{
+	int ret;
+
+	ret = __netdev_upper_dev_insert(dev, upper_dev, master, neighbour);
+	if (ret)
+		return ret;
+
+	ret = __netdev_lower_dev_insert(upper_dev, dev, neighbour);
+	if (ret) {
+		__netdev_upper_dev_remove(dev, upper_dev);
+		return ret;
+	}
+
+	return 0;
+}
+
+static inline int __netdev_adjacent_dev_link(struct net_device *dev,
+					     struct net_device *udev)
+{
+	return __netdev_adjacent_dev_insert_link(dev, udev, false, false);
+}
+
+static inline int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
+						       struct net_device *udev,
+						       bool master)
+{
+	return __netdev_adjacent_dev_insert_link(dev, udev, master, true);
+}
+
+void __netdev_adjacent_dev_unlink(struct net_device *dev,
+				  struct net_device *upper_dev)
+{
+	__netdev_upper_dev_remove(dev, upper_dev);
+	__netdev_lower_dev_remove(upper_dev, dev);
+}
+
+
 static int __netdev_upper_dev_link(struct net_device *dev,
 				   struct net_device *upper_dev, bool master)
 {
-	struct netdev_adjacent *upper;
+	struct netdev_adjacent *i, *j, *to_i, *to_j;
+	int ret = 0;
 
 	ASSERT_RTNL();
 
@@ -4516,22 +4680,76 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 	if (master && netdev_master_upper_dev_get(dev))
 		return -EBUSY;
 
-	upper = kmalloc(sizeof(*upper), GFP_KERNEL);
-	if (!upper)
-		return -ENOMEM;
+	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, master);
+	if (ret)
+		return ret;
 
-	upper->dev = upper_dev;
-	upper->master = master;
-	INIT_LIST_HEAD(&upper->search_list);
+	/* Now that we linked these devs, make all the upper_dev's
+	 * upper_dev_list visible to every dev's lower_dev_list and vice
+	 * versa, and don't forget the devices itself. All of these
+	 * links are non-neighbours.
+	 */
+	list_for_each_entry(i, &upper_dev->upper_dev_list, list) {
+		list_for_each_entry(j, &dev->lower_dev_list, list) {
+			ret = __netdev_adjacent_dev_link(i->dev, j->dev);
+			if (ret)
+				goto rollback_mesh;
+		}
+	}
+
+	/* add dev to every upper_dev's upper device */
+	list_for_each_entry(i, &upper_dev->upper_dev_list, list) {
+		ret = __netdev_adjacent_dev_link(dev, i->dev);
+		if (ret)
+			goto rollback_upper_mesh;
+	}
+
+	/* add upper_dev to every dev's lower device */
+	list_for_each_entry(i, &dev->lower_dev_list, list) {
+		ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
+		if (ret)
+			goto rollback_lower_mesh;
+	}
 
-	/* Ensure that master upper link is always the first item in list. */
-	if (master)
-		list_add_rcu(&upper->list, &dev->upper_dev_list);
-	else
-		list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
-	dev_hold(upper_dev);
 	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
 	return 0;
+
+rollback_lower_mesh:
+	to_i = i;
+	list_for_each_entry(i, &dev->lower_dev_list, list) {
+		if (i == to_i)
+			break;
+		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
+	}
+
+	i = NULL;
+
+rollback_upper_mesh:
+	to_i = i;
+	list_for_each_entry(i, &upper_dev->upper_dev_list, list) {
+		if (i == to_i)
+			break;
+		__netdev_adjacent_dev_unlink(dev, i->dev);
+	}
+
+	i = j = NULL;
+
+rollback_mesh:
+	to_i = i;
+	to_j = j;
+	list_for_each_entry(i, &dev->lower_dev_list, list) {
+		list_for_each_entry(j, &upper_dev->upper_dev_list, list) {
+			if (i == to_i && j == to_j)
+				break;
+			__netdev_adjacent_dev_unlink(i->dev, j->dev);
+		}
+		if (i == to_i)
+			break;
+	}
+
+	__netdev_adjacent_dev_unlink(dev, upper_dev);
+
+	return ret;
 }
 
 /**
@@ -4580,16 +4798,28 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link);
 void netdev_upper_dev_unlink(struct net_device *dev,
 			     struct net_device *upper_dev)
 {
-	struct netdev_adjacent *upper;
-
+	struct netdev_adjacent *i, *j;
 	ASSERT_RTNL();
 
-	upper = __netdev_find_upper(dev, upper_dev);
-	if (!upper)
-		return;
-	list_del_rcu(&upper->list);
-	dev_put(upper_dev);
-	kfree_rcu(upper, rcu);
+	__netdev_adjacent_dev_unlink(dev, upper_dev);
+
+	/* Here is the tricky part. We must remove all dev's lower
+	 * devices from all upper_dev's upper devices and vice
+	 * versa, to maintain the graph relationship.
+	 */
+	list_for_each_entry(i, &dev->lower_dev_list, list)
+		list_for_each_entry(j, &upper_dev->upper_dev_list, list)
+			__netdev_adjacent_dev_unlink(i->dev, j->dev);
+
+	/* remove also the devices itself from lower/upper device
+	 * list
+	 */
+	list_for_each_entry(i, &dev->lower_dev_list, list)
+		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
+
+	list_for_each_entry(i, &upper_dev->upper_dev_list, list)
+		__netdev_adjacent_dev_unlink(dev, i->dev);
+
 	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
 }
 EXPORT_SYMBOL(netdev_upper_dev_unlink);
@@ -5850,6 +6080,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	INIT_LIST_HEAD(&dev->unreg_list);
 	INIT_LIST_HEAD(&dev->link_watch_list);
 	INIT_LIST_HEAD(&dev->upper_dev_list);
+	INIT_LIST_HEAD(&dev->lower_dev_list);
 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
 	setup(dev);
 
-- 
cgit v1.2.3


From 620f3186caa8124e0efaf329751cf51c5d55c731 Mon Sep 17 00:00:00 2001
From: Veaceslav Falico <vfalico@redhat.com>
Date: Wed, 28 Aug 2013 23:25:06 +0200
Subject: net: remove search_list from netdev_adjacent

We already don't need it cause we see every upper/lower device in the list
already.

CC: "David S. Miller" <davem@davemloft.net>
CC: Eric Dumazet <edumazet@google.com>
CC: Jiri Pirko <jiri@resnulli.us>
CC: Alexander Duyck <alexander.h.duyck@intel.com>
CC: Cong Wang <amwang@redhat.com>
Signed-off-by: Veaceslav Falico <vfalico@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 37 +------------------------------------
 1 file changed, 1 insertion(+), 36 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 2aa914eee057..749925a040a4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4381,42 +4381,8 @@ struct netdev_adjacent {
 
 	struct list_head list;
 	struct rcu_head rcu;
-	struct list_head search_list;
 };
 
-static void __append_search_uppers(struct list_head *search_list,
-				   struct net_device *dev)
-{
-	struct netdev_adjacent *upper;
-
-	list_for_each_entry(upper, &dev->upper_dev_list, list) {
-		/* check if this upper is not already in search list */
-		if (list_empty(&upper->search_list))
-			list_add_tail(&upper->search_list, search_list);
-	}
-}
-
-static bool __netdev_search_upper_dev(struct net_device *dev,
-				      struct net_device *upper_dev)
-{
-	LIST_HEAD(search_list);
-	struct netdev_adjacent *upper;
-	struct netdev_adjacent *tmp;
-	bool ret = false;
-
-	__append_search_uppers(&search_list, dev);
-	list_for_each_entry(upper, &search_list, search_list) {
-		if (upper->dev == upper_dev) {
-			ret = true;
-			break;
-		}
-		__append_search_uppers(&search_list, upper->dev);
-	}
-	list_for_each_entry_safe(upper, tmp, &search_list, search_list)
-		INIT_LIST_HEAD(&upper->search_list);
-	return ret;
-}
-
 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
 						 struct net_device *adj_dev,
 						 bool upper)
@@ -4544,7 +4510,6 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
 	adj->master = master;
 	adj->neighbour = neighbour;
 	adj->ref_nr = 1;
-	INIT_LIST_HEAD(&adj->search_list);
 
 	dev_hold(adj_dev);
 	pr_debug("dev_hold for %s, because of %s link added from %s to %s\n",
@@ -4671,7 +4636,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 		return -EBUSY;
 
 	/* To prevent loops, check if dev is not upper device to upper_dev. */
-	if (__netdev_search_upper_dev(upper_dev, dev))
+	if (__netdev_find_upper(upper_dev, dev))
 		return -EBUSY;
 
 	if (__netdev_find_upper(dev, upper_dev))
-- 
cgit v1.2.3


From 48311f46853c0361f9fba7e0e6bb1652d633c049 Mon Sep 17 00:00:00 2001
From: Veaceslav Falico <vfalico@redhat.com>
Date: Wed, 28 Aug 2013 23:25:07 +0200
Subject: net: add netdev_upper_get_next_dev_rcu(dev, iter)

This function returns the next dev in the dev->upper_dev_list after the
struct list_head **iter position, and updates *iter accordingly. Returns
NULL if there are no devices left.

Caller must hold RCU read lock.

CC: "David S. Miller" <davem@davemloft.net>
CC: Eric Dumazet <edumazet@google.com>
CC: Jiri Pirko <jiri@resnulli.us>
CC: Alexander Duyck <alexander.h.duyck@intel.com>
CC: Cong Wang <amwang@redhat.com>
Signed-off-by: Veaceslav Falico <vfalico@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 749925a040a4..6fbb0c90849b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4468,6 +4468,31 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
 }
 EXPORT_SYMBOL(netdev_master_upper_dev_get);
 
+/* netdev_upper_get_next_dev_rcu - Get the next dev from upper list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next device from the dev's upper list, starting from iter
+ * position. The caller must hold RCU read lock.
+ */
+struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
+						 struct list_head **iter)
+{
+	struct netdev_adjacent *upper;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+
+	if (&upper->list == &dev->upper_dev_list)
+		return NULL;
+
+	*iter = &upper->list;
+
+	return upper->dev;
+}
+EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
+
 /**
  * netdev_master_upper_dev_get_rcu - Get master upper device
  * @dev: device
-- 
cgit v1.2.3


From 5df0ddfbc9209ffafc82236509ba0e975120e3c3 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Wed, 28 Aug 2013 22:13:09 +0200
Subject: net: packet: add randomized fanout scheduler

We currently allow for different fanout scheduling policies in pf_packet
such as scheduling by skb's rxhash, round-robin, by cpu, and rollover.
Also allow for a random, equidistributed selection of the socket from the
fanout process group.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_packet.h |  1 +
 net/packet/af_packet.c         | 13 ++++++++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h
index b950c02030c0..dbf06667394b 100644
--- a/include/uapi/linux/if_packet.h
+++ b/include/uapi/linux/if_packet.h
@@ -56,6 +56,7 @@ struct sockaddr_ll {
 #define PACKET_FANOUT_LB		1
 #define PACKET_FANOUT_CPU		2
 #define PACKET_FANOUT_ROLLOVER		3
+#define PACKET_FANOUT_RND		4
 #define PACKET_FANOUT_FLAG_ROLLOVER	0x1000
 #define PACKET_FANOUT_FLAG_DEFRAG	0x8000
 
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 1fdf9ab91c3f..bee9bfdc8d05 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -88,7 +88,7 @@
 #include <linux/virtio_net.h>
 #include <linux/errqueue.h>
 #include <linux/net_tstamp.h>
-
+#include <linux/reciprocal_div.h>
 #ifdef CONFIG_INET
 #include <net/inet_common.h>
 #endif
@@ -1158,6 +1158,13 @@ static unsigned int fanout_demux_cpu(struct packet_fanout *f,
 	return smp_processor_id() % num;
 }
 
+static unsigned int fanout_demux_rnd(struct packet_fanout *f,
+				     struct sk_buff *skb,
+				     unsigned int num)
+{
+	return reciprocal_divide(prandom_u32(), num);
+}
+
 static unsigned int fanout_demux_rollover(struct packet_fanout *f,
 					  struct sk_buff *skb,
 					  unsigned int idx, unsigned int skip,
@@ -1215,6 +1222,9 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
 	case PACKET_FANOUT_CPU:
 		idx = fanout_demux_cpu(f, skb, num);
 		break;
+	case PACKET_FANOUT_RND:
+		idx = fanout_demux_rnd(f, skb, num);
+		break;
 	case PACKET_FANOUT_ROLLOVER:
 		idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
 		break;
@@ -1284,6 +1294,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
 	case PACKET_FANOUT_HASH:
 	case PACKET_FANOUT_LB:
 	case PACKET_FANOUT_CPU:
+	case PACKET_FANOUT_RND:
 		break;
 	default:
 		return -EINVAL;
-- 
cgit v1.2.3


From f55d112e529386af3667fac3a507132a361b2154 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Wed, 28 Aug 2013 22:13:10 +0200
Subject: net: packet: use reciprocal_divide in fanout_demux_hash

Instead of hard-coding reciprocal_divide function, use the inline
function from reciprocal_div.h.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index bee9bfdc8d05..2e8286b47c28 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1135,7 +1135,7 @@ static unsigned int fanout_demux_hash(struct packet_fanout *f,
 				      struct sk_buff *skb,
 				      unsigned int num)
 {
-	return (((u64)skb->rxhash) * num) >> 32;
+	return reciprocal_divide(skb->rxhash, num);
 }
 
 static unsigned int fanout_demux_lb(struct packet_fanout *f,
-- 
cgit v1.2.3


From 391ac1282dd7ff1cb8245cccc5262e8e4173edc4 Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp <socketcan@hartkopp.net>
Date: Mon, 26 Aug 2013 15:05:36 +0200
Subject: can: gw: add a per rule limitation of frame hops

Usually the received CAN frames can be processed/routed as much as 'max_hops'
times (which is given at module load time of the can-gw module).
Introduce a new configuration option to reduce the number of possible hops
for a specific gateway rule to a value smaller then max_hops.

Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/uapi/linux/can/gw.h |  9 ++++++++-
 net/can/gw.c                | 35 +++++++++++++++++++++++++++++++----
 2 files changed, 39 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/can/gw.h b/include/uapi/linux/can/gw.h
index ae07bec74f4b..4e27c82b564a 100644
--- a/include/uapi/linux/can/gw.h
+++ b/include/uapi/linux/can/gw.h
@@ -45,6 +45,7 @@ enum {
 	CGW_DST_IF,	/* ifindex of destination network interface */
 	CGW_FILTER,	/* specify struct can_filter on source CAN device */
 	CGW_DELETED,	/* number of deleted CAN frames (see max_hops param) */
+	CGW_LIM_HOPS,	/* limit the number of hops of this specific rule */
 	__CGW_MAX
 };
 
@@ -116,13 +117,19 @@ enum {
  * Sets a CAN receive filter for the gateway job specified by the
  * struct can_filter described in include/linux/can.h
  *
- * CGW_MOD_XXX (length 17 bytes):
+ * CGW_MOD_(AND|OR|XOR|SET) (length 17 bytes):
  * Specifies a modification that's done to a received CAN frame before it is
  * send out to the destination interface.
  *
  * <struct can_frame> data used as operator
  * <u8> affected CAN frame elements
  *
+ * CGW_LIM_HOPS (length 1 byte):
+ * Limit the number of hops of this specific rule. Usually the received CAN
+ * frame can be processed as much as 'max_hops' times (which is given at module
+ * load time of the can-gw module). This value is used to reduce the number of
+ * possible hops for this gateway rule to a value smaller then max_hops.
+ *
  * CGW_CS_XOR (length 4 bytes):
  * Set a simple XOR checksum starting with an initial value into
  * data[result-idx] using data[start-idx] .. data[end-idx]
diff --git a/net/can/gw.c b/net/can/gw.c
index 2f291f961a17..3f9b0f3a2818 100644
--- a/net/can/gw.c
+++ b/net/can/gw.c
@@ -146,6 +146,7 @@ struct cgw_job {
 		/* tbc */
 	};
 	u8 gwtype;
+	u8 limit_hops;
 	u16 flags;
 };
 
@@ -402,6 +403,11 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data)
 
 	/* put the incremented hop counter in the cloned skb */
 	cgw_hops(nskb) = cgw_hops(skb) + 1;
+
+	/* first processing of this CAN frame -> adjust to private hop limit */
+	if (gwj->limit_hops && cgw_hops(nskb) == 1)
+		cgw_hops(nskb) = max_hops - gwj->limit_hops + 1;
+
 	nskb->dev = gwj->dst.dev;
 
 	/* pointer to modifiable CAN frame */
@@ -509,6 +515,11 @@ static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type,
 
 	/* check non default settings of attributes */
 
+	if (gwj->limit_hops) {
+		if (nla_put_u8(skb, CGW_LIM_HOPS, gwj->limit_hops) < 0)
+			goto cancel;
+	}
+
 	if (gwj->mod.modtype.and) {
 		memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf));
 		mb.modtype = gwj->mod.modtype.and;
@@ -606,11 +617,12 @@ static const struct nla_policy cgw_policy[CGW_MAX+1] = {
 	[CGW_SRC_IF]	= { .type = NLA_U32 },
 	[CGW_DST_IF]	= { .type = NLA_U32 },
 	[CGW_FILTER]	= { .len = sizeof(struct can_filter) },
+	[CGW_LIM_HOPS]	= { .type = NLA_U8 },
 };
 
 /* check for common and gwtype specific attributes */
 static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod,
-			  u8 gwtype, void *gwtypeattr)
+			  u8 gwtype, void *gwtypeattr, u8 *limhops)
 {
 	struct nlattr *tb[CGW_MAX+1];
 	struct cgw_frame_mod mb;
@@ -625,6 +637,13 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod,
 	if (err < 0)
 		return err;
 
+	if (tb[CGW_LIM_HOPS]) {
+		*limhops = nla_get_u8(tb[CGW_LIM_HOPS]);
+
+		if (*limhops < 1 || *limhops > max_hops)
+			return -EINVAL;
+	}
+
 	/* check for AND/OR/XOR/SET modifications */
 
 	if (tb[CGW_MOD_AND]) {
@@ -782,6 +801,7 @@ static int cgw_create_job(struct sk_buff *skb,  struct nlmsghdr *nlh)
 {
 	struct rtcanmsg *r;
 	struct cgw_job *gwj;
+	u8 limhops = 0;
 	int err = 0;
 
 	if (!capable(CAP_NET_ADMIN))
@@ -808,7 +828,8 @@ static int cgw_create_job(struct sk_buff *skb,  struct nlmsghdr *nlh)
 	gwj->flags = r->flags;
 	gwj->gwtype = r->gwtype;
 
-	err = cgw_parse_attr(nlh, &gwj->mod, CGW_TYPE_CAN_CAN, &gwj->ccgw);
+	err = cgw_parse_attr(nlh, &gwj->mod, CGW_TYPE_CAN_CAN, &gwj->ccgw,
+			     &limhops);
 	if (err < 0)
 		goto out;
 
@@ -836,6 +857,8 @@ static int cgw_create_job(struct sk_buff *skb,  struct nlmsghdr *nlh)
 	if (gwj->dst.dev->type != ARPHRD_CAN || gwj->dst.dev->header_ops)
 		goto put_src_dst_out;
 
+	gwj->limit_hops = limhops;
+
 	ASSERT_RTNL();
 
 	err = cgw_register_filter(gwj);
@@ -867,13 +890,14 @@ static void cgw_remove_all_jobs(void)
 	}
 }
 
-static int cgw_remove_job(struct sk_buff *skb,  struct nlmsghdr *nlh)
+static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	struct cgw_job *gwj = NULL;
 	struct hlist_node *nx;
 	struct rtcanmsg *r;
 	struct cf_mod mod;
 	struct can_can_gw ccgw;
+	u8 limhops = 0;
 	int err = 0;
 
 	if (!capable(CAP_NET_ADMIN))
@@ -890,7 +914,7 @@ static int cgw_remove_job(struct sk_buff *skb,  struct nlmsghdr *nlh)
 	if (r->gwtype != CGW_TYPE_CAN_CAN)
 		return -EINVAL;
 
-	err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw);
+	err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops);
 	if (err < 0)
 		return err;
 
@@ -910,6 +934,9 @@ static int cgw_remove_job(struct sk_buff *skb,  struct nlmsghdr *nlh)
 		if (gwj->flags != r->flags)
 			continue;
 
+		if (gwj->limit_hops != limhops)
+			continue;
+
 		if (memcmp(&gwj->mod, &mod, sizeof(mod)))
 			continue;
 
-- 
cgit v1.2.3


From afe4fd062416b158a8a8538b23adc1930a9b88dc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 29 Aug 2013 15:49:55 -0700
Subject: pkt_sched: fq: Fair Queue packet scheduler

- Uses perfect flow match (not stochastic hash like SFQ/FQ_codel)
- Uses the new_flow/old_flow separation from FQ_codel
- New flows get an initial credit allowing IW10 without added delay.
- Special FIFO queue for high prio packets (no need for PRIO + FQ)
- Uses a hash table of RB trees to locate the flows at enqueue() time
- Smart on demand gc (at enqueue() time, RB tree lookup evicts old
  unused flows)
- Dynamic memory allocations.
- Designed to allow millions of concurrent flows per Qdisc.
- Small memory footprint : ~8K per Qdisc, and 104 bytes per flow.
- Single high resolution timer for throttled flows (if any).
- One RB tree to link throttled flows.
- Ability to have a max rate per flow. We might add a socket option
  to add per socket limitation.

Attempts have been made to add TCP pacing in TCP stack, but this
seems to add complex code to an already complex stack.

TCP pacing is welcomed for flows having idle times, as the cwnd
permits TCP stack to queue a possibly large number of packets.

This removes the 'slow start after idle' choice, hitting badly
large BDP flows, and applications delivering chunks of data
as video streams.

Nicely spaced packets :
Here interface is 10Gbit, but flow bottleneck is ~20Mbit

cwin is big, yet FQ avoids the typical bursts generated by TCP
(as in netperf TCP_RR -- -r 100000,100000)

15:01:23.545279 IP A > B: . 78193:81089(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.545394 IP B > A: . ack 81089 win 3668 <nop,nop,timestamp 11597985 1115>
15:01:23.546488 IP A > B: . 81089:83985(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.546565 IP B > A: . ack 83985 win 3668 <nop,nop,timestamp 11597986 1115>
15:01:23.547713 IP A > B: . 83985:86881(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.547778 IP B > A: . ack 86881 win 3668 <nop,nop,timestamp 11597987 1115>
15:01:23.548911 IP A > B: . 86881:89777(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.548949 IP B > A: . ack 89777 win 3668 <nop,nop,timestamp 11597988 1115>
15:01:23.550116 IP A > B: . 89777:92673(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.550182 IP B > A: . ack 92673 win 3668 <nop,nop,timestamp 11597989 1115>
15:01:23.551333 IP A > B: . 92673:95569(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.551406 IP B > A: . ack 95569 win 3668 <nop,nop,timestamp 11597991 1115>
15:01:23.552539 IP A > B: . 95569:98465(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.552576 IP B > A: . ack 98465 win 3668 <nop,nop,timestamp 11597992 1115>
15:01:23.553756 IP A > B: . 98465:99913(1448) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.554138 IP A > B: P 99913:100001(88) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.554204 IP B > A: . ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.554234 IP B > A: . 65248:68144(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.555620 IP B > A: . 68144:71040(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.557005 IP B > A: . 71040:73936(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.558390 IP B > A: . 73936:76832(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.559773 IP B > A: . 76832:79728(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.561158 IP B > A: . 79728:82624(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.562543 IP B > A: . 82624:85520(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.563928 IP B > A: . 85520:88416(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.565313 IP B > A: . 88416:91312(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.566698 IP B > A: . 91312:94208(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.568083 IP B > A: . 94208:97104(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.569467 IP B > A: . 97104:100000(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.570852 IP B > A: . 100000:102896(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.572237 IP B > A: . 102896:105792(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.573639 IP B > A: . 105792:108688(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.575024 IP B > A: . 108688:111584(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.576408 IP B > A: . 111584:114480(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.577793 IP B > A: . 114480:117376(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>

TCP timestamps show that most packets from B were queued in the same ms
timeframe (TSval 1159799{3,4}), but FQ managed to send them right
in time to avoid a big burst.

In slow start or steady state, very few packets are throttled [1]

FQ gets a bunch of tunables as :

  limit : max number of packets on whole Qdisc (default 10000)

  flow_limit : max number of packets per flow (default 100)

  quantum : the credit per RR round (default is 2 MTU)

  initial_quantum : initial credit for new flows (default is 10 MTU)

  maxrate : max per flow rate (default : unlimited)

  buckets : number of RB trees (default : 1024) in hash table.
               (consumes 8 bytes per bucket)

  [no]pacing : disable/enable pacing (default is enable)

All of them can be changed on a live qdisc.

$ tc qd add dev eth0 root fq help
Usage: ... fq [ limit PACKETS ] [ flow_limit PACKETS ]
              [ quantum BYTES ] [ initial_quantum BYTES ]
              [ maxrate RATE  ] [ buckets NUMBER ]
              [ [no]pacing ]

$ tc -s -d qd
qdisc fq 8002: dev eth0 root refcnt 32 limit 10000p flow_limit 100p buckets 256 quantum 3028 initial_quantum 15140
 Sent 216532416 bytes 148395 pkt (dropped 0, overlimits 0 requeues 14)
 backlog 0b 0p requeues 14
  511 flows, 511 inactive, 0 throttled
  110 gc, 0 highprio, 0 retrans, 1143 throttled, 0 flows_plimit

[1] Except if initial srtt is overestimated, as if using
cached srtt in tcp metrics. We'll provide a fix for this issue.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h |  41 +++
 net/sched/Kconfig              |  14 +
 net/sched/Makefile             |   1 +
 net/sched/sch_fq.c             | 792 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 848 insertions(+)
 create mode 100644 net/sched/sch_fq.c

(limited to 'net')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 09d62b9228ff..9b829134d422 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -744,4 +744,45 @@ struct tc_fq_codel_xstats {
 	};
 };
 
+/* FQ */
+
+enum {
+	TCA_FQ_UNSPEC,
+
+	TCA_FQ_PLIMIT,		/* limit of total number of packets in queue */
+
+	TCA_FQ_FLOW_PLIMIT,	/* limit of packets per flow */
+
+	TCA_FQ_QUANTUM,		/* RR quantum */
+
+	TCA_FQ_INITIAL_QUANTUM,		/* RR quantum for new flow */
+
+	TCA_FQ_RATE_ENABLE,	/* enable/disable rate limiting */
+
+	TCA_FQ_FLOW_DEFAULT_RATE,/* for sockets with unspecified sk_rate,
+				  * use the following rate
+				  */
+
+	TCA_FQ_FLOW_MAX_RATE,	/* per flow max rate */
+
+	TCA_FQ_BUCKETS_LOG,	/* log2(number of buckets) */
+	__TCA_FQ_MAX
+};
+
+#define TCA_FQ_MAX	(__TCA_FQ_MAX - 1)
+
+struct tc_fq_qd_stats {
+	__u64	gc_flows;
+	__u64	highprio_packets;
+	__u64	tcp_retrans;
+	__u64	throttled;
+	__u64	flows_plimit;
+	__u64	pkts_too_long;
+	__u64	allocation_errors;
+	__s64	time_next_delayed_flow;
+	__u32	flows;
+	__u32	inactive_flows;
+	__u32	throttled_flows;
+	__u32	pad;
+};
 #endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 235e01acac51..c03a32a0418e 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -272,6 +272,20 @@ config NET_SCH_FQ_CODEL
 
 	  If unsure, say N.
 
+config NET_SCH_FQ
+	tristate "Fair Queue"
+	help
+	  Say Y here if you want to use the FQ packet scheduling algorithm.
+
+	  FQ does flow separation, and is able to respect pacing requirements
+	  set by TCP stack into sk->sk_pacing_rate (for localy generated
+	  traffic)
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called sch_fq.
+
+	  If unsure, say N.
+
 config NET_SCH_INGRESS
 	tristate "Ingress Qdisc"
 	depends on NET_CLS_ACT
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 978cbf004e80..e5f9abe9a5db 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_NET_SCH_CHOKE)	+= sch_choke.o
 obj-$(CONFIG_NET_SCH_QFQ)	+= sch_qfq.o
 obj-$(CONFIG_NET_SCH_CODEL)	+= sch_codel.o
 obj-$(CONFIG_NET_SCH_FQ_CODEL)	+= sch_fq_codel.o
+obj-$(CONFIG_NET_SCH_FQ)	+= sch_fq.o
 
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
new file mode 100644
index 000000000000..91ceca7bcb52
--- /dev/null
+++ b/net/sched/sch_fq.c
@@ -0,0 +1,792 @@
+/*
+ * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
+ *
+ *  Copyright (C) 2013 Eric Dumazet <edumazet@google.com>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ *  Meant to be mostly used for localy generated traffic :
+ *  Fast classification depends on skb->sk being set before reaching us.
+ *  If not, (router workload), we use rxhash as fallback, with 32 bits wide hash.
+ *  All packets belonging to a socket are considered as a 'flow'.
+ *
+ *  Flows are dynamically allocated and stored in a hash table of RB trees
+ *  They are also part of one Round Robin 'queues' (new or old flows)
+ *
+ *  Burst avoidance (aka pacing) capability :
+ *
+ *  Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
+ *  bunch of packets, and this packet scheduler adds delay between
+ *  packets to respect rate limitation.
+ *
+ *  enqueue() :
+ *   - lookup one RB tree (out of 1024 or more) to find the flow.
+ *     If non existent flow, create it, add it to the tree.
+ *     Add skb to the per flow list of skb (fifo).
+ *   - Use a special fifo for high prio packets
+ *
+ *  dequeue() : serves flows in Round Robin
+ *  Note : When a flow becomes empty, we do not immediately remove it from
+ *  rb trees, for performance reasons (its expected to send additional packets,
+ *  or SLAB cache will reuse socket for another flow)
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+#include <linux/hash.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+
+/*
+ * Per flow structure, dynamically allocated
+ */
+struct fq_flow {
+	struct sk_buff	*head;		/* list of skbs for this flow : first skb */
+	union {
+		struct sk_buff *tail;	/* last skb in the list */
+		unsigned long  age;	/* jiffies when flow was emptied, for gc */
+	};
+	struct rb_node	fq_node; 	/* anchor in fq_root[] trees */
+	struct sock	*sk;
+	int		qlen;		/* number of packets in flow queue */
+	int		credit;
+	u32		socket_hash;	/* sk_hash */
+	struct fq_flow *next;		/* next pointer in RR lists, or &detached */
+
+	struct rb_node  rate_node;	/* anchor in q->delayed tree */
+	u64		time_next_packet;
+};
+
+struct fq_flow_head {
+	struct fq_flow *first;
+	struct fq_flow *last;
+};
+
+struct fq_sched_data {
+	struct fq_flow_head new_flows;
+
+	struct fq_flow_head old_flows;
+
+	struct rb_root	delayed;	/* for rate limited flows */
+	u64		time_next_delayed_flow;
+
+	struct fq_flow	internal;	/* for non classified or high prio packets */
+	u32		quantum;
+	u32		initial_quantum;
+	u32		flow_default_rate;/* rate per flow : bytes per second */
+	u32		flow_max_rate;	/* optional max rate per flow */
+	u32		flow_plimit;	/* max packets per flow */
+	struct rb_root	*fq_root;
+	u8		rate_enable;
+	u8		fq_trees_log;
+
+	u32		flows;
+	u32		inactive_flows;
+	u32		throttled_flows;
+
+	u64		stat_gc_flows;
+	u64		stat_internal_packets;
+	u64		stat_tcp_retrans;
+	u64		stat_throttled;
+	u64		stat_flows_plimit;
+	u64		stat_pkts_too_long;
+	u64		stat_allocation_errors;
+	struct qdisc_watchdog watchdog;
+};
+
+/* special value to mark a detached flow (not on old/new list) */
+static struct fq_flow detached, throttled;
+
+static void fq_flow_set_detached(struct fq_flow *f)
+{
+	f->next = &detached;
+}
+
+static bool fq_flow_is_detached(const struct fq_flow *f)
+{
+	return f->next == &detached;
+}
+
+static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
+{
+	struct rb_node **p = &q->delayed.rb_node, *parent = NULL;
+
+	while (*p) {
+		struct fq_flow *aux;
+
+		parent = *p;
+		aux = container_of(parent, struct fq_flow, rate_node);
+		if (f->time_next_packet >= aux->time_next_packet)
+			p = &parent->rb_right;
+		else
+			p = &parent->rb_left;
+	}
+	rb_link_node(&f->rate_node, parent, p);
+	rb_insert_color(&f->rate_node, &q->delayed);
+	q->throttled_flows++;
+	q->stat_throttled++;
+
+	f->next = &throttled;
+	if (q->time_next_delayed_flow > f->time_next_packet)
+		q->time_next_delayed_flow = f->time_next_packet;
+}
+
+
+static struct kmem_cache *fq_flow_cachep __read_mostly;
+
+static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
+{
+	if (head->first)
+		head->last->next = flow;
+	else
+		head->first = flow;
+	head->last = flow;
+	flow->next = NULL;
+}
+
+/* limit number of collected flows per round */
+#define FQ_GC_MAX 8
+#define FQ_GC_AGE (3*HZ)
+
+static bool fq_gc_candidate(const struct fq_flow *f)
+{
+	return fq_flow_is_detached(f) &&
+	       time_after(jiffies, f->age + FQ_GC_AGE);
+}
+
+static void fq_gc(struct fq_sched_data *q,
+		  struct rb_root *root,
+		  struct sock *sk)
+{
+	struct fq_flow *f, *tofree[FQ_GC_MAX];
+	struct rb_node **p, *parent;
+	int fcnt = 0;
+
+	p = &root->rb_node;
+	parent = NULL;
+	while (*p) {
+		parent = *p;
+
+		f = container_of(parent, struct fq_flow, fq_node);
+		if (f->sk == sk)
+			break;
+
+		if (fq_gc_candidate(f)) {
+			tofree[fcnt++] = f;
+			if (fcnt == FQ_GC_MAX)
+				break;
+		}
+
+		if (f->sk > sk)
+			p = &parent->rb_right;
+		else
+			p = &parent->rb_left;
+	}
+
+	q->flows -= fcnt;
+	q->inactive_flows -= fcnt;
+	q->stat_gc_flows += fcnt;
+	while (fcnt) {
+		struct fq_flow *f = tofree[--fcnt];
+
+		rb_erase(&f->fq_node, root);
+		kmem_cache_free(fq_flow_cachep, f);
+	}
+}
+
+static const u8 prio2band[TC_PRIO_MAX + 1] = {
+	1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
+};
+
+static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
+{
+	struct rb_node **p, *parent;
+	struct sock *sk = skb->sk;
+	struct rb_root *root;
+	struct fq_flow *f;
+	int band;
+
+	/* warning: no starvation prevention... */
+	band = prio2band[skb->priority & TC_PRIO_MAX];
+	if (unlikely(band == 0))
+		return &q->internal;
+
+	if (unlikely(!sk)) {
+		/* By forcing low order bit to 1, we make sure to not
+		 * collide with a local flow (socket pointers are word aligned)
+		 */
+		sk = (struct sock *)(skb_get_rxhash(skb) | 1L);
+	}
+
+	root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)];
+
+	if (q->flows >= (2U << q->fq_trees_log) &&
+	    q->inactive_flows > q->flows/2)
+		fq_gc(q, root, sk);
+
+	p = &root->rb_node;
+	parent = NULL;
+	while (*p) {
+		parent = *p;
+
+		f = container_of(parent, struct fq_flow, fq_node);
+		if (f->sk == sk) {
+			/* socket might have been reallocated, so check
+			 * if its sk_hash is the same.
+			 * It not, we need to refill credit with
+			 * initial quantum
+			 */
+			if (unlikely(skb->sk &&
+				     f->socket_hash != sk->sk_hash)) {
+				f->credit = q->initial_quantum;
+				f->socket_hash = sk->sk_hash;
+			}
+			return f;
+		}
+		if (f->sk > sk)
+			p = &parent->rb_right;
+		else
+			p = &parent->rb_left;
+	}
+
+	f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN);
+	if (unlikely(!f)) {
+		q->stat_allocation_errors++;
+		return &q->internal;
+	}
+	fq_flow_set_detached(f);
+	f->sk = sk;
+	if (skb->sk)
+		f->socket_hash = sk->sk_hash;
+	f->credit = q->initial_quantum;
+
+	rb_link_node(&f->fq_node, parent, p);
+	rb_insert_color(&f->fq_node, root);
+
+	q->flows++;
+	q->inactive_flows++;
+	return f;
+}
+
+
+/* remove one skb from head of flow queue */
+static struct sk_buff *fq_dequeue_head(struct fq_flow *flow)
+{
+	struct sk_buff *skb = flow->head;
+
+	if (skb) {
+		flow->head = skb->next;
+		skb->next = NULL;
+		flow->qlen--;
+	}
+	return skb;
+}
+
+/* We might add in the future detection of retransmits
+ * For the time being, just return false
+ */
+static bool skb_is_retransmit(struct sk_buff *skb)
+{
+	return false;
+}
+
+/* add skb to flow queue
+ * flow queue is a linked list, kind of FIFO, except for TCP retransmits
+ * We special case tcp retransmits to be transmitted before other packets.
+ * We rely on fact that TCP retransmits are unlikely, so we do not waste
+ * a separate queue or a pointer.
+ * head->  [retrans pkt 1]
+ *         [retrans pkt 2]
+ *         [ normal pkt 1]
+ *         [ normal pkt 2]
+ *         [ normal pkt 3]
+ * tail->  [ normal pkt 4]
+ */
+static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
+{
+	struct sk_buff *prev, *head = flow->head;
+
+	skb->next = NULL;
+	if (!head) {
+		flow->head = skb;
+		flow->tail = skb;
+		return;
+	}
+	if (likely(!skb_is_retransmit(skb))) {
+		flow->tail->next = skb;
+		flow->tail = skb;
+		return;
+	}
+
+	/* This skb is a tcp retransmit,
+	 * find the last retrans packet in the queue
+	 */
+	prev = NULL;
+	while (skb_is_retransmit(head)) {
+		prev = head;
+		head = head->next;
+		if (!head)
+			break;
+	}
+	if (!prev) { /* no rtx packet in queue, become the new head */
+		skb->next = flow->head;
+		flow->head = skb;
+	} else {
+		if (prev == flow->tail)
+			flow->tail = skb;
+		else
+			skb->next = prev->next;
+		prev->next = skb;
+	}
+}
+
+static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct fq_sched_data *q = qdisc_priv(sch);
+	struct fq_flow *f;
+
+	if (unlikely(sch->q.qlen >= sch->limit))
+		return qdisc_drop(skb, sch);
+
+	f = fq_classify(skb, q);
+	if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
+		q->stat_flows_plimit++;
+		return qdisc_drop(skb, sch);
+	}
+
+	f->qlen++;
+	flow_queue_add(f, skb);
+	if (skb_is_retransmit(skb))
+		q->stat_tcp_retrans++;
+	sch->qstats.backlog += qdisc_pkt_len(skb);
+	if (fq_flow_is_detached(f)) {
+		fq_flow_add_tail(&q->new_flows, f);
+		if (q->quantum > f->credit)
+			f->credit = q->quantum;
+		q->inactive_flows--;
+		qdisc_unthrottled(sch);
+	}
+	if (unlikely(f == &q->internal)) {
+		q->stat_internal_packets++;
+		qdisc_unthrottled(sch);
+	}
+	sch->q.qlen++;
+
+	return NET_XMIT_SUCCESS;
+}
+
+static void fq_check_throttled(struct fq_sched_data *q, u64 now)
+{
+	struct rb_node *p;
+
+	if (q->time_next_delayed_flow > now)
+		return;
+
+	q->time_next_delayed_flow = ~0ULL;
+	while ((p = rb_first(&q->delayed)) != NULL) {
+		struct fq_flow *f = container_of(p, struct fq_flow, rate_node);
+
+		if (f->time_next_packet > now) {
+			q->time_next_delayed_flow = f->time_next_packet;
+			break;
+		}
+		rb_erase(p, &q->delayed);
+		q->throttled_flows--;
+		fq_flow_add_tail(&q->old_flows, f);
+	}
+}
+
+static struct sk_buff *fq_dequeue(struct Qdisc *sch)
+{
+	struct fq_sched_data *q = qdisc_priv(sch);
+	u64 now = ktime_to_ns(ktime_get());
+	struct fq_flow_head *head;
+	struct sk_buff *skb;
+	struct fq_flow *f;
+
+	skb = fq_dequeue_head(&q->internal);
+	if (skb)
+		goto out;
+	fq_check_throttled(q, now);
+begin:
+	head = &q->new_flows;
+	if (!head->first) {
+		head = &q->old_flows;
+		if (!head->first) {
+			if (q->time_next_delayed_flow != ~0ULL)
+				qdisc_watchdog_schedule_ns(&q->watchdog,
+							   q->time_next_delayed_flow);
+			return NULL;
+		}
+	}
+	f = head->first;
+
+	if (f->credit <= 0) {
+		f->credit += q->quantum;
+		head->first = f->next;
+		fq_flow_add_tail(&q->old_flows, f);
+		goto begin;
+	}
+
+	if (unlikely(f->head && now < f->time_next_packet)) {
+		head->first = f->next;
+		fq_flow_set_throttled(q, f);
+		goto begin;
+	}
+
+	skb = fq_dequeue_head(f);
+	if (!skb) {
+		head->first = f->next;
+		/* force a pass through old_flows to prevent starvation */
+		if ((head == &q->new_flows) && q->old_flows.first) {
+			fq_flow_add_tail(&q->old_flows, f);
+		} else {
+			fq_flow_set_detached(f);
+			f->age = jiffies;
+			q->inactive_flows++;
+		}
+		goto begin;
+	}
+	f->time_next_packet = now;
+	f->credit -= qdisc_pkt_len(skb);
+
+	if (f->credit <= 0 &&
+	    q->rate_enable &&
+	    skb->sk && skb->sk->sk_state != TCP_TIME_WAIT) {
+		u32 rate = skb->sk->sk_pacing_rate ?: q->flow_default_rate;
+
+		rate = min(rate, q->flow_max_rate);
+		if (rate) {
+			u64 len = (u64)qdisc_pkt_len(skb) * NSEC_PER_SEC;
+
+			do_div(len, rate);
+			/* Since socket rate can change later,
+			 * clamp the delay to 125 ms.
+			 * TODO: maybe segment the too big skb, as in commit
+			 * e43ac79a4bc ("sch_tbf: segment too big GSO packets")
+			 */
+			if (unlikely(len > 125 * NSEC_PER_MSEC)) {
+				len = 125 * NSEC_PER_MSEC;
+				q->stat_pkts_too_long++;
+			}
+
+			f->time_next_packet = now + len;
+		}
+	}
+out:
+	prefetch(&skb->end);
+	sch->qstats.backlog -= qdisc_pkt_len(skb);
+	qdisc_bstats_update(sch, skb);
+	sch->q.qlen--;
+	qdisc_unthrottled(sch);
+	return skb;
+}
+
+static void fq_reset(struct Qdisc *sch)
+{
+	struct sk_buff *skb;
+
+	while ((skb = fq_dequeue(sch)) != NULL)
+		kfree_skb(skb);
+}
+
+static void fq_rehash(struct fq_sched_data *q,
+		      struct rb_root *old_array, u32 old_log,
+		      struct rb_root *new_array, u32 new_log)
+{
+	struct rb_node *op, **np, *parent;
+	struct rb_root *oroot, *nroot;
+	struct fq_flow *of, *nf;
+	int fcnt = 0;
+	u32 idx;
+
+	for (idx = 0; idx < (1U << old_log); idx++) {
+		oroot = &old_array[idx];
+		while ((op = rb_first(oroot)) != NULL) {
+			rb_erase(op, oroot);
+			of = container_of(op, struct fq_flow, fq_node);
+			if (fq_gc_candidate(of)) {
+				fcnt++;
+				kmem_cache_free(fq_flow_cachep, of);
+				continue;
+			}
+			nroot = &new_array[hash_32((u32)(long)of->sk, new_log)];
+
+			np = &nroot->rb_node;
+			parent = NULL;
+			while (*np) {
+				parent = *np;
+
+				nf = container_of(parent, struct fq_flow, fq_node);
+				BUG_ON(nf->sk == of->sk);
+
+				if (nf->sk > of->sk)
+					np = &parent->rb_right;
+				else
+					np = &parent->rb_left;
+			}
+
+			rb_link_node(&of->fq_node, parent, np);
+			rb_insert_color(&of->fq_node, nroot);
+		}
+	}
+	q->flows -= fcnt;
+	q->inactive_flows -= fcnt;
+	q->stat_gc_flows += fcnt;
+}
+
+static int fq_resize(struct fq_sched_data *q, u32 log)
+{
+	struct rb_root *array;
+	u32 idx;
+
+	if (q->fq_root && log == q->fq_trees_log)
+		return 0;
+
+	array = kmalloc(sizeof(struct rb_root) << log, GFP_KERNEL);
+	if (!array)
+		return -ENOMEM;
+
+	for (idx = 0; idx < (1U << log); idx++)
+		array[idx] = RB_ROOT;
+
+	if (q->fq_root) {
+		fq_rehash(q, q->fq_root, q->fq_trees_log, array, log);
+		kfree(q->fq_root);
+	}
+	q->fq_root = array;
+	q->fq_trees_log = log;
+
+	return 0;
+}
+
+static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
+	[TCA_FQ_PLIMIT]			= { .type = NLA_U32 },
+	[TCA_FQ_FLOW_PLIMIT]		= { .type = NLA_U32 },
+	[TCA_FQ_QUANTUM]		= { .type = NLA_U32 },
+	[TCA_FQ_INITIAL_QUANTUM]	= { .type = NLA_U32 },
+	[TCA_FQ_RATE_ENABLE]		= { .type = NLA_U32 },
+	[TCA_FQ_FLOW_DEFAULT_RATE]	= { .type = NLA_U32 },
+	[TCA_FQ_FLOW_MAX_RATE]		= { .type = NLA_U32 },
+	[TCA_FQ_BUCKETS_LOG]		= { .type = NLA_U32 },
+};
+
+static int fq_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct fq_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_FQ_MAX + 1];
+	int err, drop_count = 0;
+	u32 fq_log;
+
+	if (!opt)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy);
+	if (err < 0)
+		return err;
+
+	sch_tree_lock(sch);
+
+	fq_log = q->fq_trees_log;
+
+	if (tb[TCA_FQ_BUCKETS_LOG]) {
+		u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);
+
+		if (nval >= 1 && nval <= ilog2(256*1024))
+			fq_log = nval;
+		else
+			err = -EINVAL;
+	}
+	if (tb[TCA_FQ_PLIMIT])
+		sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);
+
+	if (tb[TCA_FQ_FLOW_PLIMIT])
+		q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);
+
+	if (tb[TCA_FQ_QUANTUM])
+		q->quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);
+
+	if (tb[TCA_FQ_INITIAL_QUANTUM])
+		q->quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);
+
+	if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
+		q->flow_default_rate = nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]);
+
+	if (tb[TCA_FQ_FLOW_MAX_RATE])
+		q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
+
+	if (tb[TCA_FQ_RATE_ENABLE]) {
+		u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
+
+		if (enable <= 1)
+			q->rate_enable = enable;
+		else
+			err = -EINVAL;
+	}
+
+	if (!err)
+		err = fq_resize(q, fq_log);
+
+	while (sch->q.qlen > sch->limit) {
+		struct sk_buff *skb = fq_dequeue(sch);
+
+		kfree_skb(skb);
+		drop_count++;
+	}
+	qdisc_tree_decrease_qlen(sch, drop_count);
+
+	sch_tree_unlock(sch);
+	return err;
+}
+
+static void fq_destroy(struct Qdisc *sch)
+{
+	struct fq_sched_data *q = qdisc_priv(sch);
+	struct rb_root *root;
+	struct rb_node *p;
+	unsigned int idx;
+
+	if (q->fq_root) {
+		for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
+			root = &q->fq_root[idx];
+			while ((p = rb_first(root)) != NULL) {
+				rb_erase(p, root);
+				kmem_cache_free(fq_flow_cachep,
+						container_of(p, struct fq_flow, fq_node));
+			}
+		}
+		kfree(q->fq_root);
+	}
+	qdisc_watchdog_cancel(&q->watchdog);
+}
+
+static int fq_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct fq_sched_data *q = qdisc_priv(sch);
+	int err;
+
+	sch->limit		= 10000;
+	q->flow_plimit		= 100;
+	q->quantum		= 2 * psched_mtu(qdisc_dev(sch));
+	q->initial_quantum	= 10 * psched_mtu(qdisc_dev(sch));
+	q->flow_default_rate	= 0;
+	q->flow_max_rate	= ~0U;
+	q->rate_enable		= 1;
+	q->new_flows.first	= NULL;
+	q->old_flows.first	= NULL;
+	q->delayed		= RB_ROOT;
+	q->fq_root		= NULL;
+	q->fq_trees_log		= ilog2(1024);
+	qdisc_watchdog_init(&q->watchdog, sch);
+
+	if (opt)
+		err = fq_change(sch, opt);
+	else
+		err = fq_resize(q, q->fq_trees_log);
+
+	return err;
+}
+
+static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct fq_sched_data *q = qdisc_priv(sch);
+	struct nlattr *opts;
+
+	opts = nla_nest_start(skb, TCA_OPTIONS);
+	if (opts == NULL)
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
+	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
+	    nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
+	    nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
+	    nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
+	    nla_put_u32(skb, TCA_FQ_FLOW_DEFAULT_RATE, q->flow_default_rate) ||
+	    nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) ||
+	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
+		goto nla_put_failure;
+
+	nla_nest_end(skb, opts);
+	return skb->len;
+
+nla_put_failure:
+	return -1;
+}
+
+static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+	struct fq_sched_data *q = qdisc_priv(sch);
+	u64 now = ktime_to_ns(ktime_get());
+	struct tc_fq_qd_stats st = {
+		.gc_flows		= q->stat_gc_flows,
+		.highprio_packets	= q->stat_internal_packets,
+		.tcp_retrans		= q->stat_tcp_retrans,
+		.throttled		= q->stat_throttled,
+		.flows_plimit		= q->stat_flows_plimit,
+		.pkts_too_long		= q->stat_pkts_too_long,
+		.allocation_errors	= q->stat_allocation_errors,
+		.flows			= q->flows,
+		.inactive_flows		= q->inactive_flows,
+		.throttled_flows	= q->throttled_flows,
+		.time_next_delayed_flow	= q->time_next_delayed_flow - now,
+	};
+
+	return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
+	.id		=	"fq",
+	.priv_size	=	sizeof(struct fq_sched_data),
+
+	.enqueue	=	fq_enqueue,
+	.dequeue	=	fq_dequeue,
+	.peek		=	qdisc_peek_dequeued,
+	.init		=	fq_init,
+	.reset		=	fq_reset,
+	.destroy	=	fq_destroy,
+	.change		=	fq_change,
+	.dump		=	fq_dump,
+	.dump_stats	=	fq_dump_stats,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init fq_module_init(void)
+{
+	int ret;
+
+	fq_flow_cachep = kmem_cache_create("fq_flow_cache",
+					   sizeof(struct fq_flow),
+					   0, 0, NULL);
+	if (!fq_flow_cachep)
+		return -ENOMEM;
+
+	ret = register_qdisc(&fq_qdisc_ops);
+	if (ret)
+		kmem_cache_destroy(fq_flow_cachep);
+	return ret;
+}
+
+static void __exit fq_module_exit(void)
+{
+	unregister_qdisc(&fq_qdisc_ops);
+	kmem_cache_destroy(fq_flow_cachep);
+}
+
+module_init(fq_module_init)
+module_exit(fq_module_exit)
+MODULE_AUTHOR("Eric Dumazet");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 08f89b981b0e6b46058dae2957241d4099129521 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 30 Aug 2013 09:46:43 -0700
Subject: pkt_sched: fq: prefetch() fix

kbuild bot reported following m68k build error :

  net/sched/sch_fq.c: In function 'fq_dequeue':
>> net/sched/sch_fq.c:491:2: error: implicit declaration of function
'prefetch' [-Werror=implicit-function-declaration]
   cc1: some warnings being treated as errors

While we are fixing this, move this prefetch() call a bit earlier.

Reported-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_fq.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 91ceca7bcb52..32ad015ee8ce 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -46,6 +46,7 @@
 #include <linux/slab.h>
 #include <linux/rbtree.h>
 #include <linux/hash.h>
+#include <linux/prefetch.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
 #include <net/sock.h>
@@ -461,6 +462,7 @@ begin:
 		}
 		goto begin;
 	}
+	prefetch(&skb->end);
 	f->time_next_packet = now;
 	f->credit -= qdisc_pkt_len(skb);
 
@@ -488,7 +490,6 @@ begin:
 		}
 	}
 out:
-	prefetch(&skb->end);
 	sch->qstats.backlog -= qdisc_pkt_len(skb);
 	qdisc_bstats_update(sch, skb);
 	sch->q.qlen--;
-- 
cgit v1.2.3


From 1b7fdd2ab5852717a4fc7d79847759c67065d7e9 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Fri, 30 Aug 2013 08:35:53 -0700
Subject: tcp: do not use cached RTT for RTT estimation

RTT cached in the TCP metrics are valuable for the initial timeout
because SYN RTT usually does not account for serialization delays
on low BW path.

However using it to seed the RTT estimator maybe disruptive because
other components (e.g., pacing) require the smooth RTT to be obtained
from actual connection.

The solution is to use the higher cached RTT to set the first RTO
conservatively like tcp_rtt_estimator(), but avoid seeding the other
RTT estimator variables such as srtt.  It is also a good idea to
keep RTO conservative to obtain the first RTT sample, and the
performance is insured by TCP loss probe if SYN RTT is available.

To keep the seeding formula consistent across SYN RTT and cached RTT,
the rttvar is twice the cached RTT instead of cached RTTVAR value. The
reason is because cached variation may be too small (near min RTO)
which defeats the purpose of being conservative on first RTO. However
the metrics still keep the RTT variations as they might be useful for
user applications (through ip).

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Tested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_metrics.c | 44 +++++++++++---------------------------------
 1 file changed, 11 insertions(+), 33 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index f6a005c485a9..273ed735cca2 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -443,7 +443,7 @@ void tcp_init_metrics(struct sock *sk)
 	struct dst_entry *dst = __sk_dst_get(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_metrics_block *tm;
-	u32 val;
+	u32 val, crtt = 0; /* cached RTT scaled by 8 */
 
 	if (dst == NULL)
 		goto reset;
@@ -478,40 +478,18 @@ void tcp_init_metrics(struct sock *sk)
 		tp->reordering = val;
 	}
 
-	val = tcp_metric_get(tm, TCP_METRIC_RTT);
-	if (val == 0 || tp->srtt == 0) {
-		rcu_read_unlock();
-		goto reset;
-	}
-	/* Initial rtt is determined from SYN,SYN-ACK.
-	 * The segment is small and rtt may appear much
-	 * less than real one. Use per-dst memory
-	 * to make it more realistic.
-	 *
-	 * A bit of theory. RTT is time passed after "normal" sized packet
-	 * is sent until it is ACKed. In normal circumstances sending small
-	 * packets force peer to delay ACKs and calculation is correct too.
-	 * The algorithm is adaptive and, provided we follow specs, it
-	 * NEVER underestimate RTT. BUT! If peer tries to make some clever
-	 * tricks sort of "quick acks" for time long enough to decrease RTT
-	 * to low value, and then abruptly stops to do it and starts to delay
-	 * ACKs, wait for troubles.
-	 */
-	val = msecs_to_jiffies(val);
-	if (val > tp->srtt) {
-		tp->srtt = val;
-		tp->rtt_seq = tp->snd_nxt;
-	}
-	val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
-	if (val > tp->mdev) {
-		tp->mdev = val;
-		tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
-	}
+	crtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
 	rcu_read_unlock();
-
-	tcp_set_rto(sk);
 reset:
-	if (tp->srtt == 0) {
+	if (crtt > tp->srtt) {
+		/* Initial RTT (tp->srtt) from SYN usually don't measure
+		 * serialization delay on low BW links well so RTO may be
+		 * under-estimated. Stay conservative and seed RTO with
+		 * the RTTs from past data exchanges, using the same seeding
+		 * formula in tcp_rtt_estimator().
+		 */
+		inet_csk(sk)->icsk_rto = crtt + max(crtt >> 2, tcp_rto_min(sk));
+	} else if (tp->srtt == 0) {
 		/* RFC6298: 5.7 We've failed to get a valid RTT sample from
 		 * 3WHS. This is most likely due to retransmission,
 		 * including spurious one. Reset the RTO back to 3secs
-- 
cgit v1.2.3


From 816c5b5b016acde51f0547cf1657d87cf22a6d47 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Wed, 28 Aug 2013 12:05:38 +0200
Subject: ipv6: Remove redundant sk variable

A sk variable initialized to ndisc_sk is already available outside
of the branch.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ndisc.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 41720feeaa64..bb6fd95a5cd1 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -428,7 +428,6 @@ static void ndisc_send_skb(struct sk_buff *skb,
 	type = icmp6h->icmp6_type;
 
 	if (!dst) {
-		struct sock *sk = net->ipv6.ndisc_sk;
 		struct flowi6 fl6;
 
 		icmpv6_flow_init(sk, &fl6, type, saddr, daddr, skb->dev->ifindex);
-- 
cgit v1.2.3


From 6da7c8fcbcbdb50ec68c61b40d554c74850fdb91 Mon Sep 17 00:00:00 2001
From: stephen hemminger <stephen@networkplumber.org>
Date: Tue, 27 Aug 2013 16:19:08 -0700
Subject: qdisc: allow setting default queuing discipline

By default, the pfifo_fast queue discipline has been used by default
for all devices. But we have better choices now.

This patch allow setting the default queueing discipline with sysctl.
This allows easy use of better queueing disciplines on all devices
without having to use tc qdisc scripts. It is intended to allow
an easy path for distributions to make fq_codel or sfq the default
qdisc.

This patch also makes pfifo_fast more of a first class qdisc, since
it is now possible to manually override the default and explicitly
use pfifo_fast. The behavior for systems who do not use the sysctl
is unchanged, they still get pfifo_fast

Also removes leftover random # in sysctl net core.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/sysctl/net.txt | 13 ++++++++++
 include/net/pkt_sched.h      |  3 +++
 include/net/sch_generic.h    |  1 +
 net/core/sysctl_net_core.c   | 30 ++++++++++++++++++++++-
 net/sched/sch_api.c          | 58 ++++++++++++++++++++++++++++++++++++++++++++
 net/sched/sch_generic.c      | 11 +++++----
 net/sched/sch_mq.c           |  2 +-
 net/sched/sch_mqprio.c       |  2 +-
 8 files changed, 112 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index d569f2a424d5..9a0319a82470 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -50,6 +50,19 @@ The maximum number of packets that kernel can handle on a NAPI interrupt,
 it's a Per-CPU variable.
 Default: 64
 
+default_qdisc
+--------------
+
+The default queuing discipline to use for network devices. This allows
+overriding the default queue discipline of pfifo_fast with an
+alternative. Since the default queuing discipline is created with the
+no additional parameters so is best suited to queuing disciplines that
+work well without configuration like stochastic fair queue (sfq),
+CoDel (codel) or fair queue CoDel (fq_codel). Don't use queuing disciplines
+like Hierarchical Token Bucket or Deficit Round Robin which require setting
+up classes and bandwidths.
+Default: pfifo_fast
+
 busy_read
 ----------------
 Low latency busy poll timeout for socket reads. (needs CONFIG_NET_RX_BUSY_POLL)
diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index f7c24f8fbdc5..59ec3cd15d68 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -85,6 +85,9 @@ struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops,
 
 int register_qdisc(struct Qdisc_ops *qops);
 int unregister_qdisc(struct Qdisc_ops *qops);
+void qdisc_get_default(char *id, size_t len);
+int qdisc_set_default(const char *id);
+
 void qdisc_list_del(struct Qdisc *q);
 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle);
 struct Qdisc *qdisc_lookup_class(struct net_device *dev, u32 handle);
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 76368c9d4503..2e31bf14df00 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -316,6 +316,7 @@ extern struct Qdisc noop_qdisc;
 extern struct Qdisc_ops noop_qdisc_ops;
 extern struct Qdisc_ops pfifo_fast_ops;
 extern struct Qdisc_ops mq_qdisc_ops;
+extern const struct Qdisc_ops *default_qdisc_ops;
 
 struct Qdisc_class_common {
 	u32			classid;
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 31107abd2783..cca444190907 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -20,6 +20,7 @@
 #include <net/sock.h>
 #include <net/net_ratelimit.h>
 #include <net/busy_poll.h>
+#include <net/pkt_sched.h>
 
 static int zero = 0;
 static int one = 1;
@@ -193,6 +194,26 @@ static int flow_limit_table_len_sysctl(struct ctl_table *table, int write,
 }
 #endif /* CONFIG_NET_FLOW_LIMIT */
 
+#ifdef CONFIG_NET_SCHED
+static int set_default_qdisc(struct ctl_table *table, int write,
+			     void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	char id[IFNAMSIZ];
+	struct ctl_table tbl = {
+		.data = id,
+		.maxlen = IFNAMSIZ,
+	};
+	int ret;
+
+	qdisc_get_default(id, IFNAMSIZ);
+
+	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+	if (write && ret == 0)
+		ret = qdisc_set_default(id);
+	return ret;
+}
+#endif
+
 static struct ctl_table net_core_table[] = {
 #ifdef CONFIG_NET
 	{
@@ -315,7 +336,14 @@ static struct ctl_table net_core_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
-#
+#endif
+#ifdef CONFIG_NET_SCHED
+	{
+		.procname	= "default_qdisc",
+		.mode		= 0644,
+		.maxlen		= IFNAMSIZ,
+		.proc_handler	= set_default_qdisc
+	},
 #endif
 #endif /* CONFIG_NET */
 	{
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 51b968d3febb..812e57900591 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -131,6 +131,11 @@ static DEFINE_RWLOCK(qdisc_mod_lock);
  ************************************************/
 
 
+/* Qdisc to use by default */
+
+const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
+EXPORT_SYMBOL(default_qdisc_ops);
+
 /* The list of all installed queueing disciplines. */
 
 static struct Qdisc_ops *qdisc_base;
@@ -200,6 +205,58 @@ int unregister_qdisc(struct Qdisc_ops *qops)
 }
 EXPORT_SYMBOL(unregister_qdisc);
 
+/* Get default qdisc if not otherwise specified */
+void qdisc_get_default(char *name, size_t len)
+{
+	read_lock(&qdisc_mod_lock);
+	strlcpy(name, default_qdisc_ops->id, len);
+	read_unlock(&qdisc_mod_lock);
+}
+
+static struct Qdisc_ops *qdisc_lookup_default(const char *name)
+{
+	struct Qdisc_ops *q = NULL;
+
+	for (q = qdisc_base; q; q = q->next) {
+		if (!strcmp(name, q->id)) {
+			if (!try_module_get(q->owner))
+				q = NULL;
+			break;
+		}
+	}
+
+	return q;
+}
+
+/* Set new default qdisc to use */
+int qdisc_set_default(const char *name)
+{
+	const struct Qdisc_ops *ops;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	write_lock(&qdisc_mod_lock);
+	ops = qdisc_lookup_default(name);
+	if (!ops) {
+		/* Not found, drop lock and try to load module */
+		write_unlock(&qdisc_mod_lock);
+		request_module("sch_%s", name);
+		write_lock(&qdisc_mod_lock);
+
+		ops = qdisc_lookup_default(name);
+	}
+
+	if (ops) {
+		/* Set new default */
+		module_put(default_qdisc_ops->owner);
+		default_qdisc_ops = ops;
+	}
+	write_unlock(&qdisc_mod_lock);
+
+	return ops ? 0 : -ENOENT;
+}
+
 /* We know handle. Find qdisc among all qdisc's attached to device
    (root qdisc, all its children, children of children etc.)
  */
@@ -1854,6 +1911,7 @@ static int __init pktsched_init(void)
 		return err;
 	}
 
+	register_qdisc(&pfifo_fast_ops);
 	register_qdisc(&pfifo_qdisc_ops);
 	register_qdisc(&bfifo_qdisc_ops);
 	register_qdisc(&pfifo_head_drop_qdisc_ops);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 48be3d5c0d92..5078e0c5db8d 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -530,7 +530,6 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = {
 	.dump		=	pfifo_fast_dump,
 	.owner		=	THIS_MODULE,
 };
-EXPORT_SYMBOL(pfifo_fast_ops);
 
 static struct lock_class_key qdisc_tx_busylock;
 
@@ -583,6 +582,9 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
 {
 	struct Qdisc *sch;
 
+	if (!try_module_get(ops->owner))
+		goto errout;
+
 	sch = qdisc_alloc(dev_queue, ops);
 	if (IS_ERR(sch))
 		goto errout;
@@ -686,7 +688,7 @@ static void attach_one_default_qdisc(struct net_device *dev,
 
 	if (dev->tx_queue_len) {
 		qdisc = qdisc_create_dflt(dev_queue,
-					  &pfifo_fast_ops, TC_H_ROOT);
+					  default_qdisc_ops, TC_H_ROOT);
 		if (!qdisc) {
 			netdev_info(dev, "activation failed\n");
 			return;
@@ -739,9 +741,8 @@ void dev_activate(struct net_device *dev)
 	int need_watchdog;
 
 	/* No queueing discipline is attached to device;
-	   create default one i.e. pfifo_fast for devices,
-	   which need queueing and noqueue_qdisc for
-	   virtual interfaces
+	 * create default one for devices, which need queueing
+	 * and noqueue_qdisc for virtual interfaces
 	 */
 
 	if (dev->qdisc == &noop_qdisc)
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index 5da78a19ac9a..2e56185736d6 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -57,7 +57,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
 
 	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
 		dev_queue = netdev_get_tx_queue(dev, ntx);
-		qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops,
+		qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops,
 					  TC_H_MAKE(TC_H_MAJ(sch->handle),
 						    TC_H_MIN(ntx + 1)));
 		if (qdisc == NULL)
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index accec33c454c..d44c868cb537 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -124,7 +124,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 
 	for (i = 0; i < dev->num_tx_queues; i++) {
 		dev_queue = netdev_get_tx_queue(dev, i);
-		qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops,
+		qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops,
 					  TC_H_MAKE(TC_H_MAJ(sch->handle),
 						    TC_H_MIN(i + 1)));
 		if (qdisc == NULL) {
-- 
cgit v1.2.3


From d2a7f269f91299b8e5f6c7d2a41ba46248d73d24 Mon Sep 17 00:00:00 2001
From: stephen hemminger <stephen@networkplumber.org>
Date: Sat, 31 Aug 2013 10:15:50 -0700
Subject: qdisc: make args to qdisc_create_default const

Fixes warnings introduced by the qdisc default patch.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 4 ++--
 net/sched/sch_generic.c   | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 2e31bf14df00..f4eb365f7dcd 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -370,9 +370,9 @@ void qdisc_reset(struct Qdisc *qdisc);
 void qdisc_destroy(struct Qdisc *qdisc);
 void qdisc_tree_decrease_qlen(struct Qdisc *qdisc, unsigned int n);
 struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
-			  struct Qdisc_ops *ops);
+			  const struct Qdisc_ops *ops);
 struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
-				struct Qdisc_ops *ops, u32 parentid);
+				const struct Qdisc_ops *ops, u32 parentid);
 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
 			       const struct qdisc_size_table *stab);
 void tcf_destroy(struct tcf_proto *tp);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 5078e0c5db8d..6b021106542c 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -534,7 +534,7 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = {
 static struct lock_class_key qdisc_tx_busylock;
 
 struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
-			  struct Qdisc_ops *ops)
+			  const struct Qdisc_ops *ops)
 {
 	void *p;
 	struct Qdisc *sch;
@@ -578,7 +578,8 @@ errout:
 }
 
 struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
-				struct Qdisc_ops *ops, unsigned int parentid)
+				const struct Qdisc_ops *ops,
+				unsigned int parentid)
 {
 	struct Qdisc *sch;
 
-- 
cgit v1.2.3


From 34aedd3f3b289edba118e66450e95790ccab5091 Mon Sep 17 00:00:00 2001
From: stephen hemminger <stephen@networkplumber.org>
Date: Sat, 31 Aug 2013 10:15:33 -0700
Subject: qdisc: fix build with !CONFIG_NET_SCHED

Multiqueue scheduler refers to default_qdisc_ops; therefore the
variable definition needs to be moved to handle case where net
scheduler API is not available.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_api.c     | 5 -----
 net/sched/sch_generic.c | 4 ++++
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 812e57900591..2adda7fa2d39 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -131,11 +131,6 @@ static DEFINE_RWLOCK(qdisc_mod_lock);
  ************************************************/
 
 
-/* Qdisc to use by default */
-
-const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
-EXPORT_SYMBOL(default_qdisc_ops);
-
 /* The list of all installed queueing disciplines. */
 
 static struct Qdisc_ops *qdisc_base;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 6b021106542c..a74e278654aa 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -30,6 +30,10 @@
 #include <net/pkt_sched.h>
 #include <net/dst.h>
 
+/* Qdisc to use by default */
+const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
+EXPORT_SYMBOL(default_qdisc_ops);
+
 /* Main transmission queue. */
 
 /* Modifications to data participating in scheduling must be protected with
-- 
cgit v1.2.3


From 3ce9b35ff6de8dfebb0b0045e667c000f632e563 Mon Sep 17 00:00:00 2001
From: Cong Wang <amwang@redhat.com>
Date: Sat, 31 Aug 2013 13:44:28 +0800
Subject: ipv6: move ip6_dst_hoplimit() into core kernel

It will be used by vxlan, and may not be inlined.

Cc: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Cong Wang <amwang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h |  2 --
 include/net/ipv6.h      |  2 ++
 net/ipv6/output_core.c  | 22 ++++++++++++++++++++++
 net/ipv6/route.c        | 19 -------------------
 4 files changed, 24 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index f667248202b6..f525e7038cca 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -112,8 +112,6 @@ extern struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
 					   const struct in6_addr *addr,
 					   bool anycast);
 
-extern int			ip6_dst_hoplimit(struct dst_entry *dst);
-
 /*
  *	support functions for ND
  *
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 7bdff0455065..bbf1c8fb8511 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -658,6 +658,8 @@ static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_add
 
 extern void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt);
 
+extern int ip6_dst_hoplimit(struct dst_entry *dst);
+
 /*
  *	Header manipulation
  */
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index ab92a3673fbb..53a062187db1 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -5,6 +5,7 @@
 #include <linux/export.h>
 #include <net/ipv6.h>
 #include <net/ip6_fib.h>
+#include <net/addrconf.h>
 
 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
 {
@@ -75,3 +76,24 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
 	return offset;
 }
 EXPORT_SYMBOL(ip6_find_1stfragopt);
+
+#if IS_ENABLED(CONFIG_IPV6)
+int ip6_dst_hoplimit(struct dst_entry *dst)
+{
+	int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
+	if (hoplimit == 0) {
+		struct net_device *dev = dst->dev;
+		struct inet6_dev *idev;
+
+		rcu_read_lock();
+		idev = __in6_dev_get(dev);
+		if (idev)
+			hoplimit = idev->cnf.hop_limit;
+		else
+			hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
+		rcu_read_unlock();
+	}
+	return hoplimit;
+}
+EXPORT_SYMBOL(ip6_dst_hoplimit);
+#endif
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 55236a84c748..b770085ae36d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1354,25 +1354,6 @@ out:
 	return entries > rt_max_size;
 }
 
-int ip6_dst_hoplimit(struct dst_entry *dst)
-{
-	int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
-	if (hoplimit == 0) {
-		struct net_device *dev = dst->dev;
-		struct inet6_dev *idev;
-
-		rcu_read_lock();
-		idev = __in6_dev_get(dev);
-		if (idev)
-			hoplimit = idev->cnf.hop_limit;
-		else
-			hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
-		rcu_read_unlock();
-	}
-	return hoplimit;
-}
-EXPORT_SYMBOL(ip6_dst_hoplimit);
-
 /*
  *
  */
-- 
cgit v1.2.3


From 788787b55913d456b9851d926d9a80ff00892c34 Mon Sep 17 00:00:00 2001
From: Cong Wang <amwang@redhat.com>
Date: Sat, 31 Aug 2013 13:44:29 +0800
Subject: ipv6: move ip6_local_out into core kernel

It will be used the vxlan kernel module.

Signed-off-by: Cong Wang <amwang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_output.c  | 25 -------------------------
 net/ipv6/output_core.c | 26 ++++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 25 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 6e3ddf806ec2..dd08cfd8999e 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -56,31 +56,6 @@
 #include <net/checksum.h>
 #include <linux/mroute6.h>
 
-int __ip6_local_out(struct sk_buff *skb)
-{
-	int len;
-
-	len = skb->len - sizeof(struct ipv6hdr);
-	if (len > IPV6_MAXPLEN)
-		len = 0;
-	ipv6_hdr(skb)->payload_len = htons(len);
-
-	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
-		       skb_dst(skb)->dev, dst_output);
-}
-
-int ip6_local_out(struct sk_buff *skb)
-{
-	int err;
-
-	err = __ip6_local_out(skb);
-	if (likely(err == 1))
-		err = dst_output(skb);
-
-	return err;
-}
-EXPORT_SYMBOL_GPL(ip6_local_out);
-
 static int ip6_finish_output2(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index 53a062187db1..827f795209cf 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -97,3 +97,29 @@ int ip6_dst_hoplimit(struct dst_entry *dst)
 }
 EXPORT_SYMBOL(ip6_dst_hoplimit);
 #endif
+
+int __ip6_local_out(struct sk_buff *skb)
+{
+	int len;
+
+	len = skb->len - sizeof(struct ipv6hdr);
+	if (len > IPV6_MAXPLEN)
+		len = 0;
+	ipv6_hdr(skb)->payload_len = htons(len);
+
+	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
+		       skb_dst(skb)->dev, dst_output);
+}
+EXPORT_SYMBOL_GPL(__ip6_local_out);
+
+int ip6_local_out(struct sk_buff *skb)
+{
+	int err;
+
+	err = __ip6_local_out(skb);
+	if (likely(err == 1))
+		err = dst_output(skb);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(ip6_local_out);
-- 
cgit v1.2.3


From 5f81bd2e5d804ca93f3ec8873451b22d2f454721 Mon Sep 17 00:00:00 2001
From: Cong Wang <amwang@redhat.com>
Date: Sat, 31 Aug 2013 13:44:30 +0800
Subject: ipv6: export a stub for IPv6 symbols used by vxlan

In case IPv6 is compiled as a module, introduce a stub
for ipv6_sock_mc_join and ipv6_sock_mc_drop etc.. It will be used
by vxlan module. Suggested by Ben.

This is an ugly but easy solution for now.

Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Cong Wang <amwang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/addrconf.h   | 15 +++++++++++++++
 net/ipv6/addrconf_core.c |  3 +++
 net/ipv6/af_inet6.c      | 11 +++++++++++
 3 files changed, 29 insertions(+)

(limited to 'net')

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 43fa31a610b8..5339cab356bb 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -141,6 +141,21 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group,
 			 const struct in6_addr *src_addr);
 
 void ipv6_mc_dad_complete(struct inet6_dev *idev);
+
+/* A stub used by vxlan module. This is ugly, ideally these
+ * symbols should be built into the core kernel.
+ */
+struct ipv6_stub {
+	int (*ipv6_sock_mc_join)(struct sock *sk, int ifindex,
+				 const struct in6_addr *addr);
+	int (*ipv6_sock_mc_drop)(struct sock *sk, int ifindex,
+				 const struct in6_addr *addr);
+	int (*ipv6_dst_lookup)(struct sock *sk, struct dst_entry **dst,
+				struct flowi6 *fl6);
+	void (*udpv6_encap_enable)(void);
+};
+extern const struct ipv6_stub *ipv6_stub __read_mostly;
+
 /*
  * identify MLD packets for MLD filter exceptions
  */
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index d2f87427244b..93b24c6bcae7 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -98,3 +98,6 @@ int inet6addr_notifier_call_chain(unsigned long val, void *v)
 	return atomic_notifier_call_chain(&inet6addr_chain, val, v);
 }
 EXPORT_SYMBOL(inet6addr_notifier_call_chain);
+
+const struct ipv6_stub *ipv6_stub __read_mostly;
+EXPORT_SYMBOL_GPL(ipv6_stub);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 0d1a9b153fbb..0c9c22f7487a 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -810,6 +810,13 @@ static struct pernet_operations inet6_net_ops = {
 	.exit = inet6_net_exit,
 };
 
+static const struct ipv6_stub ipv6_stub_impl = {
+	.ipv6_sock_mc_join = ipv6_sock_mc_join,
+	.ipv6_sock_mc_drop = ipv6_sock_mc_drop,
+	.ipv6_dst_lookup = ip6_dst_lookup,
+	.udpv6_encap_enable = udpv6_encap_enable,
+};
+
 static int __init inet6_init(void)
 {
 	struct list_head *r;
@@ -884,6 +891,9 @@ static int __init inet6_init(void)
 	err = igmp6_init();
 	if (err)
 		goto igmp_fail;
+
+	ipv6_stub = &ipv6_stub_impl;
+
 	err = ipv6_netfilter_init();
 	if (err)
 		goto netfilter_fail;
@@ -1040,6 +1050,7 @@ static void __exit inet6_exit(void)
 	raw6_proc_exit();
 #endif
 	ipv6_netfilter_fini();
+	ipv6_stub = NULL;
 	igmp6_cleanup();
 	ndisc_cleanup();
 	ip6_mr_cleanup();
-- 
cgit v1.2.3


From 034dfc5df99eb8d263211524983b1a737b25c06b Mon Sep 17 00:00:00 2001
From: Cong Wang <amwang@redhat.com>
Date: Sat, 31 Aug 2013 13:44:31 +0800
Subject: ipv6: export in6addr_loopback to modules

It is needed by vxlan module. Noticed by Mike.

Cc: Mike Rapoport <mike.rapoport@ravellosystems.com>
Signed-off-by: Cong Wang <amwang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c      |  9 ---------
 net/ipv6/addrconf_core.c | 16 ++++++++++++++++
 2 files changed, 16 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a7183fc9bbc2..619451336bf1 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -245,15 +245,6 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	.suppress_frag_ndisc	= 1,
 };
 
-/* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */
-const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT;
-const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT;
-const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
-const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT;
-const struct in6_addr in6addr_interfacelocal_allnodes = IN6ADDR_INTERFACELOCAL_ALLNODES_INIT;
-const struct in6_addr in6addr_interfacelocal_allrouters = IN6ADDR_INTERFACELOCAL_ALLROUTERS_INIT;
-const struct in6_addr in6addr_sitelocal_allrouters = IN6ADDR_SITELOCAL_ALLROUTERS_INIT;
-
 /* Check if a valid qdisc is available */
 static inline bool addrconf_qdisc_ok(const struct net_device *dev)
 {
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index 93b24c6bcae7..a864033d9e78 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -101,3 +101,19 @@ EXPORT_SYMBOL(inet6addr_notifier_call_chain);
 
 const struct ipv6_stub *ipv6_stub __read_mostly;
 EXPORT_SYMBOL_GPL(ipv6_stub);
+
+/* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */
+const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT;
+EXPORT_SYMBOL(in6addr_loopback);
+const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT;
+EXPORT_SYMBOL(in6addr_any);
+const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
+EXPORT_SYMBOL(in6addr_linklocal_allnodes);
+const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT;
+EXPORT_SYMBOL(in6addr_linklocal_allrouters);
+const struct in6_addr in6addr_interfacelocal_allnodes = IN6ADDR_INTERFACELOCAL_ALLNODES_INIT;
+EXPORT_SYMBOL(in6addr_interfacelocal_allnodes);
+const struct in6_addr in6addr_interfacelocal_allrouters = IN6ADDR_INTERFACELOCAL_ALLROUTERS_INIT;
+EXPORT_SYMBOL(in6addr_interfacelocal_allrouters);
+const struct in6_addr in6addr_sitelocal_allrouters = IN6ADDR_SITELOCAL_ALLROUTERS_INIT;
+EXPORT_SYMBOL(in6addr_sitelocal_allrouters);
-- 
cgit v1.2.3


From caf92bc4007036cfac8ee06c845fdfe6496e4fb3 Mon Sep 17 00:00:00 2001
From: Cong Wang <amwang@redhat.com>
Date: Sat, 31 Aug 2013 13:44:32 +0800
Subject: ipv6: do not call ndisc_send_rs() with write lock

Because vxlan module will call ip6_dst_lookup() in TX path,
which will hold write lock. So we have to release this write lock
before calling ndisc_send_rs(), otherwise could deadlock.

Reviewed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Cong Wang <amwang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 619451336bf1..baaaead69ee1 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3090,6 +3090,7 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 static void addrconf_rs_timer(unsigned long data)
 {
 	struct inet6_dev *idev = (struct inet6_dev *)data;
+	struct net_device *dev = idev->dev;
 	struct in6_addr lladdr;
 
 	write_lock(&idev->lock);
@@ -3104,12 +3105,14 @@ static void addrconf_rs_timer(unsigned long data)
 		goto out;
 
 	if (idev->rs_probes++ < idev->cnf.rtr_solicits) {
-		if (!__ipv6_get_lladdr(idev, &lladdr, IFA_F_TENTATIVE))
-			ndisc_send_rs(idev->dev, &lladdr,
+		write_unlock(&idev->lock);
+		if (!ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE))
+			ndisc_send_rs(dev, &lladdr,
 				      &in6addr_linklocal_allrouters);
 		else
-			goto out;
+			goto put;
 
+		write_lock(&idev->lock);
 		/* The wait after the last probe can be shorter */
 		addrconf_mod_rs_timer(idev, (idev->rs_probes ==
 					     idev->cnf.rtr_solicits) ?
@@ -3125,6 +3128,7 @@ static void addrconf_rs_timer(unsigned long data)
 
 out:
 	write_unlock(&idev->lock);
+put:
 	in6_dev_put(idev);
 }
 
-- 
cgit v1.2.3


From e4c7ed415387cf718ffbec305396c30cee092987 Mon Sep 17 00:00:00 2001
From: Cong Wang <amwang@redhat.com>
Date: Sat, 31 Aug 2013 13:44:33 +0800
Subject: vxlan: add ipv6 support

This patch adds IPv6 support to vxlan device, as the new version
RFC already mentions it:

   http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-03

Cc: David Stevens <dlstevens@us.ibm.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Cong Wang <amwang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c           | 764 ++++++++++++++++++++++++++++++++++--------
 include/net/vxlan.h           |   2 +-
 include/uapi/linux/if_link.h  |   2 +
 net/openvswitch/vport-vxlan.c |   2 +-
 4 files changed, 622 insertions(+), 148 deletions(-)

(limited to 'net')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 3b21aca0c0c2..faf131e02a72 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -6,9 +6,6 @@
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
- *
- * TODO
- *  - IPv6 (not in RFC)
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -43,6 +40,11 @@
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 #include <net/vxlan.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+#include <net/ip6_tunnel.h>
+#endif
 
 #define VXLAN_VERSION	"0.1"
 
@@ -59,6 +61,8 @@
 #define VXLAN_VID_MASK	(VXLAN_N_VID - 1)
 /* IP header + UDP + VXLAN + Ethernet header */
 #define VXLAN_HEADROOM (20 + 8 + 8 + 14)
+/* IPv6 header + UDP + VXLAN + Ethernet header */
+#define VXLAN6_HEADROOM (40 + 8 + 8 + 14)
 #define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr))
 
 #define VXLAN_FLAGS 0x08000000	/* struct vxlanhdr.vx_flags required value. */
@@ -92,8 +96,14 @@ struct vxlan_net {
 	spinlock_t	  sock_lock;
 };
 
+union vxlan_addr {
+	struct sockaddr_in sin;
+	struct sockaddr_in6 sin6;
+	struct sockaddr sa;
+};
+
 struct vxlan_rdst {
-	__be32			 remote_ip;
+	union vxlan_addr	 remote_ip;
 	__be16			 remote_port;
 	u32			 remote_vni;
 	u32			 remote_ifindex;
@@ -120,7 +130,7 @@ struct vxlan_dev {
 	struct vxlan_sock *vn_sock;	/* listening socket */
 	struct net_device *dev;
 	struct vxlan_rdst default_dst;	/* default destination */
-	__be32		  saddr;	/* source address */
+	union vxlan_addr  saddr;	/* source address */
 	__be16		  dst_port;
 	__u16		  port_min;	/* source port range */
 	__u16		  port_max;
@@ -146,6 +156,7 @@ struct vxlan_dev {
 #define VXLAN_F_RSC	0x04
 #define VXLAN_F_L2MISS	0x08
 #define VXLAN_F_L3MISS	0x10
+#define VXLAN_F_IPV6	0x20 /* internal flag */
 
 /* salt for hash table */
 static u32 vxlan_salt __read_mostly;
@@ -153,6 +164,96 @@ static struct workqueue_struct *vxlan_wq;
 
 static void vxlan_sock_work(struct work_struct *work);
 
+#if IS_ENABLED(CONFIG_IPV6)
+static inline
+bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
+{
+       if (a->sa.sa_family != b->sa.sa_family)
+               return false;
+       if (a->sa.sa_family == AF_INET6)
+               return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr);
+       else
+               return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
+}
+
+static inline bool vxlan_addr_any(const union vxlan_addr *ipa)
+{
+       if (ipa->sa.sa_family == AF_INET6)
+               return ipv6_addr_any(&ipa->sin6.sin6_addr);
+       else
+               return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
+}
+
+static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa)
+{
+       if (ipa->sa.sa_family == AF_INET6)
+               return ipv6_addr_is_multicast(&ipa->sin6.sin6_addr);
+       else
+               return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
+}
+
+static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
+{
+       if (nla_len(nla) >= sizeof(struct in6_addr)) {
+               nla_memcpy(&ip->sin6.sin6_addr, nla, sizeof(struct in6_addr));
+               ip->sa.sa_family = AF_INET6;
+               return 0;
+       } else if (nla_len(nla) >= sizeof(__be32)) {
+               ip->sin.sin_addr.s_addr = nla_get_be32(nla);
+               ip->sa.sa_family = AF_INET;
+               return 0;
+       } else {
+               return -EAFNOSUPPORT;
+       }
+}
+
+static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
+                             const union vxlan_addr *ip)
+{
+       if (ip->sa.sa_family == AF_INET6)
+               return nla_put(skb, attr, sizeof(struct in6_addr), &ip->sin6.sin6_addr);
+       else
+               return nla_put_be32(skb, attr, ip->sin.sin_addr.s_addr);
+}
+
+#else /* !CONFIG_IPV6 */
+
+static inline
+bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
+{
+       return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
+}
+
+static inline bool vxlan_addr_any(const union vxlan_addr *ipa)
+{
+       return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
+}
+
+static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa)
+{
+       return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
+}
+
+static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
+{
+       if (nla_len(nla) >= sizeof(struct in6_addr)) {
+               return -EAFNOSUPPORT;
+       } else if (nla_len(nla) >= sizeof(__be32)) {
+               ip->sin.sin_addr.s_addr = nla_get_be32(nla);
+               ip->sa.sa_family = AF_INET;
+               return 0;
+       } else {
+               return -EAFNOSUPPORT;
+       }
+}
+
+static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
+                             const union vxlan_addr *ip)
+{
+       return nla_put_be32(skb, attr, ip->sin.sin_addr.s_addr);
+}
+#endif
+
 /* Virtual Network hash table head */
 static inline struct hlist_head *vni_head(struct vxlan_sock *vs, u32 id)
 {
@@ -239,7 +340,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
 
 	if (type == RTM_GETNEIGH) {
 		ndm->ndm_family	= AF_INET;
-		send_ip = rdst->remote_ip != htonl(INADDR_ANY);
+		send_ip = !vxlan_addr_any(&rdst->remote_ip);
 		send_eth = !is_zero_ether_addr(fdb->eth_addr);
 	} else
 		ndm->ndm_family	= AF_BRIDGE;
@@ -251,7 +352,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
 	if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
 		goto nla_put_failure;
 
-	if (send_ip && nla_put_be32(skb, NDA_DST, rdst->remote_ip))
+	if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip))
 		goto nla_put_failure;
 
 	if (rdst->remote_port && rdst->remote_port != vxlan->dst_port &&
@@ -283,7 +384,7 @@ static inline size_t vxlan_nlmsg_size(void)
 {
 	return NLMSG_ALIGN(sizeof(struct ndmsg))
 		+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
-		+ nla_total_size(sizeof(__be32)) /* NDA_DST */
+		+ nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */
 		+ nla_total_size(sizeof(__be16)) /* NDA_PORT */
 		+ nla_total_size(sizeof(__be32)) /* NDA_VNI */
 		+ nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */
@@ -317,14 +418,14 @@ errout:
 		rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
 }
 
-static void vxlan_ip_miss(struct net_device *dev, __be32 ipa)
+static void vxlan_ip_miss(struct net_device *dev, union vxlan_addr *ipa)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct vxlan_fdb f = {
 		.state = NUD_STALE,
 	};
 	struct vxlan_rdst remote = {
-		.remote_ip = ipa, /* goes to NDA_DST */
+		.remote_ip = *ipa, /* goes to NDA_DST */
 		.remote_vni = VXLAN_N_VID,
 	};
 
@@ -397,13 +498,13 @@ static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
 
 /* caller should hold vxlan->hash_lock */
 static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f,
-					      __be32 ip, __be16 port,
+					      union vxlan_addr *ip, __be16 port,
 					      __u32 vni, __u32 ifindex)
 {
 	struct vxlan_rdst *rd;
 
 	list_for_each_entry(rd, &f->remotes, list) {
-		if (rd->remote_ip == ip &&
+		if (vxlan_addr_equal(&rd->remote_ip, ip) &&
 		    rd->remote_port == port &&
 		    rd->remote_vni == vni &&
 		    rd->remote_ifindex == ifindex)
@@ -415,7 +516,7 @@ static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f,
 
 /* Replace destination of unicast mac */
 static int vxlan_fdb_replace(struct vxlan_fdb *f,
-			    __be32 ip, __be16 port, __u32 vni, __u32 ifindex)
+			     union vxlan_addr *ip, __be16 port, __u32 vni, __u32 ifindex)
 {
 	struct vxlan_rdst *rd;
 
@@ -426,7 +527,7 @@ static int vxlan_fdb_replace(struct vxlan_fdb *f,
 	rd = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list);
 	if (!rd)
 		return 0;
-	rd->remote_ip = ip;
+	rd->remote_ip = *ip;
 	rd->remote_port = port;
 	rd->remote_vni = vni;
 	rd->remote_ifindex = ifindex;
@@ -435,7 +536,7 @@ static int vxlan_fdb_replace(struct vxlan_fdb *f,
 
 /* Add/update destinations for multicast */
 static int vxlan_fdb_append(struct vxlan_fdb *f,
-			    __be32 ip, __be16 port, __u32 vni, __u32 ifindex)
+			    union vxlan_addr *ip, __be16 port, __u32 vni, __u32 ifindex)
 {
 	struct vxlan_rdst *rd;
 
@@ -446,7 +547,7 @@ static int vxlan_fdb_append(struct vxlan_fdb *f,
 	rd = kmalloc(sizeof(*rd), GFP_ATOMIC);
 	if (rd == NULL)
 		return -ENOBUFS;
-	rd->remote_ip = ip;
+	rd->remote_ip = *ip;
 	rd->remote_port = port;
 	rd->remote_vni = vni;
 	rd->remote_ifindex = ifindex;
@@ -458,7 +559,7 @@ static int vxlan_fdb_append(struct vxlan_fdb *f,
 
 /* Add new entry to forwarding table -- assumes lock held */
 static int vxlan_fdb_create(struct vxlan_dev *vxlan,
-			    const u8 *mac, __be32 ip,
+			    const u8 *mac, union vxlan_addr *ip,
 			    __u16 state, __u16 flags,
 			    __be16 port, __u32 vni, __u32 ifindex,
 			    __u8 ndm_flags)
@@ -517,7 +618,7 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan,
 		    (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac)))
 			return -EOPNOTSUPP;
 
-		netdev_dbg(vxlan->dev, "add %pM -> %pI4\n", mac, &ip);
+		netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
 		f = kmalloc(sizeof(*f), GFP_ATOMIC);
 		if (!f)
 			return -ENOMEM;
@@ -565,17 +666,26 @@ static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f)
 }
 
 static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
-			   __be32 *ip, __be16 *port, u32 *vni, u32 *ifindex)
+			   union vxlan_addr *ip, __be16 *port, u32 *vni, u32 *ifindex)
 {
 	struct net *net = dev_net(vxlan->dev);
+	int err;
 
 	if (tb[NDA_DST]) {
-		if (nla_len(tb[NDA_DST]) != sizeof(__be32))
-			return -EAFNOSUPPORT;
-
-		*ip = nla_get_be32(tb[NDA_DST]);
+		err = vxlan_nla_get_addr(ip, tb[NDA_DST]);
+		if (err)
+			return err;
 	} else {
-		*ip = htonl(INADDR_ANY);
+		union vxlan_addr *remote = &vxlan->default_dst.remote_ip;
+		if (remote->sa.sa_family == AF_INET) {
+			ip->sin.sin_addr.s_addr = htonl(INADDR_ANY);
+			ip->sa.sa_family = AF_INET;
+#if IS_ENABLED(CONFIG_IPV6)
+		} else {
+			ip->sin6.sin6_addr = in6addr_any;
+			ip->sa.sa_family = AF_INET6;
+#endif
+		}
 	}
 
 	if (tb[NDA_PORT]) {
@@ -618,7 +728,7 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	/* struct net *net = dev_net(vxlan->dev); */
-	__be32 ip;
+	union vxlan_addr ip;
 	__be16 port;
 	u32 vni, ifindex;
 	int err;
@@ -637,7 +747,7 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 		return err;
 
 	spin_lock_bh(&vxlan->hash_lock);
-	err = vxlan_fdb_create(vxlan, addr, ip, ndm->ndm_state, flags,
+	err = vxlan_fdb_create(vxlan, addr, &ip, ndm->ndm_state, flags,
 			       port, vni, ifindex, ndm->ndm_flags);
 	spin_unlock_bh(&vxlan->hash_lock);
 
@@ -652,7 +762,7 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct vxlan_fdb *f;
 	struct vxlan_rdst *rd = NULL;
-	__be32 ip;
+	union vxlan_addr ip;
 	__be16 port;
 	u32 vni, ifindex;
 	int err;
@@ -668,8 +778,8 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 	if (!f)
 		goto out;
 
-	if (ip != htonl(INADDR_ANY)) {
-		rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
+	if (!vxlan_addr_any(&ip)) {
+		rd = vxlan_fdb_find_rdst(f, &ip, port, vni, ifindex);
 		if (!rd)
 			goto out;
 	}
@@ -732,7 +842,7 @@ out:
  * Return true if packet is bogus and should be droppped.
  */
 static bool vxlan_snoop(struct net_device *dev,
-			__be32 src_ip, const u8 *src_mac)
+			union vxlan_addr *src_ip, const u8 *src_mac)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct vxlan_fdb *f;
@@ -741,7 +851,7 @@ static bool vxlan_snoop(struct net_device *dev,
 	if (likely(f)) {
 		struct vxlan_rdst *rdst = first_remote_rcu(f);
 
-		if (likely(rdst->remote_ip == src_ip))
+		if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip)))
 			return false;
 
 		/* Don't migrate static entries, drop packets */
@@ -750,10 +860,10 @@ static bool vxlan_snoop(struct net_device *dev,
 
 		if (net_ratelimit())
 			netdev_info(dev,
-				    "%pM migrated from %pI4 to %pI4\n",
+				    "%pM migrated from %pIS to %pIS\n",
 				    src_mac, &rdst->remote_ip, &src_ip);
 
-		rdst->remote_ip = src_ip;
+		rdst->remote_ip = *src_ip;
 		f->updated = jiffies;
 		vxlan_fdb_notify(vxlan, f, RTM_NEWNEIGH);
 	} else {
@@ -775,7 +885,7 @@ static bool vxlan_snoop(struct net_device *dev,
 }
 
 /* See if multicast group is already in use by other ID */
-static bool vxlan_group_used(struct vxlan_net *vn, __be32 remote_ip)
+static bool vxlan_group_used(struct vxlan_net *vn, union vxlan_addr *remote_ip)
 {
 	struct vxlan_dev *vxlan;
 
@@ -783,7 +893,8 @@ static bool vxlan_group_used(struct vxlan_net *vn, __be32 remote_ip)
 		if (!netif_running(vxlan->dev))
 			continue;
 
-		if (vxlan->default_dst.remote_ip == remote_ip)
+		if (vxlan_addr_equal(&vxlan->default_dst.remote_ip,
+				     remote_ip))
 			return true;
 	}
 
@@ -819,13 +930,23 @@ static void vxlan_igmp_join(struct work_struct *work)
 	struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_join);
 	struct vxlan_sock *vs = vxlan->vn_sock;
 	struct sock *sk = vs->sock->sk;
-	struct ip_mreqn mreq = {
-		.imr_multiaddr.s_addr	= vxlan->default_dst.remote_ip,
-		.imr_ifindex		= vxlan->default_dst.remote_ifindex,
-	};
+	union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
+	int ifindex = vxlan->default_dst.remote_ifindex;
 
 	lock_sock(sk);
-	ip_mc_join_group(sk, &mreq);
+	if (ip->sa.sa_family == AF_INET) {
+		struct ip_mreqn mreq = {
+			.imr_multiaddr.s_addr	= ip->sin.sin_addr.s_addr,
+			.imr_ifindex		= ifindex,
+		};
+
+		ip_mc_join_group(sk, &mreq);
+#if IS_ENABLED(CONFIG_IPV6)
+	} else {
+		ipv6_stub->ipv6_sock_mc_join(sk, ifindex,
+					     &ip->sin6.sin6_addr);
+#endif
+	}
 	release_sock(sk);
 
 	vxlan_sock_release(vs);
@@ -838,13 +959,24 @@ static void vxlan_igmp_leave(struct work_struct *work)
 	struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_leave);
 	struct vxlan_sock *vs = vxlan->vn_sock;
 	struct sock *sk = vs->sock->sk;
-	struct ip_mreqn mreq = {
-		.imr_multiaddr.s_addr	= vxlan->default_dst.remote_ip,
-		.imr_ifindex		= vxlan->default_dst.remote_ifindex,
-	};
+	union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
+	int ifindex = vxlan->default_dst.remote_ifindex;
 
 	lock_sock(sk);
-	ip_mc_leave_group(sk, &mreq);
+	if (ip->sa.sa_family == AF_INET) {
+		struct ip_mreqn mreq = {
+			.imr_multiaddr.s_addr	= ip->sin.sin_addr.s_addr,
+			.imr_ifindex		= ifindex,
+		};
+
+		ip_mc_leave_group(sk, &mreq);
+#if IS_ENABLED(CONFIG_IPV6)
+	} else {
+		ipv6_stub->ipv6_sock_mc_drop(sk, ifindex,
+					     &ip->sin6.sin6_addr);
+#endif
+	}
+
 	release_sock(sk);
 
 	vxlan_sock_release(vs);
@@ -896,11 +1028,14 @@ error:
 static void vxlan_rcv(struct vxlan_sock *vs,
 		      struct sk_buff *skb, __be32 vx_vni)
 {
-	struct iphdr *oip;
+	struct iphdr *oip = NULL;
+	struct ipv6hdr *oip6 = NULL;
 	struct vxlan_dev *vxlan;
 	struct pcpu_tstats *stats;
+	union vxlan_addr saddr;
 	__u32 vni;
-	int err;
+	int err = 0;
+	union vxlan_addr *remote_ip;
 
 	vni = ntohl(vx_vni) >> 8;
 	/* Is this VNI defined? */
@@ -908,6 +1043,7 @@ static void vxlan_rcv(struct vxlan_sock *vs,
 	if (!vxlan)
 		goto drop;
 
+	remote_ip = &vxlan->default_dst.remote_ip;
 	skb_reset_mac_header(skb);
 	skb->protocol = eth_type_trans(skb, vxlan->dev);
 
@@ -917,9 +1053,20 @@ static void vxlan_rcv(struct vxlan_sock *vs,
 		goto drop;
 
 	/* Re-examine inner Ethernet packet */
-	oip = ip_hdr(skb);
+	if (remote_ip->sa.sa_family == AF_INET) {
+		oip = ip_hdr(skb);
+		saddr.sin.sin_addr.s_addr = oip->saddr;
+		saddr.sa.sa_family = AF_INET;
+#if IS_ENABLED(CONFIG_IPV6)
+	} else {
+		oip6 = ipv6_hdr(skb);
+		saddr.sin6.sin6_addr = oip6->saddr;
+		saddr.sa.sa_family = AF_INET6;
+#endif
+	}
+
 	if ((vxlan->flags & VXLAN_F_LEARN) &&
-	    vxlan_snoop(skb->dev, oip->saddr, eth_hdr(skb)->h_source))
+	    vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source))
 		goto drop;
 
 	skb_reset_network_header(skb);
@@ -935,11 +1082,20 @@ static void vxlan_rcv(struct vxlan_sock *vs,
 
 	skb->encapsulation = 0;
 
-	err = IP_ECN_decapsulate(oip, skb);
+	if (oip6)
+		err = IP6_ECN_decapsulate(oip6, skb);
+	if (oip)
+		err = IP_ECN_decapsulate(oip, skb);
+
 	if (unlikely(err)) {
-		if (log_ecn_error)
-			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
-					     &oip->saddr, oip->tos);
+		if (log_ecn_error) {
+			if (oip6)
+				net_info_ratelimited("non-ECT from %pI6\n",
+						     &oip6->saddr);
+			if (oip)
+				net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
+						     &oip->saddr, oip->tos);
+		}
 		if (err > 1) {
 			++vxlan->dev->stats.rx_frame_errors;
 			++vxlan->dev->stats.rx_errors;
@@ -1009,7 +1165,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb)
 		}
 
 		f = vxlan_find_mac(vxlan, n->ha);
-		if (f && first_remote_rcu(f)->remote_ip == htonl(INADDR_ANY)) {
+		if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
 			/* bridge-local neighbor */
 			neigh_release(n);
 			goto out;
@@ -1027,8 +1183,14 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb)
 
 		if (netif_rx_ni(reply) == NET_RX_DROP)
 			dev->stats.rx_dropped++;
-	} else if (vxlan->flags & VXLAN_F_L3MISS)
-		vxlan_ip_miss(dev, tip);
+	} else if (vxlan->flags & VXLAN_F_L3MISS) {
+		union vxlan_addr ipa = {
+			.sin.sin_addr.s_addr = tip,
+			.sa.sa_family = AF_INET,
+		};
+
+		vxlan_ip_miss(dev, &ipa);
+	}
 out:
 	consume_skb(skb);
 	return NETDEV_TX_OK;
@@ -1050,6 +1212,16 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
 			return false;
 		pip = ip_hdr(skb);
 		n = neigh_lookup(&arp_tbl, &pip->daddr, dev);
+		if (!n && (vxlan->flags & VXLAN_F_L3MISS)) {
+			union vxlan_addr ipa = {
+				.sin.sin_addr.s_addr = pip->daddr,
+				.sa.sa_family = AF_INET,
+			};
+
+			vxlan_ip_miss(dev, &ipa);
+			return false;
+		}
+
 		break;
 	default:
 		return false;
@@ -1066,8 +1238,8 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
 		}
 		neigh_release(n);
 		return diff;
-	} else if (vxlan->flags & VXLAN_F_L3MISS)
-		vxlan_ip_miss(dev, pip->daddr);
+	}
+
 	return false;
 }
 
@@ -1118,6 +1290,102 @@ static int handle_offloads(struct sk_buff *skb)
 	return 0;
 }
 
+#if IS_ENABLED(CONFIG_IPV6)
+static int vxlan6_xmit_skb(struct net *net, struct vxlan_sock *vs,
+			   struct dst_entry *dst, struct sk_buff *skb,
+			   struct net_device *dev, struct in6_addr *saddr,
+			   struct in6_addr *daddr, __u8 prio, __u8 ttl,
+			   __be16 src_port, __be16 dst_port, __be32 vni)
+{
+	struct ipv6hdr *ip6h;
+	struct vxlanhdr *vxh;
+	struct udphdr *uh;
+	int min_headroom;
+	int err;
+
+	if (!skb->encapsulation) {
+		skb_reset_inner_headers(skb);
+		skb->encapsulation = 1;
+	}
+
+	min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
+			+ VXLAN_HLEN + sizeof(struct ipv6hdr)
+			+ (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
+
+	/* Need space for new headers (invalidates iph ptr) */
+	err = skb_cow_head(skb, min_headroom);
+	if (unlikely(err))
+		return err;
+
+	if (vlan_tx_tag_present(skb)) {
+		if (WARN_ON(!__vlan_put_tag(skb,
+					    skb->vlan_proto,
+					    vlan_tx_tag_get(skb))))
+			return -ENOMEM;
+
+		skb->vlan_tci = 0;
+	}
+
+	vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
+	vxh->vx_flags = htonl(VXLAN_FLAGS);
+	vxh->vx_vni = vni;
+
+	__skb_push(skb, sizeof(*uh));
+	skb_reset_transport_header(skb);
+	uh = udp_hdr(skb);
+
+	uh->dest = dst_port;
+	uh->source = src_port;
+
+	uh->len = htons(skb->len);
+	uh->check = 0;
+
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
+			      IPSKB_REROUTED);
+	skb_dst_drop(skb);
+	skb_dst_set(skb, dst);
+
+	if (!skb_is_gso(skb) && !(dst->dev->features & NETIF_F_IPV6_CSUM)) {
+		__wsum csum = skb_checksum(skb, 0, skb->len, 0);
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+		uh->check = csum_ipv6_magic(saddr, daddr, skb->len,
+					    IPPROTO_UDP, csum);
+		if (uh->check == 0)
+			uh->check = CSUM_MANGLED_0;
+	} else {
+		skb->ip_summed = CHECKSUM_PARTIAL;
+		skb->csum_start = skb_transport_header(skb) - skb->head;
+		skb->csum_offset = offsetof(struct udphdr, check);
+		uh->check = ~csum_ipv6_magic(saddr, daddr,
+					     skb->len, IPPROTO_UDP, 0);
+	}
+
+	__skb_push(skb, sizeof(*ip6h));
+	skb_reset_network_header(skb);
+	ip6h		  = ipv6_hdr(skb);
+	ip6h->version	  = 6;
+	ip6h->priority	  = prio;
+	ip6h->flow_lbl[0] = 0;
+	ip6h->flow_lbl[1] = 0;
+	ip6h->flow_lbl[2] = 0;
+	ip6h->payload_len = htons(skb->len);
+	ip6h->nexthdr     = IPPROTO_UDP;
+	ip6h->hop_limit   = ttl;
+	ip6h->daddr	  = *daddr;
+	ip6h->saddr	  = *saddr;
+
+	vxlan_set_owner(vs->sock->sk, skb);
+
+	err = handle_offloads(skb);
+	if (err)
+		return err;
+
+	ip6tunnel_xmit(skb, dev);
+	return 0;
+}
+#endif
+
 int vxlan_xmit_skb(struct net *net, struct vxlan_sock *vs,
 		   struct rtable *rt, struct sk_buff *skb,
 		   __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
@@ -1182,15 +1450,26 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
 {
 	struct pcpu_tstats *tx_stats = this_cpu_ptr(src_vxlan->dev->tstats);
 	struct pcpu_tstats *rx_stats = this_cpu_ptr(dst_vxlan->dev->tstats);
+	union vxlan_addr loopback;
+	union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip;
 
 	skb->pkt_type = PACKET_HOST;
 	skb->encapsulation = 0;
 	skb->dev = dst_vxlan->dev;
 	__skb_pull(skb, skb_network_offset(skb));
 
+	if (remote_ip->sa.sa_family == AF_INET) {
+		loopback.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+		loopback.sa.sa_family =  AF_INET;
+#if IS_ENABLED(CONFIG_IPV6)
+	} else {
+		loopback.sin6.sin6_addr = in6addr_loopback;
+		loopback.sa.sa_family =  AF_INET6;
+#endif
+	}
+
 	if (dst_vxlan->flags & VXLAN_F_LEARN)
-		vxlan_snoop(skb->dev, htonl(INADDR_LOOPBACK),
-			    eth_hdr(skb)->h_source);
+		vxlan_snoop(skb->dev, &loopback, eth_hdr(skb)->h_source);
 
 	u64_stats_update_begin(&tx_stats->syncp);
 	tx_stats->tx_packets++;
@@ -1211,11 +1490,11 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 			   struct vxlan_rdst *rdst, bool did_rsc)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
-	struct rtable *rt;
+	struct rtable *rt = NULL;
 	const struct iphdr *old_iph;
 	struct flowi4 fl4;
-	__be32 dst;
-	__be16 src_port, dst_port;
+	union vxlan_addr *dst;
+	__be16 src_port = 0, dst_port;
 	u32 vni;
 	__be16 df = 0;
 	__u8 tos, ttl;
@@ -1223,9 +1502,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 
 	dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port;
 	vni = rdst->remote_vni;
-	dst = rdst->remote_ip;
+	dst = &rdst->remote_ip;
 
-	if (!dst) {
+	if (vxlan_addr_any(dst)) {
 		if (did_rsc) {
 			/* short-circuited back to local bridge */
 			vxlan_encap_bypass(skb, vxlan, vxlan);
@@ -1237,7 +1516,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 	old_iph = ip_hdr(skb);
 
 	ttl = vxlan->ttl;
-	if (!ttl && IN_MULTICAST(ntohl(dst)))
+	if (!ttl && vxlan_addr_multicast(dst))
 		ttl = 1;
 
 	tos = vxlan->tos;
@@ -1246,48 +1525,101 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 
 	src_port = vxlan_src_port(vxlan->port_min, vxlan->port_max, skb);
 
-	memset(&fl4, 0, sizeof(fl4));
-	fl4.flowi4_oif = rdst->remote_ifindex;
-	fl4.flowi4_tos = RT_TOS(tos);
-	fl4.daddr = dst;
-	fl4.saddr = vxlan->saddr;
-
-	rt = ip_route_output_key(dev_net(dev), &fl4);
-	if (IS_ERR(rt)) {
-		netdev_dbg(dev, "no route to %pI4\n", &dst);
-		dev->stats.tx_carrier_errors++;
-		goto tx_error;
-	}
+	if (dst->sa.sa_family == AF_INET) {
+		memset(&fl4, 0, sizeof(fl4));
+		fl4.flowi4_oif = rdst->remote_ifindex;
+		fl4.flowi4_tos = RT_TOS(tos);
+		fl4.daddr = dst->sin.sin_addr.s_addr;
+		fl4.saddr = vxlan->saddr.sin.sin_addr.s_addr;
+
+		rt = ip_route_output_key(dev_net(dev), &fl4);
+		if (IS_ERR(rt)) {
+			netdev_dbg(dev, "no route to %pI4\n",
+				   &dst->sin.sin_addr.s_addr);
+			dev->stats.tx_carrier_errors++;
+			goto tx_error;
+		}
 
-	if (rt->dst.dev == dev) {
-		netdev_dbg(dev, "circular route to %pI4\n", &dst);
-		dev->stats.collisions++;
-		goto rt_tx_error;
-	}
+		if (rt->dst.dev == dev) {
+			netdev_dbg(dev, "circular route to %pI4\n",
+				   &dst->sin.sin_addr.s_addr);
+			dev->stats.collisions++;
+			goto tx_error;
+		}
+
+		/* Bypass encapsulation if the destination is local */
+		if (rt->rt_flags & RTCF_LOCAL &&
+		    !(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
+			struct vxlan_dev *dst_vxlan;
+
+			ip_rt_put(rt);
+			dst_vxlan = vxlan_find_vni(dev_net(dev), vni, dst_port);
+			if (!dst_vxlan)
+				goto tx_error;
+			vxlan_encap_bypass(skb, vxlan, dst_vxlan);
+			return;
+		}
 
-	/* Bypass encapsulation if the destination is local */
-	if (rt->rt_flags & RTCF_LOCAL &&
-	    !(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
-		struct vxlan_dev *dst_vxlan;
+		tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
+		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
 
-		ip_rt_put(rt);
-		dst_vxlan = vxlan_find_vni(dev_net(dev), vni, dst_port);
-		if (!dst_vxlan)
+		err = vxlan_xmit_skb(dev_net(dev), vxlan->vn_sock, rt, skb,
+				     fl4.saddr, dst->sin.sin_addr.s_addr,
+				     tos, ttl, df, src_port, dst_port,
+				     htonl(vni << 8));
+
+		if (err < 0)
+			goto rt_tx_error;
+		iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
+#if IS_ENABLED(CONFIG_IPV6)
+	} else {
+		struct sock *sk = vxlan->vn_sock->sock->sk;
+		struct dst_entry *ndst;
+		struct flowi6 fl6;
+		u32 flags;
+
+		memset(&fl6, 0, sizeof(fl6));
+		fl6.flowi6_oif = rdst->remote_ifindex;
+		fl6.daddr = dst->sin6.sin6_addr;
+		fl6.saddr = vxlan->saddr.sin6.sin6_addr;
+		fl6.flowi6_proto = skb->protocol;
+
+		if (ipv6_stub->ipv6_dst_lookup(sk, &ndst, &fl6)) {
+			netdev_dbg(dev, "no route to %pI6\n",
+				   &dst->sin6.sin6_addr);
+			dev->stats.tx_carrier_errors++;
 			goto tx_error;
-		vxlan_encap_bypass(skb, vxlan, dst_vxlan);
-		return;
-	}
+		}
 
-	tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
-	ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
+		if (ndst->dev == dev) {
+			netdev_dbg(dev, "circular route to %pI6\n",
+				   &dst->sin6.sin6_addr);
+			dst_release(ndst);
+			dev->stats.collisions++;
+			goto tx_error;
+		}
 
-	err = vxlan_xmit_skb(dev_net(dev), vxlan->vn_sock, rt, skb,
-			     fl4.saddr, dst, tos, ttl, df,
-			     src_port, dst_port, htonl(vni << 8));
+		/* Bypass encapsulation if the destination is local */
+		flags = ((struct rt6_info *)ndst)->rt6i_flags;
+		if (flags & RTF_LOCAL &&
+		    !(flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
+			struct vxlan_dev *dst_vxlan;
+
+			dst_release(ndst);
+			dst_vxlan = vxlan_find_vni(dev_net(dev), vni, dst_port);
+			if (!dst_vxlan)
+				goto tx_error;
+			vxlan_encap_bypass(skb, vxlan, dst_vxlan);
+			return;
+		}
 
-	if (err < 0)
-		goto rt_tx_error;
-	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
+		ttl = ttl ? : ip6_dst_hoplimit(ndst);
+
+		err = vxlan6_xmit_skb(dev_net(dev), vxlan->vn_sock, ndst, skb,
+				      dev, &fl6.saddr, &fl6.daddr, 0, ttl,
+				      src_port, dst_port, htonl(vni << 8));
+#endif
+	}
 
 	return;
 
@@ -1464,8 +1796,8 @@ static int vxlan_open(struct net_device *dev)
 	if (!vs)
 		return -ENOTCONN;
 
-	if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip)) &&
-	    vxlan_group_used(vn, vxlan->default_dst.remote_ip)) {
+	if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip) &&
+	    vxlan_group_used(vn, &vxlan->default_dst.remote_ip)) {
 		vxlan_sock_hold(vs);
 		dev_hold(dev);
 		queue_work(vxlan_wq, &vxlan->igmp_join);
@@ -1503,8 +1835,8 @@ static int vxlan_stop(struct net_device *dev)
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct vxlan_sock *vs = vxlan->vn_sock;
 
-	if (vs && IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip)) &&
-	    ! vxlan_group_used(vn, vxlan->default_dst.remote_ip)) {
+	if (vs && vxlan_addr_multicast(&vxlan->default_dst.remote_ip) &&
+	    ! vxlan_group_used(vn, &vxlan->default_dst.remote_ip)) {
 		vxlan_sock_hold(vs);
 		dev_hold(dev);
 		queue_work(vxlan_wq, &vxlan->igmp_leave);
@@ -1552,7 +1884,10 @@ static void vxlan_setup(struct net_device *dev)
 
 	eth_hw_addr_random(dev);
 	ether_setup(dev);
-	dev->hard_header_len = ETH_HLEN + VXLAN_HEADROOM;
+	if (vxlan->default_dst.remote_ip.sa.sa_family == AF_INET6)
+		dev->hard_header_len = ETH_HLEN + VXLAN6_HEADROOM;
+	else
+		dev->hard_header_len = ETH_HLEN + VXLAN_HEADROOM;
 
 	dev->netdev_ops = &vxlan_netdev_ops;
 	dev->destructor = free_netdev;
@@ -1597,8 +1932,10 @@ static void vxlan_setup(struct net_device *dev)
 static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
 	[IFLA_VXLAN_ID]		= { .type = NLA_U32 },
 	[IFLA_VXLAN_GROUP]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
+	[IFLA_VXLAN_GROUP6]	= { .len = sizeof(struct in6_addr) },
 	[IFLA_VXLAN_LINK]	= { .type = NLA_U32 },
 	[IFLA_VXLAN_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
+	[IFLA_VXLAN_LOCAL6]	= { .len = sizeof(struct in6_addr) },
 	[IFLA_VXLAN_TOS]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_TTL]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_LEARNING]	= { .type = NLA_U8 },
@@ -1669,58 +2006,132 @@ static void vxlan_del_work(struct work_struct *work)
 	kfree_rcu(vs, rcu);
 }
 
-static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port,
-					      vxlan_rcv_t *rcv, void *data)
+#if IS_ENABLED(CONFIG_IPV6)
+/* Create UDP socket for encapsulation receive. AF_INET6 socket
+ * could be used for both IPv4 and IPv6 communications, but
+ * users may set bindv6only=1.
+ */
+static int create_v6_sock(struct net *net, __be16 port, struct socket **psock)
+{
+	struct sock *sk;
+	struct socket *sock;
+	struct sockaddr_in6 vxlan_addr = {
+		.sin6_family = AF_INET6,
+		.sin6_port = port,
+	};
+	int rc, val = 1;
+
+	rc = sock_create_kern(AF_INET6, SOCK_DGRAM, IPPROTO_UDP, &sock);
+	if (rc < 0) {
+		pr_debug("UDPv6 socket create failed\n");
+		return rc;
+	}
+
+	/* Put in proper namespace */
+	sk = sock->sk;
+	sk_change_net(sk, net);
+
+	kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY,
+			  (char *)&val, sizeof(val));
+	rc = kernel_bind(sock, (struct sockaddr *)&vxlan_addr,
+			 sizeof(struct sockaddr_in6));
+	if (rc < 0) {
+		pr_debug("bind for UDPv6 socket %pI6:%u (%d)\n",
+			 &vxlan_addr.sin6_addr, ntohs(vxlan_addr.sin6_port), rc);
+		sk_release_kernel(sk);
+		return rc;
+	}
+	/* At this point, IPv6 module should have been loaded in
+	 * sock_create_kern().
+	 */
+	BUG_ON(!ipv6_stub);
+
+	*psock = sock;
+	/* Disable multicast loopback */
+	inet_sk(sk)->mc_loop = 0;
+	return 0;
+}
+
+#else
+
+static int create_v6_sock(struct net *net, __be16 port, struct socket **psock)
+{
+		return -EPFNOSUPPORT;
+}
+#endif
+
+static int create_v4_sock(struct net *net, __be16 port, struct socket **psock)
 {
-	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
-	struct vxlan_sock *vs;
 	struct sock *sk;
+	struct socket *sock;
 	struct sockaddr_in vxlan_addr = {
 		.sin_family = AF_INET,
 		.sin_addr.s_addr = htonl(INADDR_ANY),
 		.sin_port = port,
 	};
 	int rc;
-	unsigned int h;
-
-	vs = kmalloc(sizeof(*vs), GFP_KERNEL);
-	if (!vs) {
-		pr_debug("memory alocation failure\n");
-		return ERR_PTR(-ENOMEM);
-	}
-
-	for (h = 0; h < VNI_HASH_SIZE; ++h)
-		INIT_HLIST_HEAD(&vs->vni_list[h]);
-
-	INIT_WORK(&vs->del_work, vxlan_del_work);
 
 	/* Create UDP socket for encapsulation receive. */
-	rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &vs->sock);
+	rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
 	if (rc < 0) {
 		pr_debug("UDP socket create failed\n");
-		kfree(vs);
-		return ERR_PTR(rc);
+		return rc;
 	}
 
 	/* Put in proper namespace */
-	sk = vs->sock->sk;
+	sk = sock->sk;
 	sk_change_net(sk, net);
 
-	rc = kernel_bind(vs->sock, (struct sockaddr *) &vxlan_addr,
+	rc = kernel_bind(sock, (struct sockaddr *) &vxlan_addr,
 			 sizeof(vxlan_addr));
 	if (rc < 0) {
 		pr_debug("bind for UDP socket %pI4:%u (%d)\n",
 			 &vxlan_addr.sin_addr, ntohs(vxlan_addr.sin_port), rc);
 		sk_release_kernel(sk);
+		return rc;
+	}
+
+	*psock = sock;
+	/* Disable multicast loopback */
+	inet_sk(sk)->mc_loop = 0;
+	return 0;
+}
+
+/* Create new listen socket if needed */
+static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port,
+					      vxlan_rcv_t *rcv, void *data, bool ipv6)
+{
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+	struct vxlan_sock *vs;
+	struct socket *sock;
+	struct sock *sk;
+	int rc = 0;
+	unsigned int h;
+
+	vs = kmalloc(sizeof(*vs), GFP_KERNEL);
+	if (!vs)
+		return ERR_PTR(-ENOMEM);
+
+	for (h = 0; h < VNI_HASH_SIZE; ++h)
+		INIT_HLIST_HEAD(&vs->vni_list[h]);
+
+	INIT_WORK(&vs->del_work, vxlan_del_work);
+
+	if (ipv6)
+		rc = create_v6_sock(net, port, &sock);
+	else
+		rc = create_v4_sock(net, port, &sock);
+	if (rc < 0) {
 		kfree(vs);
 		return ERR_PTR(rc);
 	}
+
+	vs->sock = sock;
+	sk = sock->sk;
 	atomic_set(&vs->refcnt, 1);
 	vs->rcv = rcv;
 	vs->data = data;
 
-	/* Disable multicast loopback */
-	inet_sk(sk)->mc_loop = 0;
 	spin_lock(&vn->sock_lock);
 	hlist_add_head_rcu(&vs->hlist, vs_head(net, port));
 	spin_unlock(&vn->sock_lock);
@@ -1728,18 +2139,24 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port,
 	/* Mark socket as an encapsulation socket. */
 	udp_sk(sk)->encap_type = 1;
 	udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv;
-	udp_encap_enable();
+#if IS_ENABLED(CONFIG_IPV6)
+	if (ipv6)
+		ipv6_stub->udpv6_encap_enable();
+	else
+#endif
+		udp_encap_enable();
+
 	return vs;
 }
 
 struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
 				  vxlan_rcv_t *rcv, void *data,
-				  bool no_share)
+				  bool no_share, bool ipv6)
 {
 	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
 	struct vxlan_sock *vs;
 
-	vs = vxlan_socket_create(net, port, rcv, data);
+	vs = vxlan_socket_create(net, port, rcv, data, ipv6);
 	if (!IS_ERR(vs))
 		return vs;
 
@@ -1772,7 +2189,7 @@ static void vxlan_sock_work(struct work_struct *work)
 	__be16 port = vxlan->dst_port;
 	struct vxlan_sock *nvs;
 
-	nvs = vxlan_sock_add(net, port, vxlan_rcv, NULL, false);
+	nvs = vxlan_sock_add(net, port, vxlan_rcv, NULL, false, vxlan->flags & VXLAN_F_IPV6);
 	spin_lock(&vn->sock_lock);
 	if (!IS_ERR(nvs))
 		vxlan_vs_add_dev(nvs, vxlan);
@@ -1789,6 +2206,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev,
 	struct vxlan_rdst *dst = &vxlan->default_dst;
 	__u32 vni;
 	int err;
+	bool use_ipv6 = false;
 
 	if (!data[IFLA_VXLAN_ID])
 		return -EINVAL;
@@ -1796,11 +2214,32 @@ static int vxlan_newlink(struct net *net, struct net_device *dev,
 	vni = nla_get_u32(data[IFLA_VXLAN_ID]);
 	dst->remote_vni = vni;
 
-	if (data[IFLA_VXLAN_GROUP])
-		dst->remote_ip = nla_get_be32(data[IFLA_VXLAN_GROUP]);
+	if (data[IFLA_VXLAN_GROUP]) {
+		dst->remote_ip.sin.sin_addr.s_addr = nla_get_be32(data[IFLA_VXLAN_GROUP]);
+		dst->remote_ip.sa.sa_family = AF_INET;
+	} else if (data[IFLA_VXLAN_GROUP6]) {
+		if (!IS_ENABLED(CONFIG_IPV6))
+			return -EPFNOSUPPORT;
+
+		nla_memcpy(&dst->remote_ip.sin6.sin6_addr, data[IFLA_VXLAN_GROUP6],
+			   sizeof(struct in6_addr));
+		dst->remote_ip.sa.sa_family = AF_INET6;
+		use_ipv6 = true;
+	}
 
-	if (data[IFLA_VXLAN_LOCAL])
-		vxlan->saddr = nla_get_be32(data[IFLA_VXLAN_LOCAL]);
+	if (data[IFLA_VXLAN_LOCAL]) {
+		vxlan->saddr.sin.sin_addr.s_addr = nla_get_be32(data[IFLA_VXLAN_LOCAL]);
+		vxlan->saddr.sa.sa_family = AF_INET;
+	} else if (data[IFLA_VXLAN_LOCAL6]) {
+		if (!IS_ENABLED(CONFIG_IPV6))
+			return -EPFNOSUPPORT;
+
+		/* TODO: respect scope id */
+		nla_memcpy(&vxlan->saddr.sin6.sin6_addr, data[IFLA_VXLAN_LOCAL6],
+			   sizeof(struct in6_addr));
+		vxlan->saddr.sa.sa_family = AF_INET6;
+		use_ipv6 = true;
+	}
 
 	if (data[IFLA_VXLAN_LINK] &&
 	    (dst->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]))) {
@@ -1812,12 +2251,23 @@ static int vxlan_newlink(struct net *net, struct net_device *dev,
 			return -ENODEV;
 		}
 
+#if IS_ENABLED(CONFIG_IPV6)
+		if (use_ipv6) {
+			struct inet6_dev *idev = __in6_dev_get(lowerdev);
+			if (idev && idev->cnf.disable_ipv6) {
+				pr_info("IPv6 is disabled via sysctl\n");
+				return -EPERM;
+			}
+			vxlan->flags |= VXLAN_F_IPV6;
+		}
+#endif
+
 		if (!tb[IFLA_MTU])
-			dev->mtu = lowerdev->mtu - VXLAN_HEADROOM;
+			dev->mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);
 
 		/* update header length based on lower device */
 		dev->hard_header_len = lowerdev->hard_header_len +
-				       VXLAN_HEADROOM;
+				       (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);
 	}
 
 	if (data[IFLA_VXLAN_TOS])
@@ -1868,7 +2318,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev,
 
 	/* create an fdb entry for default destination */
 	err = vxlan_fdb_create(vxlan, all_zeros_mac,
-			       vxlan->default_dst.remote_ip,
+			       &vxlan->default_dst.remote_ip,
 			       NUD_REACHABLE|NUD_PERMANENT,
 			       NLM_F_EXCL|NLM_F_CREATE,
 			       vxlan->dst_port, vxlan->default_dst.remote_vni,
@@ -1905,9 +2355,9 @@ static size_t vxlan_get_size(const struct net_device *dev)
 {
 
 	return nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_ID */
-		nla_total_size(sizeof(__be32)) +/* IFLA_VXLAN_GROUP */
+		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */
 		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LINK */
-		nla_total_size(sizeof(__be32))+	/* IFLA_VXLAN_LOCAL */
+		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
@@ -1934,14 +2384,36 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	if (nla_put_u32(skb, IFLA_VXLAN_ID, dst->remote_vni))
 		goto nla_put_failure;
 
-	if (dst->remote_ip && nla_put_be32(skb, IFLA_VXLAN_GROUP, dst->remote_ip))
-		goto nla_put_failure;
+	if (!vxlan_addr_any(&dst->remote_ip)) {
+		if (dst->remote_ip.sa.sa_family == AF_INET) {
+			if (nla_put_be32(skb, IFLA_VXLAN_GROUP,
+					 dst->remote_ip.sin.sin_addr.s_addr))
+				goto nla_put_failure;
+#if IS_ENABLED(CONFIG_IPV6)
+		} else {
+			if (nla_put(skb, IFLA_VXLAN_GROUP6, sizeof(struct in6_addr),
+				    &dst->remote_ip.sin6.sin6_addr))
+				goto nla_put_failure;
+#endif
+		}
+	}
 
 	if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex))
 		goto nla_put_failure;
 
-	if (vxlan->saddr && nla_put_be32(skb, IFLA_VXLAN_LOCAL, vxlan->saddr))
-		goto nla_put_failure;
+	if (!vxlan_addr_any(&vxlan->saddr)) {
+		if (vxlan->saddr.sa.sa_family == AF_INET) {
+			if (nla_put_be32(skb, IFLA_VXLAN_LOCAL,
+					 vxlan->saddr.sin.sin_addr.s_addr))
+				goto nla_put_failure;
+#if IS_ENABLED(CONFIG_IPV6)
+		} else {
+			if (nla_put(skb, IFLA_VXLAN_LOCAL6, sizeof(struct in6_addr),
+				    &vxlan->saddr.sin6.sin6_addr))
+				goto nla_put_failure;
+#endif
+		}
+	}
 
 	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) ||
 	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) ||
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index ad342e3688a0..d2b88cafa7a2 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -25,7 +25,7 @@ struct vxlan_sock {
 
 struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
 				  vxlan_rcv_t *rcv, void *data,
-				  bool no_share);
+				  bool no_share, bool ipv6);
 
 void vxlan_sock_release(struct vxlan_sock *vs);
 
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 04c0e7a5d484..80394e8dc3a3 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -314,6 +314,8 @@ enum {
 	IFLA_VXLAN_L2MISS,
 	IFLA_VXLAN_L3MISS,
 	IFLA_VXLAN_PORT,	/* destination port */
+	IFLA_VXLAN_GROUP6,
+	IFLA_VXLAN_LOCAL6,
 	__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index 36848bd54a77..a0060245b4e1 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -123,7 +123,7 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
 	vxlan_port = vxlan_vport(vport);
 	strncpy(vxlan_port->name, parms->name, IFNAMSIZ);
 
-	vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true);
+	vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true, false);
 	if (IS_ERR(vs)) {
 		ovs_vport_free(vport);
 		return (void *)vs;
-- 
cgit v1.2.3


From e15a00aafa4b7953ad717d3cb1ad7acf4ff76945 Mon Sep 17 00:00:00 2001
From: Cong Wang <amwang@redhat.com>
Date: Sat, 31 Aug 2013 13:44:34 +0800
Subject: vxlan: add ipv6 route short circuit support

route short circuit only has IPv4 part, this patch adds
the IPv6 part. nd_tbl will be needed.

Cc: David S. Miller <davem@davemloft.net>
Cc: David Stevens <dlstevens@us.ibm.com>
Signed-off-by: Cong Wang <amwang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c    | 30 ++++++++++++++++++++++++++++--
 include/net/addrconf.h |  1 +
 net/ipv6/af_inet6.c    |  1 +
 3 files changed, 30 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index faf131e02a72..c833763fe5c4 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1200,7 +1200,6 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct neighbour *n;
-	struct iphdr *pip;
 
 	if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
 		return false;
@@ -1208,6 +1207,9 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
 	n = NULL;
 	switch (ntohs(eth_hdr(skb)->h_proto)) {
 	case ETH_P_IP:
+	{
+		struct iphdr *pip;
+
 		if (!pskb_may_pull(skb, sizeof(struct iphdr)))
 			return false;
 		pip = ip_hdr(skb);
@@ -1223,6 +1225,29 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
 		}
 
 		break;
+	}
+#if IS_ENABLED(CONFIG_IPV6)
+	case ETH_P_IPV6:
+	{
+		struct ipv6hdr *pip6;
+
+		if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+			return false;
+		pip6 = ipv6_hdr(skb);
+		n = neigh_lookup(ipv6_stub->nd_tbl, &pip6->daddr, dev);
+		if (!n && (vxlan->flags & VXLAN_F_L3MISS)) {
+			union vxlan_addr ipa = {
+				.sin6.sin6_addr = pip6->daddr,
+				.sa.sa_family = AF_INET6,
+			};
+
+			vxlan_ip_miss(dev, &ipa);
+			return false;
+		}
+
+		break;
+	}
+#endif
 	default:
 		return false;
 	}
@@ -1659,7 +1684,8 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
 	did_rsc = false;
 
 	if (f && (f->flags & NTF_ROUTER) && (vxlan->flags & VXLAN_F_RSC) &&
-	    ntohs(eth->h_proto) == ETH_P_IP) {
+	    (ntohs(eth->h_proto) == ETH_P_IP ||
+	     ntohs(eth->h_proto) == ETH_P_IPV6)) {
 		did_rsc = route_shortcircuit(dev, skb);
 		if (did_rsc)
 			f = vxlan_find_mac(vxlan, eth->h_dest);
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 5339cab356bb..bcf957341b62 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -153,6 +153,7 @@ struct ipv6_stub {
 	int (*ipv6_dst_lookup)(struct sock *sk, struct dst_entry **dst,
 				struct flowi6 *fl6);
 	void (*udpv6_encap_enable)(void);
+	struct neigh_table *nd_tbl;
 };
 extern const struct ipv6_stub *ipv6_stub __read_mostly;
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 0c9c22f7487a..1996a7c34f73 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -815,6 +815,7 @@ static const struct ipv6_stub ipv6_stub_impl = {
 	.ipv6_sock_mc_drop = ipv6_sock_mc_drop,
 	.ipv6_dst_lookup = ip6_dst_lookup,
 	.udpv6_encap_enable = udpv6_encap_enable,
+	.nd_tbl	= &nd_tbl,
 };
 
 static int __init inet6_init(void)
-- 
cgit v1.2.3


From f39dc1023d6b9933528638a0c2dd618b4fdf664e Mon Sep 17 00:00:00 2001
From: Cong Wang <amwang@redhat.com>
Date: Sat, 31 Aug 2013 13:44:35 +0800
Subject: ipv6: move in6_dev_finish_destroy() into core kernel

in6_dev_put() will be needed by vxlan module, so is
in6_dev_finish_destroy().

Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Cong Wang <amwang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c      | 30 ------------------------------
 net/ipv6/addrconf_core.c | 31 +++++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+), 30 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index baaaead69ee1..2a66eaad047f 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -304,36 +304,6 @@ err_ip:
 	return -ENOMEM;
 }
 
-static void snmp6_free_dev(struct inet6_dev *idev)
-{
-	kfree(idev->stats.icmpv6msgdev);
-	kfree(idev->stats.icmpv6dev);
-	snmp_mib_free((void __percpu **)idev->stats.ipv6);
-}
-
-/* Nobody refers to this device, we may destroy it. */
-
-void in6_dev_finish_destroy(struct inet6_dev *idev)
-{
-	struct net_device *dev = idev->dev;
-
-	WARN_ON(!list_empty(&idev->addr_list));
-	WARN_ON(idev->mc_list != NULL);
-	WARN_ON(timer_pending(&idev->rs_timer));
-
-#ifdef NET_REFCNT_DEBUG
-	pr_debug("%s: %s\n", __func__, dev ? dev->name : "NIL");
-#endif
-	dev_put(dev);
-	if (!idev->dead) {
-		pr_warn("Freeing alive inet6 device %p\n", idev);
-		return;
-	}
-	snmp6_free_dev(idev);
-	kfree_rcu(idev, rcu);
-}
-EXPORT_SYMBOL(in6_dev_finish_destroy);
-
 static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
 {
 	struct inet6_dev *ndev;
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index a864033d9e78..4c11cbcf8308 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -6,6 +6,7 @@
 #include <linux/export.h>
 #include <net/ipv6.h>
 #include <net/addrconf.h>
+#include <net/ip.h>
 
 #define IPV6_ADDR_SCOPE_TYPE(scope)	((scope) << 16)
 
@@ -117,3 +118,33 @@ const struct in6_addr in6addr_interfacelocal_allrouters = IN6ADDR_INTERFACELOCAL
 EXPORT_SYMBOL(in6addr_interfacelocal_allrouters);
 const struct in6_addr in6addr_sitelocal_allrouters = IN6ADDR_SITELOCAL_ALLROUTERS_INIT;
 EXPORT_SYMBOL(in6addr_sitelocal_allrouters);
+
+static void snmp6_free_dev(struct inet6_dev *idev)
+{
+	kfree(idev->stats.icmpv6msgdev);
+	kfree(idev->stats.icmpv6dev);
+	snmp_mib_free((void __percpu **)idev->stats.ipv6);
+}
+
+/* Nobody refers to this device, we may destroy it. */
+
+void in6_dev_finish_destroy(struct inet6_dev *idev)
+{
+	struct net_device *dev = idev->dev;
+
+	WARN_ON(!list_empty(&idev->addr_list));
+	WARN_ON(idev->mc_list != NULL);
+	WARN_ON(timer_pending(&idev->rs_timer));
+
+#ifdef NET_REFCNT_DEBUG
+	pr_debug("%s: %s\n", __func__, dev ? dev->name : "NIL");
+#endif
+	dev_put(dev);
+	if (!idev->dead) {
+		pr_warn("Freeing alive inet6 device %p\n", idev);
+		return;
+	}
+	snmp6_free_dev(idev);
+	kfree_rcu(idev, rcu);
+}
+EXPORT_SYMBOL(in6_dev_finish_destroy);
-- 
cgit v1.2.3


From f564f45c451809aa3b74f577754528520d315ac1 Mon Sep 17 00:00:00 2001
From: Cong Wang <amwang@redhat.com>
Date: Sat, 31 Aug 2013 13:44:36 +0800
Subject: vxlan: add ipv6 proxy support

This patch adds the IPv6 version of "arp_reduce", ndisc_send_na()
will be needed.

Cc: David S. Miller <davem@davemloft.net>
Cc: David Stevens <dlstevens@us.ibm.com>
Signed-off-by: Cong Wang <amwang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c    | 82 ++++++++++++++++++++++++++++++++++++++++++++++++--
 include/net/addrconf.h |  4 +++
 include/net/ndisc.h    |  5 +++
 net/ipv6/af_inet6.c    |  2 ++
 net/ipv6/ndisc.c       |  8 ++---
 5 files changed, 95 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index c833763fe5c4..3ffb22d684a9 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1196,6 +1196,70 @@ out:
 	return NETDEV_TX_OK;
 }
 
+#if IS_ENABLED(CONFIG_IPV6)
+static int neigh_reduce(struct net_device *dev, struct sk_buff *skb)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct neighbour *n;
+	union vxlan_addr ipa;
+	const struct ipv6hdr *iphdr;
+	const struct in6_addr *saddr, *daddr;
+	struct nd_msg *msg;
+	struct inet6_dev *in6_dev = NULL;
+
+	in6_dev = __in6_dev_get(dev);
+	if (!in6_dev)
+		goto out;
+
+	if (!pskb_may_pull(skb, skb->len))
+		goto out;
+
+	iphdr = ipv6_hdr(skb);
+	saddr = &iphdr->saddr;
+	daddr = &iphdr->daddr;
+
+	if (ipv6_addr_loopback(daddr) ||
+	    ipv6_addr_is_multicast(daddr))
+		goto out;
+
+	msg = (struct nd_msg *)skb_transport_header(skb);
+	if (msg->icmph.icmp6_code != 0 ||
+	    msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION)
+		goto out;
+
+	n = neigh_lookup(ipv6_stub->nd_tbl, daddr, dev);
+
+	if (n) {
+		struct vxlan_fdb *f;
+
+		if (!(n->nud_state & NUD_CONNECTED)) {
+			neigh_release(n);
+			goto out;
+		}
+
+		f = vxlan_find_mac(vxlan, n->ha);
+		if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
+			/* bridge-local neighbor */
+			neigh_release(n);
+			goto out;
+		}
+
+		ipv6_stub->ndisc_send_na(dev, n, saddr, &msg->target,
+					 !!in6_dev->cnf.forwarding,
+					 true, false, false);
+		neigh_release(n);
+	} else if (vxlan->flags & VXLAN_F_L3MISS) {
+		ipa.sin6.sin6_addr = *daddr;
+		ipa.sa.sa_family = AF_INET6;
+		vxlan_ip_miss(dev, &ipa);
+	}
+
+out:
+	consume_skb(skb);
+	return NETDEV_TX_OK;
+}
+#endif
+
 static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
@@ -1677,8 +1741,22 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
 	skb_reset_mac_header(skb);
 	eth = eth_hdr(skb);
 
-	if ((vxlan->flags & VXLAN_F_PROXY) && ntohs(eth->h_proto) == ETH_P_ARP)
-		return arp_reduce(dev, skb);
+	if ((vxlan->flags & VXLAN_F_PROXY)) {
+		if (ntohs(eth->h_proto) == ETH_P_ARP)
+			return arp_reduce(dev, skb);
+#if IS_ENABLED(CONFIG_IPV6)
+		else if (ntohs(eth->h_proto) == ETH_P_IPV6 &&
+			 skb->len >= sizeof(struct ipv6hdr) + sizeof(struct nd_msg) &&
+			 ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
+				struct nd_msg *msg;
+
+				msg = (struct nd_msg *)skb_transport_header(skb);
+				if (msg->icmph.icmp6_code == 0 &&
+				    msg->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION)
+					return neigh_reduce(dev, skb);
+		}
+#endif
+	}
 
 	f = vxlan_find_mac(vxlan, eth->h_dest);
 	did_rsc = false;
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index bcf957341b62..fb314de2b61b 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -153,6 +153,10 @@ struct ipv6_stub {
 	int (*ipv6_dst_lookup)(struct sock *sk, struct dst_entry **dst,
 				struct flowi6 *fl6);
 	void (*udpv6_encap_enable)(void);
+	void (*ndisc_send_na)(struct net_device *dev, struct neighbour *neigh,
+			      const struct in6_addr *daddr,
+			      const struct in6_addr *solicited_addr,
+			      bool router, bool solicited, bool override, bool inc_opt);
 	struct neigh_table *nd_tbl;
 };
 extern const struct ipv6_stub *ipv6_stub __read_mostly;
diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index 6fea32340ae8..3c4211f0bed6 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -204,6 +204,11 @@ extern void			ndisc_send_ns(struct net_device *dev,
 extern void			ndisc_send_rs(struct net_device *dev,
 					      const struct in6_addr *saddr,
 					      const struct in6_addr *daddr);
+extern void			ndisc_send_na(struct net_device *dev, struct neighbour *neigh,
+					      const struct in6_addr *daddr,
+					      const struct in6_addr *solicited_addr,
+					      bool router, bool solicited, bool override,
+					      bool inc_opt);
 
 extern void			ndisc_send_redirect(struct sk_buff *skb,
 						    const struct in6_addr *target);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 1996a7c34f73..136fe55c1a47 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -56,6 +56,7 @@
 #include <net/transp_v6.h>
 #include <net/ip6_route.h>
 #include <net/addrconf.h>
+#include <net/ndisc.h>
 #ifdef CONFIG_IPV6_TUNNEL
 #include <net/ip6_tunnel.h>
 #endif
@@ -815,6 +816,7 @@ static const struct ipv6_stub ipv6_stub_impl = {
 	.ipv6_sock_mc_drop = ipv6_sock_mc_drop,
 	.ipv6_dst_lookup = ip6_dst_lookup,
 	.udpv6_encap_enable = udpv6_encap_enable,
+	.ndisc_send_na = ndisc_send_na,
 	.nd_tbl	= &nd_tbl,
 };
 
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index bb6fd95a5cd1..14bd2f9d9dbb 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -461,10 +461,10 @@ static void ndisc_send_skb(struct sk_buff *skb,
 	rcu_read_unlock();
 }
 
-static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh,
-			  const struct in6_addr *daddr,
-			  const struct in6_addr *solicited_addr,
-			  bool router, bool solicited, bool override, bool inc_opt)
+void ndisc_send_na(struct net_device *dev, struct neighbour *neigh,
+		   const struct in6_addr *daddr,
+		   const struct in6_addr *solicited_addr,
+		   bool router, bool solicited, bool override, bool inc_opt)
 {
 	struct sk_buff *skb;
 	struct in6_addr tmpaddr;
-- 
cgit v1.2.3


From d949d826c09fb65e230f55868ff70dc581ec06fa Mon Sep 17 00:00:00 2001
From: Cong Wang <amwang@redhat.com>
Date: Sat, 31 Aug 2013 13:44:37 +0800
Subject: ipv6: Add generic UDP Tunnel segmentation

Similar to commit 731362674580cb0c696cd1b1a03d8461a10cf90a
(tunneling: Add generic Tunnel segmentation)

This patch adds generic tunneling offloading support for
IPv6-UDP based tunnels.

This can be used by tunneling protocols like VXLAN.

Cc: Jesse Gross <jesse@nicira.com>
Cc: Pravin B Shelar <pshelar@nicira.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Cong Wang <amwang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_offload.c |   4 +-
 net/ipv6/udp_offload.c | 159 +++++++++++++++++++++++++++++++++----------------
 2 files changed, 111 insertions(+), 52 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index a263b990ee11..d82de7228100 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -91,6 +91,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 	unsigned int unfrag_ip6hlen;
 	u8 *prevhdr;
 	int offset = 0;
+	bool tunnel;
 
 	if (unlikely(skb_shinfo(skb)->gso_type &
 		     ~(SKB_GSO_UDP |
@@ -106,6 +107,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 	if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h))))
 		goto out;
 
+	tunnel = skb->encapsulation;
 	ipv6h = ipv6_hdr(skb);
 	__skb_pull(skb, sizeof(*ipv6h));
 	segs = ERR_PTR(-EPROTONOSUPPORT);
@@ -126,7 +128,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 		ipv6h = ipv6_hdr(skb);
 		ipv6h->payload_len = htons(skb->len - skb->mac_len -
 					   sizeof(*ipv6h));
-		if (proto == IPPROTO_UDP) {
+		if (!tunnel && proto == IPPROTO_UDP) {
 			unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr);
 			fptr = (struct frag_hdr *)(skb_network_header(skb) +
 				unfrag_ip6hlen);
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 5d1b8d7ac993..7e5e5acc9619 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -21,26 +21,79 @@ static int udp6_ufo_send_check(struct sk_buff *skb)
 	const struct ipv6hdr *ipv6h;
 	struct udphdr *uh;
 
-	/* UDP Tunnel offload on ipv6 is not yet supported. */
-	if (skb->encapsulation)
-		return -EINVAL;
-
 	if (!pskb_may_pull(skb, sizeof(*uh)))
 		return -EINVAL;
 
-	ipv6h = ipv6_hdr(skb);
-	uh = udp_hdr(skb);
+	if (likely(!skb->encapsulation)) {
+		ipv6h = ipv6_hdr(skb);
+		uh = udp_hdr(skb);
+
+		uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len,
+					     IPPROTO_UDP, 0);
+		skb->csum_start = skb_transport_header(skb) - skb->head;
+		skb->csum_offset = offsetof(struct udphdr, check);
+		skb->ip_summed = CHECKSUM_PARTIAL;
+	}
 
-	uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len,
-				     IPPROTO_UDP, 0);
-	skb->csum_start = skb_transport_header(skb) - skb->head;
-	skb->csum_offset = offsetof(struct udphdr, check);
-	skb->ip_summed = CHECKSUM_PARTIAL;
 	return 0;
 }
 
+static struct sk_buff *skb_udp6_tunnel_segment(struct sk_buff *skb,
+					       netdev_features_t features)
+{
+	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	int mac_len = skb->mac_len;
+	int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
+	int outer_hlen;
+	netdev_features_t enc_features;
+
+	if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
+		goto out;
+
+	skb->encapsulation = 0;
+	__skb_pull(skb, tnl_hlen);
+	skb_reset_mac_header(skb);
+	skb_set_network_header(skb, skb_inner_network_offset(skb));
+	skb->mac_len = skb_inner_network_offset(skb);
+
+	/* segment inner packet. */
+	enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
+	segs = skb_mac_gso_segment(skb, enc_features);
+	if (!segs || IS_ERR(segs))
+		goto out;
+
+	outer_hlen = skb_tnl_header_len(skb);
+	skb = segs;
+	do {
+		struct udphdr *uh;
+		struct ipv6hdr *ipv6h;
+		int udp_offset = outer_hlen - tnl_hlen;
+		u32 len;
+
+		skb->mac_len = mac_len;
+
+		skb_push(skb, outer_hlen);
+		skb_reset_mac_header(skb);
+		skb_set_network_header(skb, mac_len);
+		skb_set_transport_header(skb, udp_offset);
+		uh = udp_hdr(skb);
+		uh->len = htons(skb->len - udp_offset);
+		ipv6h = ipv6_hdr(skb);
+		len = skb->len - udp_offset;
+
+		uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
+					     len, IPPROTO_UDP, 0);
+		uh->check = csum_fold(skb_checksum(skb, udp_offset, len, 0));
+		if (uh->check == 0)
+			uh->check = CSUM_MANGLED_0;
+		skb->ip_summed = CHECKSUM_NONE;
+	} while ((skb = skb->next));
+out:
+	return segs;
+}
+
 static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
-	netdev_features_t features)
+					 netdev_features_t features)
 {
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	unsigned int mss;
@@ -75,47 +128,51 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 		goto out;
 	}
 
-	/* Do software UFO. Complete and fill in the UDP checksum as HW cannot
-	 * do checksum of UDP packets sent as multiple IP fragments.
-	 */
-	offset = skb_checksum_start_offset(skb);
-	csum = skb_checksum(skb, offset, skb->len - offset, 0);
-	offset += skb->csum_offset;
-	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
-	skb->ip_summed = CHECKSUM_NONE;
-
-	/* Check if there is enough headroom to insert fragment header. */
-	tnl_hlen = skb_tnl_header_len(skb);
-	if (skb_headroom(skb) < (tnl_hlen + frag_hdr_sz)) {
-		if (gso_pskb_expand_head(skb, tnl_hlen + frag_hdr_sz))
-			goto out;
+	if (skb->encapsulation && skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL)
+		segs = skb_udp6_tunnel_segment(skb, features);
+	else {
+		/* Do software UFO. Complete and fill in the UDP checksum as HW cannot
+		 * do checksum of UDP packets sent as multiple IP fragments.
+		 */
+		offset = skb_checksum_start_offset(skb);
+		csum = skb_checksum(skb, offset, skb->len - offset, 0);
+		offset += skb->csum_offset;
+		*(__sum16 *)(skb->data + offset) = csum_fold(csum);
+		skb->ip_summed = CHECKSUM_NONE;
+
+		/* Check if there is enough headroom to insert fragment header. */
+		tnl_hlen = skb_tnl_header_len(skb);
+		if (skb_headroom(skb) < (tnl_hlen + frag_hdr_sz)) {
+			if (gso_pskb_expand_head(skb, tnl_hlen + frag_hdr_sz))
+				goto out;
+		}
+
+		/* Find the unfragmentable header and shift it left by frag_hdr_sz
+		 * bytes to insert fragment header.
+		 */
+		unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr);
+		nexthdr = *prevhdr;
+		*prevhdr = NEXTHDR_FRAGMENT;
+		unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) +
+			     unfrag_ip6hlen + tnl_hlen;
+		packet_start = (u8 *) skb->head + SKB_GSO_CB(skb)->mac_offset;
+		memmove(packet_start-frag_hdr_sz, packet_start, unfrag_len);
+
+		SKB_GSO_CB(skb)->mac_offset -= frag_hdr_sz;
+		skb->mac_header -= frag_hdr_sz;
+		skb->network_header -= frag_hdr_sz;
+
+		fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen);
+		fptr->nexthdr = nexthdr;
+		fptr->reserved = 0;
+		ipv6_select_ident(fptr, (struct rt6_info *)skb_dst(skb));
+
+		/* Fragment the skb. ipv6 header and the remaining fields of the
+		 * fragment header are updated in ipv6_gso_segment()
+		 */
+		segs = skb_segment(skb, features);
 	}
 
-	/* Find the unfragmentable header and shift it left by frag_hdr_sz
-	 * bytes to insert fragment header.
-	 */
-	unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr);
-	nexthdr = *prevhdr;
-	*prevhdr = NEXTHDR_FRAGMENT;
-	unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) +
-		     unfrag_ip6hlen + tnl_hlen;
-	packet_start = (u8 *) skb->head + SKB_GSO_CB(skb)->mac_offset;
-	memmove(packet_start-frag_hdr_sz, packet_start, unfrag_len);
-
-	SKB_GSO_CB(skb)->mac_offset -= frag_hdr_sz;
-	skb->mac_header -= frag_hdr_sz;
-	skb->network_header -= frag_hdr_sz;
-
-	fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen);
-	fptr->nexthdr = nexthdr;
-	fptr->reserved = 0;
-	ipv6_select_ident(fptr, (struct rt6_info *)skb_dst(skb));
-
-	/* Fragment the skb. ipv6 header and the remaining fields of the
-	 * fragment header are updated in ipv6_gso_segment()
-	 */
-	segs = skb_segment(skb, features);
-
 out:
 	return segs;
 }
-- 
cgit v1.2.3


From eb3c0d83cc78361a28e52e514a7095fdbf771e7e Mon Sep 17 00:00:00 2001
From: Cong Wang <amwang@redhat.com>
Date: Sat, 31 Aug 2013 13:44:38 +0800
Subject: net: unify skb_udp_tunnel_segment() and skb_udp6_tunnel_segment()

As suggested by Pravin, we can unify the code in case of duplicated
code.

Cc: Pravin Shelar <pshelar@nicira.com>
Signed-off-by: Cong Wang <amwang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c         | 13 +++++++++++-
 net/ipv6/udp_offload.c | 56 +-------------------------------------------------
 2 files changed, 13 insertions(+), 56 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 0b24508bcdc4..74d2c95db57f 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2337,7 +2337,7 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
 		uh->len = htons(skb->len - udp_offset);
 
 		/* csum segment if tunnel sets skb with csum. */
-		if (unlikely(uh->check)) {
+		if (protocol == htons(ETH_P_IP) && unlikely(uh->check)) {
 			struct iphdr *iph = ip_hdr(skb);
 
 			uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
@@ -2348,7 +2348,18 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
 			if (uh->check == 0)
 				uh->check = CSUM_MANGLED_0;
 
+		} else if (protocol == htons(ETH_P_IPV6)) {
+			struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+			u32 len = skb->len - udp_offset;
+
+			uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
+						     len, IPPROTO_UDP, 0);
+			uh->check = csum_fold(skb_checksum(skb, udp_offset, len, 0));
+			if (uh->check == 0)
+				uh->check = CSUM_MANGLED_0;
+			skb->ip_summed = CHECKSUM_NONE;
 		}
+
 		skb->protocol = protocol;
 	} while ((skb = skb->next));
 out:
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 7e5e5acc9619..60559511bd9c 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -38,60 +38,6 @@ static int udp6_ufo_send_check(struct sk_buff *skb)
 	return 0;
 }
 
-static struct sk_buff *skb_udp6_tunnel_segment(struct sk_buff *skb,
-					       netdev_features_t features)
-{
-	struct sk_buff *segs = ERR_PTR(-EINVAL);
-	int mac_len = skb->mac_len;
-	int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
-	int outer_hlen;
-	netdev_features_t enc_features;
-
-	if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
-		goto out;
-
-	skb->encapsulation = 0;
-	__skb_pull(skb, tnl_hlen);
-	skb_reset_mac_header(skb);
-	skb_set_network_header(skb, skb_inner_network_offset(skb));
-	skb->mac_len = skb_inner_network_offset(skb);
-
-	/* segment inner packet. */
-	enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
-	segs = skb_mac_gso_segment(skb, enc_features);
-	if (!segs || IS_ERR(segs))
-		goto out;
-
-	outer_hlen = skb_tnl_header_len(skb);
-	skb = segs;
-	do {
-		struct udphdr *uh;
-		struct ipv6hdr *ipv6h;
-		int udp_offset = outer_hlen - tnl_hlen;
-		u32 len;
-
-		skb->mac_len = mac_len;
-
-		skb_push(skb, outer_hlen);
-		skb_reset_mac_header(skb);
-		skb_set_network_header(skb, mac_len);
-		skb_set_transport_header(skb, udp_offset);
-		uh = udp_hdr(skb);
-		uh->len = htons(skb->len - udp_offset);
-		ipv6h = ipv6_hdr(skb);
-		len = skb->len - udp_offset;
-
-		uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
-					     len, IPPROTO_UDP, 0);
-		uh->check = csum_fold(skb_checksum(skb, udp_offset, len, 0));
-		if (uh->check == 0)
-			uh->check = CSUM_MANGLED_0;
-		skb->ip_summed = CHECKSUM_NONE;
-	} while ((skb = skb->next));
-out:
-	return segs;
-}
-
 static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 					 netdev_features_t features)
 {
@@ -129,7 +75,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 	}
 
 	if (skb->encapsulation && skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL)
-		segs = skb_udp6_tunnel_segment(skb, features);
+		segs = skb_udp_tunnel_segment(skb, features);
 	else {
 		/* Do software UFO. Complete and fill in the UDP checksum as HW cannot
 		 * do checksum of UDP packets sent as multiple IP fragments.
-- 
cgit v1.2.3


From 5a17a390de7bdbcfff9b8f344273a886ca4cf8bf Mon Sep 17 00:00:00 2001
From: Cong Wang <amwang@redhat.com>
Date: Mon, 2 Sep 2013 10:06:53 +0800
Subject: net: make snmp_mib_free static inline

Fengguang reported:

   net/built-in.o: In function `in6_dev_finish_destroy':
   (.text+0x4ca7d): undefined reference to `snmp_mib_free'

this is due to snmp_mib_free() is defined when CONFIG_INET is enabled,
but in6_dev_finish_destroy() is now moved to core kernel.

I think snmp_mib_free() is small enough to be inlined, so just make it
static inline.

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Cong Wang <amwang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h   | 12 +++++++++++-
 net/ipv4/af_inet.c | 12 ------------
 2 files changed, 11 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/include/net/ip.h b/include/net/ip.h
index a68f838a132c..48f55979d842 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -194,7 +194,17 @@ static inline u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp
 }
 #endif
 extern int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align);
-extern void snmp_mib_free(void __percpu *ptr[2]);
+
+static inline void snmp_mib_free(void __percpu *ptr[SNMP_ARRAY_SZ])
+{
+	int i;
+
+	BUG_ON(ptr == NULL);
+	for (i = 0; i < SNMP_ARRAY_SZ; i++) {
+		free_percpu(ptr[i]);
+		ptr[i] = NULL;
+	}
+}
 
 extern struct local_ports {
 	seqlock_t	lock;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index b4d0be2b7ce9..7a1874b7b8fd 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1532,18 +1532,6 @@ int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align)
 }
 EXPORT_SYMBOL_GPL(snmp_mib_init);
 
-void snmp_mib_free(void __percpu *ptr[SNMP_ARRAY_SZ])
-{
-	int i;
-
-	BUG_ON(ptr == NULL);
-	for (i = 0; i < SNMP_ARRAY_SZ; i++) {
-		free_percpu(ptr[i]);
-		ptr[i] = NULL;
-	}
-}
-EXPORT_SYMBOL_GPL(snmp_mib_free);
-
 #ifdef CONFIG_IP_MULTICAST
 static const struct net_protocol igmp_protocol = {
 	.handler =	igmp_rcv,
-- 
cgit v1.2.3


From 6b93f4a1f2f22b23a2e1e4cca1e1aecffee2193e Mon Sep 17 00:00:00 2001
From: Bjørn Mork <bjorn@mork.no>
Date: Fri, 30 Aug 2013 18:08:45 +0200
Subject: net: vlan: inherit addr_assign_type along with dev_addr
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A device inheriting a random or set address should reflect this in
its addr_assign_type.

Cc: Patrick McHardy <kaber@trash.net>
Signed-off-by: Bjørn Mork <bjorn@mork.no>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/8021q/vlan_dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 9ab8a7ed99c0..09bf1c38805b 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -582,7 +582,7 @@ static int vlan_dev_init(struct net_device *dev)
 	dev->dev_id = real_dev->dev_id;
 
 	if (is_zero_ether_addr(dev->dev_addr))
-		memcpy(dev->dev_addr, real_dev->dev_addr, dev->addr_len);
+		eth_hw_addr_inherit(dev, real_dev);
 	if (is_zero_ether_addr(dev->broadcast))
 		memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len);
 
-- 
cgit v1.2.3


From 2fcc8005834a390e999ede2a8933012b92db73ae Mon Sep 17 00:00:00 2001
From: Bjørn Mork <bjorn@mork.no>
Date: Fri, 30 Aug 2013 18:08:46 +0200
Subject: net: dsa: inherit addr_assign_type along with dev_addr
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A device inheriting a random or set address should reflect this in
its addr_assign_type.

Signed-off-by: Bjørn Mork <bjorn@mork.no>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/slave.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 6ebd8fbd9285..29d684ebca6a 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -347,7 +347,7 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent,
 
 	slave_dev->features = master->vlan_features;
 	SET_ETHTOOL_OPS(slave_dev, &dsa_slave_ethtool_ops);
-	memcpy(slave_dev->dev_addr, master->dev_addr, ETH_ALEN);
+	eth_hw_addr_inherit(slave_dev, master);
 	slave_dev->tx_queue_len = 0;
 
 	switch (ds->dst->tag_protocol) {
-- 
cgit v1.2.3


From 3e25c65ed085b361cc91a8f02e028f1158c9f255 Mon Sep 17 00:00:00 2001
From: Tim Gardner <tim.gardner@canonical.com>
Date: Thu, 29 Aug 2013 06:38:47 -0600
Subject: net: neighbour: Remove CONFIG_ARPD

This config option is superfluous in that it only guards a call
to neigh_app_ns(). Enabling CONFIG_ARPD by default has no
change in behavior. There will now be call to __neigh_notify()
for each ARP resolution, which has no impact unless there is a
user space daemon waiting to receive the notification, i.e.,
the case for which CONFIG_ARPD was designed anyways.

Suggested-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: James Morris <jmorris@namei.org>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: Patrick McHardy <kaber@trash.net>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Gao feng <gaofeng@cn.fujitsu.com>
Cc: Joe Perches <joe@perches.com>
Cc: Veaceslav Falico <vfalico@redhat.com>
Signed-off-by: Tim Gardner <tim.gardner@canonical.com>
Reviewed-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/neighbour.c |  2 --
 net/ipv4/Kconfig     | 16 ----------------
 net/ipv4/arp.c       |  2 --
 net/ipv6/ndisc.c     |  2 --
 4 files changed, 22 deletions(-)

(limited to 'net')

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 60533db8b72d..6072610a8672 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -2759,13 +2759,11 @@ errout:
 		rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
 }
 
-#ifdef CONFIG_ARPD
 void neigh_app_ns(struct neighbour *n)
 {
 	__neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST);
 }
 EXPORT_SYMBOL(neigh_app_ns);
-#endif /* CONFIG_ARPD */
 
 #ifdef CONFIG_SYSCTL
 static int zero;
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 37cf1a6ea3ad..05c57f0fcabe 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -259,22 +259,6 @@ config IP_PIMSM_V2
 	  gated-5). This routing protocol is not used widely, so say N unless
 	  you want to play with it.
 
-config ARPD
-	bool "IP: ARP daemon support"
-	---help---
-	  The kernel maintains an internal cache which maps IP addresses to
-	  hardware addresses on the local network, so that Ethernet
-	  frames are sent to the proper address on the physical networking
-	  layer. Normally, kernel uses the ARP protocol to resolve these
-	  mappings.
-
-	  Saying Y here adds support to have an user space daemon to do this
-	  resolution instead. This is useful for implementing an alternate
-	  address resolution protocol (e.g. NHRP on mGRE tunnels) and also for
-	  testing purposes.
-
-	  If unsure, say N.
-
 config SYN_COOKIES
 	bool "IP: TCP syncookie support"
 	---help---
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 4429b013f269..7808093cede6 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -368,9 +368,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
 	} else {
 		probes -= neigh->parms->app_probes;
 		if (probes < 0) {
-#ifdef CONFIG_ARPD
 			neigh_app_ns(neigh);
-#endif
 			return;
 		}
 	}
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 14bd2f9d9dbb..22210650596f 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -662,9 +662,7 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
 		}
 		ndisc_send_ns(dev, neigh, target, target, saddr);
 	} else if ((probes -= neigh->parms->app_probes) < 0) {
-#ifdef CONFIG_ARPD
 		neigh_app_ns(neigh);
-#endif
 	} else {
 		addrconf_addr_solict_mult(target, &mcaddr);
 		ndisc_send_ns(dev, NULL, target, &mcaddr, saddr);
-- 
cgit v1.2.3


From b1b72076b90d631637497b35e36bc64254df199d Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Thu, 29 Aug 2013 18:54:49 +0200
Subject: net: sctp: probe: allow more advanced ingress filtering by mark

This is a follow-up commit for commit b1dcdc68b1f4 ("net: tcp_probe:
allow more advanced ingress filtering by mark") that allows for
advanced SCTP probe module filtering based on skb mark (for a more
detailed description and advantages using mark, refer to b1dcdc68b1f4).
The current option to filter by a given port is still being preserved.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/probe.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/sctp/probe.c b/net/sctp/probe.c
index cd72ae57aff1..53c452efb40b 100644
--- a/net/sctp/probe.c
+++ b/net/sctp/probe.c
@@ -46,6 +46,10 @@ static int port __read_mostly = 0;
 MODULE_PARM_DESC(port, "Port to match (0=all)");
 module_param(port, int, 0);
 
+static unsigned int fwmark __read_mostly = 0;
+MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)");
+module_param(fwmark, uint, 0);
+
 static int bufsize __read_mostly = 64 * 1024;
 MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)");
 module_param(bufsize, int, 0);
@@ -129,15 +133,19 @@ static sctp_disposition_t jsctp_sf_eat_sack(struct net *net,
 					    void *arg,
 					    sctp_cmd_seq_t *commands)
 {
+	struct sctp_chunk *chunk = arg;
+	struct sk_buff *skb = chunk->skb;
 	struct sctp_transport *sp;
 	static __u32 lcwnd = 0;
 	struct timespec now;
 
 	sp = asoc->peer.primary_path;
 
-	if ((full || sp->cwnd != lcwnd) &&
-	    (!port || asoc->peer.port == port ||
-	     ep->base.bind_addr.port == port)) {
+	if (((port == 0 && fwmark == 0) ||
+	     asoc->peer.port == port ||
+	     ep->base.bind_addr.port == port ||
+	     (fwmark > 0 && skb->mark == fwmark)) &&
+	    (full || sp->cwnd != lcwnd)) {
 		lcwnd = sp->cwnd;
 
 		getnstimeofday(&now);
@@ -198,8 +206,8 @@ static __init int sctpprobe_init(void)
 	if (ret)
 		goto remove_proc;
 
-	pr_info("probe registered (port=%d)\n", port);
-
+	pr_info("probe registered (port=%d/fwmark=%u) bufsize=%u\n",
+		port, fwmark, bufsize);
 	return 0;
 
 remove_proc:
-- 
cgit v1.2.3


From 61e76b178dbe7145e8d6afa84bb4ccea71918994 Mon Sep 17 00:00:00 2001
From: Jiri Bohac <jbohac@suse.cz>
Date: Fri, 30 Aug 2013 11:18:45 +0200
Subject: ICMPv6: treat dest unreachable codes 5 and 6 as EACCES, not EPROTO

RFC 4443 has defined two additional codes for ICMPv6 type 1 (destination
unreachable) messages:
        5 - Source address failed ingress/egress policy
	6 - Reject route to destination

Now they are treated as protocol error and icmpv6_err_convert() converts them
to EPROTO.

RFC 4443 says:
	"Codes 5 and 6 are more informative subsets of code 1."

Treat codes 5 and 6 as code 1 (EACCES)

Btw, connect() returning -EPROTO confuses firefox, so that fallback to
other/IPv4 addresses does not work:
https://bugzilla.mozilla.org/show_bug.cgi?id=910773

Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/icmpv6.h |  2 ++
 net/ipv6/icmp.c             | 10 +++++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/uapi/linux/icmpv6.h b/include/uapi/linux/icmpv6.h
index e0133c73c304..590beda78ea0 100644
--- a/include/uapi/linux/icmpv6.h
+++ b/include/uapi/linux/icmpv6.h
@@ -115,6 +115,8 @@ struct icmp6hdr {
 #define ICMPV6_NOT_NEIGHBOUR		2
 #define ICMPV6_ADDR_UNREACH		3
 #define ICMPV6_PORT_UNREACH		4
+#define ICMPV6_POLICY_FAIL		5
+#define ICMPV6_REJECT_ROUTE		6
 
 /*
  *	Codes for Time Exceeded
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 7cfc8d284870..67ae4e0d40bf 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -940,6 +940,14 @@ static const struct icmp6_err {
 		.err	= ECONNREFUSED,
 		.fatal	= 1,
 	},
+	{	/* POLICY_FAIL */
+		.err	= EACCES,
+		.fatal	= 1,
+	},
+	{	/* REJECT_ROUTE	*/
+		.err	= EACCES,
+		.fatal	= 1,
+	},
 };
 
 int icmpv6_err_convert(u8 type, u8 code, int *err)
@@ -951,7 +959,7 @@ int icmpv6_err_convert(u8 type, u8 code, int *err)
 	switch (type) {
 	case ICMPV6_DEST_UNREACH:
 		fatal = 1;
-		if (code <= ICMPV6_PORT_UNREACH) {
+		if (code < ARRAY_SIZE(tab_unreach)) {
 			*err  = tab_unreach[code].err;
 			fatal = tab_unreach[code].fatal;
 		}
-- 
cgit v1.2.3


From 13c7bf0871509723a2b3c054d3d6e9e506464e71 Mon Sep 17 00:00:00 2001
From: Petr Holasek <pholasek@redhat.com>
Date: Fri, 30 Aug 2013 17:02:38 +0200
Subject: ipv6: ipv6_create_tempaddr cleanup

This two-liner removes max_addresses variable which is now unecessary related
to patch [ipv6: remove max_addresses check from ipv6_create_tempaddr].

Signed-off-by: Petr Holasek <pholasek@redhat.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 498ea99194af..05c83dd54a4d 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1054,7 +1054,6 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *i
 	unsigned long regen_advance;
 	int tmp_plen;
 	int ret = 0;
-	int max_addresses;
 	u32 addr_flags;
 	unsigned long now = jiffies;
 
@@ -1100,7 +1099,6 @@ retry:
 				 idev->cnf.temp_prefered_lft + age -
 				 idev->cnf.max_desync_factor);
 	tmp_plen = ifp->prefix_len;
-	max_addresses = idev->cnf.max_addresses;
 	tmp_tstamp = ifp->tstamp;
 	spin_unlock_bh(&ifp->lock);
 
-- 
cgit v1.2.3


From 951fd874c3b014c4abf38a8e588d4687b98fedb4 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sun, 1 Sep 2013 13:11:55 -0700
Subject: llc: Use normal etherdevice.h tests

Convert the llc_<foo> static inlines to the
equivalents from etherdevice.h and remove
the llc_<foo> static inline functions.

llc_mac_null -> is_zero_ether_addr
llc_mac_multicast -> is_multicast_ether_addr
llc_mac_match -> ether_addr_equal

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/llc_if.h | 30 ------------------------------
 net/llc/af_llc.c     |  6 +++---
 net/llc/llc_conn.c   |  6 +++---
 net/llc/llc_sap.c    |  4 ++--
 4 files changed, 8 insertions(+), 38 deletions(-)

(limited to 'net')

diff --git a/include/net/llc_if.h b/include/net/llc_if.h
index b595a004d31b..f0cb909b60eb 100644
--- a/include/net/llc_if.h
+++ b/include/net/llc_if.h
@@ -62,36 +62,6 @@
 #define LLC_STATUS_CONFLICT	7 /* disconnect conn */
 #define LLC_STATUS_RESET_DONE	8 /*  */
 
-/**
- *      llc_mac_null - determines if a address is a null mac address
- *      @mac: Mac address to test if null.
- *
- *      Determines if a given address is a null mac address.  Returns 0 if the
- *      address is not a null mac, 1 if the address is a null mac.
- */
-static inline int llc_mac_null(const u8 *mac)
-{
-	return is_zero_ether_addr(mac);
-}
-
-static inline int llc_mac_multicast(const u8 *mac)
-{
-	return is_multicast_ether_addr(mac);
-}
-/**
- *	llc_mac_match - determines if two mac addresses are the same
- *	@mac1: First mac address to compare.
- *	@mac2: Second mac address to compare.
- *
- *	Determines if two given mac address are the same.  Returns 0 if there
- *	is not a complete match up to len, 1 if a complete match up to len is
- *	found.
- */
-static inline int llc_mac_match(const u8 *mac1, const u8 *mac2)
-{
-	return !compare_ether_addr(mac1, mac2);
-}
-
 extern int llc_establish_connection(struct sock *sk, u8 *lmac,
 				    u8 *dmac, u8 dsap);
 extern int llc_build_and_send_pkt(struct sock *sk, struct sk_buff *skb);
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 48aaa89253e0..6cba486353e8 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -321,12 +321,12 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen)
 		if (llc->dev) {
 			if (!addr->sllc_arphrd)
 				addr->sllc_arphrd = llc->dev->type;
-			if (llc_mac_null(addr->sllc_mac))
+			if (is_zero_ether_addr(addr->sllc_mac))
 				memcpy(addr->sllc_mac, llc->dev->dev_addr,
 				       IFHWADDRLEN);
 			if (addr->sllc_arphrd != llc->dev->type ||
-			    !llc_mac_match(addr->sllc_mac,
-					   llc->dev->dev_addr)) {
+			    !ether_addr_equal(addr->sllc_mac,
+					      llc->dev->dev_addr)) {
 				rc = -EINVAL;
 				llc->dev = NULL;
 			}
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
index 0d0d416dfab6..cd8724177965 100644
--- a/net/llc/llc_conn.c
+++ b/net/llc/llc_conn.c
@@ -478,8 +478,8 @@ static inline bool llc_estab_match(const struct llc_sap *sap,
 
 	return llc->laddr.lsap == laddr->lsap &&
 		llc->daddr.lsap == daddr->lsap &&
-		llc_mac_match(llc->laddr.mac, laddr->mac) &&
-		llc_mac_match(llc->daddr.mac, daddr->mac);
+		ether_addr_equal(llc->laddr.mac, laddr->mac) &&
+		ether_addr_equal(llc->daddr.mac, daddr->mac);
 }
 
 /**
@@ -550,7 +550,7 @@ static inline bool llc_listener_match(const struct llc_sap *sap,
 
 	return sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_LISTEN &&
 		llc->laddr.lsap == laddr->lsap &&
-		llc_mac_match(llc->laddr.mac, laddr->mac);
+		ether_addr_equal(llc->laddr.mac, laddr->mac);
 }
 
 static struct sock *__llc_lookup_listener(struct llc_sap *sap,
diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c
index 78be45cda5c1..e5850699098e 100644
--- a/net/llc/llc_sap.c
+++ b/net/llc/llc_sap.c
@@ -302,7 +302,7 @@ static inline bool llc_dgram_match(const struct llc_sap *sap,
 
      return sk->sk_type == SOCK_DGRAM &&
 	  llc->laddr.lsap == laddr->lsap &&
-	  llc_mac_match(llc->laddr.mac, laddr->mac);
+	  ether_addr_equal(llc->laddr.mac, laddr->mac);
 }
 
 /**
@@ -425,7 +425,7 @@ void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb)
 	llc_pdu_decode_da(skb, laddr.mac);
 	llc_pdu_decode_dsap(skb, &laddr.lsap);
 
-	if (llc_mac_multicast(laddr.mac)) {
+	if (is_multicast_ether_addr(laddr.mac)) {
 		llc_sap_mcast(sap, &laddr, skb);
 		kfree_skb(skb);
 	} else {
-- 
cgit v1.2.3


From c3923b7a3dae842017d1206f43e23cd9ba1148b9 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sun, 1 Sep 2013 15:45:08 -0700
Subject: batman: Remove reference to compare_ether_addr

This function is being removed, rename the reference.

Signed-off-by: Joe Perches <joe@perches.com>
Acked-by: Antonio Quartulli <ordex@autistici.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/batman-adv/main.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 5d00f23b4621..24675523930f 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -254,7 +254,7 @@ static inline void batadv_dbg(int type __always_unused,
 
 /* returns 1 if they are the same ethernet addr
  *
- * note: can't use compare_ether_addr() as it requires aligned memory
+ * note: can't use ether_addr_equal() as it requires aligned memory
  */
 static inline int batadv_compare_eth(const void *data1, const void *data2)
 {
-- 
cgit v1.2.3


From 1372a298ea4f2772655b2e69caa634f132f1d019 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sun, 1 Sep 2013 15:48:27 -0700
Subject: wireless: scan: Remove comment to compare_ether_addr

This function is being removed, so remove the reference to it.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/wireless/scan.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'net')

diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index ad1e4068ce06..eeb71480f1af 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -465,10 +465,6 @@ static int cmp_bss(struct cfg80211_bss *a,
 		}
 	}
 
-	/*
-	 * we can't use compare_ether_addr here since we need a < > operator.
-	 * The binary return value of compare_ether_addr isn't enough
-	 */
 	r = memcmp(a->bssid, b->bssid, sizeof(a->bssid));
 	if (r)
 		return r;
-- 
cgit v1.2.3


From 8b7ed2d91d6afb0b55ba75f94b66e51f70783a46 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Mon, 2 Sep 2013 15:34:54 +0200
Subject: iptunnels: remove net arg from iptunnel_xmit()

This argument is not used, let's remove it.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c         | 3 +--
 include/net/ip_tunnels.h    | 3 +--
 net/ipv4/ip_tunnel.c        | 3 +--
 net/ipv4/ip_tunnel_core.c   | 3 +--
 net/ipv6/sit.c              | 4 ++--
 net/openvswitch/vport-gre.c | 2 +-
 6 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 8f6d6c1153ce..e25c97dfbacd 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1528,8 +1528,7 @@ int vxlan_xmit_skb(struct net *net, struct vxlan_sock *vs,
 	if (err)
 		return err;
 
-	return iptunnel_xmit(net, rt, skb, src, dst,
-			IPPROTO_UDP, tos, ttl, df);
+	return iptunnel_xmit(rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df);
 }
 EXPORT_SYMBOL_GPL(vxlan_xmit_skb);
 
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 0ce316bb3c65..94fe8fd07bc9 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -146,8 +146,7 @@ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph,
 }
 
 int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto);
-int iptunnel_xmit(struct net *net, struct rtable *rt,
-		  struct sk_buff *skb,
+int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb,
 		  __be32 src, __be32 dst, __u8 proto,
 		  __u8 tos, __u8 ttl, __be16 df);
 
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 830de3f4e293..0a6cf0e69478 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -654,8 +654,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 		}
 	}
 
-	err = iptunnel_xmit(tunnel->net, rt, skb,
-			    fl4.saddr, fl4.daddr, protocol,
+	err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, protocol,
 			    ip_tunnel_ecn_encap(tos, inner_iph, skb), ttl, df);
 	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
 
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 850525b34899..e82045842351 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -46,8 +46,7 @@
 #include <net/netns/generic.h>
 #include <net/rtnetlink.h>
 
-int iptunnel_xmit(struct net *net, struct rtable *rt,
-		  struct sk_buff *skb,
+int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb,
 		  __be32 src, __be32 dst, __u8 proto,
 		  __u8 tos, __u8 ttl, __be16 df)
 {
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index f18f842ac893..1d1458a3b7c4 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -888,8 +888,8 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 		ttl = iph6->hop_limit;
 	tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6));
 
-	err = iptunnel_xmit(dev_net(dev), rt, skb, fl4.saddr, fl4.daddr,
-			    IPPROTO_IPV6, tos, ttl, df);
+	err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, IPPROTO_IPV6, tos,
+			    ttl, df);
 	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
 	return NETDEV_TX_OK;
 
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
index 21d5073e148e..9b3713ef831e 100644
--- a/net/openvswitch/vport-gre.c
+++ b/net/openvswitch/vport-gre.c
@@ -176,7 +176,7 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb)
 
 	skb->local_df = 1;
 
-	return iptunnel_xmit(net, rt, skb, fl.saddr,
+	return iptunnel_xmit(rt, skb, fl.saddr,
 			     OVS_CB(skb)->tun_key->ipv4_dst, IPPROTO_GRE,
 			     OVS_CB(skb)->tun_key->ipv4_tos,
 			     OVS_CB(skb)->tun_key->ipv4_ttl, df);
-- 
cgit v1.2.3


From 117961878cc1386923cfddcdd9016b777827c8dd Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Mon, 2 Sep 2013 15:34:55 +0200
Subject: vxlan: remove net arg from vxlan[6]_xmit_skb()

This argument is not used, let's remove it.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c           | 8 ++++----
 include/net/vxlan.h           | 2 +-
 net/openvswitch/vport-vxlan.c | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index e25c97dfbacd..a334bfb91c5a 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1380,7 +1380,7 @@ static int handle_offloads(struct sk_buff *skb)
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
-static int vxlan6_xmit_skb(struct net *net, struct vxlan_sock *vs,
+static int vxlan6_xmit_skb(struct vxlan_sock *vs,
 			   struct dst_entry *dst, struct sk_buff *skb,
 			   struct net_device *dev, struct in6_addr *saddr,
 			   struct in6_addr *daddr, __u8 prio, __u8 ttl,
@@ -1475,7 +1475,7 @@ static int vxlan6_xmit_skb(struct net *net, struct vxlan_sock *vs,
 }
 #endif
 
-int vxlan_xmit_skb(struct net *net, struct vxlan_sock *vs,
+int vxlan_xmit_skb(struct vxlan_sock *vs,
 		   struct rtable *rt, struct sk_buff *skb,
 		   __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
 		   __be16 src_port, __be16 dst_port, __be32 vni)
@@ -1651,7 +1651,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
 		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
 
-		err = vxlan_xmit_skb(dev_net(dev), vxlan->vn_sock, rt, skb,
+		err = vxlan_xmit_skb(vxlan->vn_sock, rt, skb,
 				     fl4.saddr, dst->sin.sin_addr.s_addr,
 				     tos, ttl, df, src_port, dst_port,
 				     htonl(vni << 8));
@@ -1703,7 +1703,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 
 		ttl = ttl ? : ip6_dst_hoplimit(ndst);
 
-		err = vxlan6_xmit_skb(dev_net(dev), vxlan->vn_sock, ndst, skb,
+		err = vxlan6_xmit_skb(vxlan->vn_sock, ndst, skb,
 				      dev, &fl6.saddr, &fl6.daddr, 0, ttl,
 				      src_port, dst_port, htonl(vni << 8));
 #endif
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index d2b88cafa7a2..e09c40b68027 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -29,7 +29,7 @@ struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
 
 void vxlan_sock_release(struct vxlan_sock *vs);
 
-int vxlan_xmit_skb(struct net *net, struct vxlan_sock *vs,
+int vxlan_xmit_skb(struct vxlan_sock *vs,
 		   struct rtable *rt, struct sk_buff *skb,
 		   __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
 		   __be16 src_port, __be16 dst_port, __be32 vni);
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index a0060245b4e1..a481c03e2861 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -176,7 +176,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
 	inet_get_local_port_range(&port_min, &port_max);
 	src_port = vxlan_src_port(port_min, port_max, skb);
 
-	err = vxlan_xmit_skb(net, vxlan_port->vs, rt, skb,
+	err = vxlan_xmit_skb(vxlan_port->vs, rt, skb,
 			     fl.saddr, OVS_CB(skb)->tun_key->ipv4_dst,
 			     OVS_CB(skb)->tun_key->ipv4_tos,
 			     OVS_CB(skb)->tun_key->ipv4_ttl, df,
-- 
cgit v1.2.3


From 8b27f27797cac5ed9b2f3e63dac89a7ae70e70a7 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Mon, 2 Sep 2013 15:34:56 +0200
Subject: skb: allow skb_scrub_packet() to be used by tunnels

This function was only used when a packet was sent to another netns. Now, it can
also be used after tunnel encapsulation or decapsulation.

Only skb_orphan() should not be done when a packet is not crossing netns.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  2 +-
 net/core/dev.c         |  2 +-
 net/core/skbuff.c      | 19 ++++++++++++-------
 net/ipv4/ip_tunnel.c   |  4 ++--
 net/ipv6/ip6_tunnel.c  |  4 ++--
 net/ipv6/sit.c         |  4 ++--
 6 files changed, 20 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6f1330af1ebb..2ddb48d9312c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2392,7 +2392,7 @@ extern void	       skb_split(struct sk_buff *skb,
 				 struct sk_buff *skb1, const u32 len);
 extern int	       skb_shift(struct sk_buff *tgt, struct sk_buff *skb,
 				 int shiftlen);
-extern void	       skb_scrub_packet(struct sk_buff *skb);
+extern void	       skb_scrub_packet(struct sk_buff *skb, bool xnet);
 
 extern struct sk_buff *skb_segment(struct sk_buff *skb,
 				   netdev_features_t features);
diff --git a/net/core/dev.c b/net/core/dev.c
index 6fbb0c90849b..07684e880a5d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1697,7 +1697,7 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 	 * call skb_scrub_packet() after it to clear pkt_type _after_ calling
 	 * eth_type_trans().
 	 */
-	skb_scrub_packet(skb);
+	skb_scrub_packet(skb, true);
 
 	return netif_rx(skb);
 }
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2c3d0f53d198..d81cff119f73 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3500,17 +3500,22 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
 EXPORT_SYMBOL(skb_try_coalesce);
 
 /**
- * skb_scrub_packet - scrub an skb before sending it to another netns
+ * skb_scrub_packet - scrub an skb
  *
  * @skb: buffer to clean
- *
- * skb_scrub_packet can be used to clean an skb before injecting it in
- * another namespace. We have to clear all information in the skb that
- * could impact namespace isolation.
+ * @xnet: packet is crossing netns
+ *
+ * skb_scrub_packet can be used after encapsulating or decapsulting a packet
+ * into/from a tunnel. Some information have to be cleared during these
+ * operations.
+ * skb_scrub_packet can also be used to clean a skb before injecting it in
+ * another namespace (@xnet == true). We have to clear all information in the
+ * skb that could impact namespace isolation.
  */
-void skb_scrub_packet(struct sk_buff *skb)
+void skb_scrub_packet(struct sk_buff *skb, bool xnet)
 {
-	skb_orphan(skb);
+	if (xnet)
+		skb_orphan(skb);
 	skb->tstamp.tv64 = 0;
 	skb->pkt_type = PACKET_HOST;
 	skb->skb_iif = 0;
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 0a6cf0e69478..b0e74e17088f 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -462,7 +462,7 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
 	}
 
 	if (!net_eq(tunnel->net, dev_net(tunnel->dev)))
-		skb_scrub_packet(skb);
+		skb_scrub_packet(skb, true);
 
 	gro_cells_receive(&tunnel->gro_cells, skb);
 	return 0;
@@ -615,7 +615,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 	}
 
 	if (!net_eq(tunnel->net, dev_net(dev)))
-		skb_scrub_packet(skb);
+		skb_scrub_packet(skb, true);
 
 	if (tunnel->err_count > 0) {
 		if (time_before(jiffies,
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index d6e00a39274c..72372ac90159 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -830,7 +830,7 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol,
 		tstats->rx_bytes += skb->len;
 
 		if (!net_eq(t->net, dev_net(t->dev)))
-			skb_scrub_packet(skb);
+			skb_scrub_packet(skb, true);
 
 		netif_rx(skb);
 
@@ -1002,7 +1002,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
 	}
 
 	if (!net_eq(t->net, dev_net(dev)))
-		skb_scrub_packet(skb);
+		skb_scrub_packet(skb, true);
 
 	/*
 	 * Okay, now see if we can stuff it in the buffer as-is.
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 1d1458a3b7c4..b2e44f478e14 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -622,7 +622,7 @@ static int ipip6_rcv(struct sk_buff *skb)
 		tstats->rx_bytes += skb->len;
 
 		if (!net_eq(tunnel->net, dev_net(tunnel->dev)))
-			skb_scrub_packet(skb);
+			skb_scrub_packet(skb, true);
 		netif_rx(skb);
 
 		return 0;
@@ -861,7 +861,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 	}
 
 	if (!net_eq(tunnel->net, dev_net(dev)))
-		skb_scrub_packet(skb);
+		skb_scrub_packet(skb, true);
 
 	/*
 	 * Okay, now see if we can stuff it in the buffer as-is.
-- 
cgit v1.2.3


From 963a88b31ddbbe99f38502239b1a46601773d217 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Mon, 2 Sep 2013 15:34:57 +0200
Subject: tunnels: harmonize cleanup done on skb on xmit path

The goal of this patch is to harmonize cleanup done on a skbuff on xmit path.
Before this patch, behaviors were different depending of the tunnel type.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c         | 6 ++++--
 include/net/ip6_tunnel.h    | 1 -
 include/net/ip_tunnels.h    | 2 +-
 net/ipv4/ip_tunnel.c        | 6 ++----
 net/ipv4/ip_tunnel_core.c   | 7 +++----
 net/ipv6/ip6_gre.c          | 4 ++--
 net/ipv6/ip6_tunnel.c       | 4 +---
 net/ipv6/sit.c              | 5 +----
 net/openvswitch/vport-gre.c | 2 +-
 9 files changed, 15 insertions(+), 22 deletions(-)

(limited to 'net')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index a334bfb91c5a..ebda3a1c2f31 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1397,6 +1397,8 @@ static int vxlan6_xmit_skb(struct vxlan_sock *vs,
 		skb->encapsulation = 1;
 	}
 
+	skb_scrub_packet(skb, false);
+
 	min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
 			+ VXLAN_HLEN + sizeof(struct ipv6hdr)
 			+ (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
@@ -1432,7 +1434,6 @@ static int vxlan6_xmit_skb(struct vxlan_sock *vs,
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
 			      IPSKB_REROUTED);
-	skb_dst_drop(skb);
 	skb_dst_set(skb, dst);
 
 	if (!skb_is_gso(skb) && !(dst->dev->features & NETIF_F_IPV6_CSUM)) {
@@ -1528,7 +1529,8 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
 	if (err)
 		return err;
 
-	return iptunnel_xmit(rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df);
+	return iptunnel_xmit(rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df,
+			     false);
 }
 EXPORT_SYMBOL_GPL(vxlan_xmit_skb);
 
diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
index 2265b0bf97e5..6d1549c4893c 100644
--- a/include/net/ip6_tunnel.h
+++ b/include/net/ip6_tunnel.h
@@ -75,7 +75,6 @@ static inline void ip6tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct net_device_stats *stats = &dev->stats;
 	int pkt_len, err;
 
-	nf_reset(skb);
 	pkt_len = skb->len;
 	err = ip6_local_out(skb);
 
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 94fe8fd07bc9..a0a4a100f5c9 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -148,7 +148,7 @@ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph,
 int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto);
 int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb,
 		  __be32 src, __be32 dst, __u8 proto,
-		  __u8 tos, __u8 ttl, __be16 df);
+		  __u8 tos, __u8 ttl, __be16 df, bool xnet);
 
 static inline void iptunnel_xmit_stats(int err,
 				       struct net_device_stats *err_stats,
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index b0e74e17088f..88d7d7d1eccb 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -614,9 +614,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 		goto tx_error;
 	}
 
-	if (!net_eq(tunnel->net, dev_net(dev)))
-		skb_scrub_packet(skb, true);
-
 	if (tunnel->err_count > 0) {
 		if (time_before(jiffies,
 				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
@@ -655,7 +652,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 	}
 
 	err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, protocol,
-			    ip_tunnel_ecn_encap(tos, inner_iph, skb), ttl, df);
+			    ip_tunnel_ecn_encap(tos, inner_iph, skb), ttl, df,
+			    !net_eq(tunnel->net, dev_net(dev)));
 	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
 
 	return;
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index e82045842351..d6c856b17fd4 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -48,16 +48,15 @@
 
 int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb,
 		  __be32 src, __be32 dst, __u8 proto,
-		  __u8 tos, __u8 ttl, __be16 df)
+		  __u8 tos, __u8 ttl, __be16 df, bool xnet)
 {
 	int pkt_len = skb->len;
 	struct iphdr *iph;
 	int err;
 
-	nf_reset(skb);
-	secpath_reset(skb);
+	skb_scrub_packet(skb, xnet);
+
 	skb->rxhash = 0;
-	skb_dst_drop(skb);
 	skb_dst_set(skb, &rt->dst);
 	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
 
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index f2d0a42f8057..f179ff1f56ec 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -694,6 +694,8 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,
 			tunnel->err_count = 0;
 	}
 
+	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev)));
+
 	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + dst->header_len;
 
 	if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
@@ -710,8 +712,6 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,
 		skb = new_skb;
 	}
 
-	skb_dst_drop(skb);
-
 	if (fl6->flowi6_mark) {
 		skb_dst_set(skb, dst);
 		ndst = NULL;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 72372ac90159..ecbcdbd4bc4f 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1001,8 +1001,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
 		goto tx_err_dst_release;
 	}
 
-	if (!net_eq(t->net, dev_net(dev)))
-		skb_scrub_packet(skb, true);
+	skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev)));
 
 	/*
 	 * Okay, now see if we can stuff it in the buffer as-is.
@@ -1021,7 +1020,6 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
 		consume_skb(skb);
 		skb = new_skb;
 	}
-	skb_dst_drop(skb);
 	if (fl6->flowi6_mark) {
 		skb_dst_set(skb, dst);
 		ndst = NULL;
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index b2e44f478e14..82b425b9b7d2 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -860,9 +860,6 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 			tunnel->err_count = 0;
 	}
 
-	if (!net_eq(tunnel->net, dev_net(dev)))
-		skb_scrub_packet(skb, true);
-
 	/*
 	 * Okay, now see if we can stuff it in the buffer as-is.
 	 */
@@ -889,7 +886,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 	tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6));
 
 	err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, IPPROTO_IPV6, tos,
-			    ttl, df);
+			    ttl, df, !net_eq(tunnel->net, dev_net(dev)));
 	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
 	return NETDEV_TX_OK;
 
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
index 9b3713ef831e..c99dea543d64 100644
--- a/net/openvswitch/vport-gre.c
+++ b/net/openvswitch/vport-gre.c
@@ -179,7 +179,7 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb)
 	return iptunnel_xmit(rt, skb, fl.saddr,
 			     OVS_CB(skb)->tun_key->ipv4_dst, IPPROTO_GRE,
 			     OVS_CB(skb)->tun_key->ipv4_tos,
-			     OVS_CB(skb)->tun_key->ipv4_ttl, df);
+			     OVS_CB(skb)->tun_key->ipv4_ttl, df, false);
 err_free_rt:
 	ip_rt_put(rt);
 error:
-- 
cgit v1.2.3


From ea23192e8e577dfc51e0f4fc5ca113af334edff9 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Mon, 2 Sep 2013 15:34:58 +0200
Subject: tunnels: harmonize cleanup done on skb on rx path

The goal of this patch is to harmonize cleanup done on a skbuff on rx path.
Before this patch, behaviors were different depending of the tunnel type.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst.h     | 12 +++++++-----
 net/ipv4/ip_tunnel.c  |  3 +--
 net/ipv4/ipmr.c       |  3 +--
 net/ipv6/ip6_gre.c    |  5 +----
 net/ipv6/ip6_tunnel.c |  7 +------
 net/ipv6/ip6mr.c      |  3 +--
 net/ipv6/sit.c        |  6 +-----
 7 files changed, 13 insertions(+), 26 deletions(-)

(limited to 'net')

diff --git a/include/net/dst.h b/include/net/dst.h
index 1f8fd109e225..3bc4865f8267 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -311,11 +311,13 @@ static inline void skb_dst_force(struct sk_buff *skb)
  *	__skb_tunnel_rx - prepare skb for rx reinsert
  *	@skb: buffer
  *	@dev: tunnel device
+ *	@net: netns for packet i/o
  *
  *	After decapsulation, packet is going to re-enter (netif_rx()) our stack,
  *	so make some cleanups. (no accounting done)
  */
-static inline void __skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev)
+static inline void __skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev,
+				   struct net *net)
 {
 	skb->dev = dev;
 
@@ -327,8 +329,7 @@ static inline void __skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev)
 	if (!skb->l4_rxhash)
 		skb->rxhash = 0;
 	skb_set_queue_mapping(skb, 0);
-	skb_dst_drop(skb);
-	nf_reset(skb);
+	skb_scrub_packet(skb, !net_eq(net, dev_net(dev)));
 }
 
 /**
@@ -340,12 +341,13 @@ static inline void __skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev)
  *	so make some cleanups, and perform accounting.
  *	Note: this accounting is not SMP safe.
  */
-static inline void skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev)
+static inline void skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev,
+				 struct net *net)
 {
 	/* TODO : stats should be SMP safe */
 	dev->stats.rx_packets++;
 	dev->stats.rx_bytes += skb->len;
-	__skb_tunnel_rx(skb, dev);
+	__skb_tunnel_rx(skb, dev, net);
 }
 
 /* Children define the path of the packet through the
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 88d7d7d1eccb..ac9fabe0300f 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -461,8 +461,7 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
 		skb->dev = tunnel->dev;
 	}
 
-	if (!net_eq(tunnel->net, dev_net(tunnel->dev)))
-		skb_scrub_packet(skb, true);
+	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
 
 	gro_cells_receive(&tunnel->gro_cells, skb);
 	return 0;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index bacc0bcf48ce..9ae54b09254f 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2067,9 +2067,8 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
 	skb_reset_network_header(skb);
 	skb->protocol = htons(ETH_P_IP);
 	skb->ip_summed = CHECKSUM_NONE;
-	skb->pkt_type = PACKET_HOST;
 
-	skb_tunnel_rx(skb, reg_dev);
+	skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev));
 
 	netif_rx(skb);
 
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index f179ff1f56ec..db992a373011 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -509,8 +509,6 @@ static int ip6gre_rcv(struct sk_buff *skb)
 			goto drop;
 		}
 
-		secpath_reset(skb);
-
 		skb->protocol = gre_proto;
 		/* WCCP version 1 and 2 protocol decoding.
 		 * - Change protocol to IP
@@ -525,7 +523,6 @@ static int ip6gre_rcv(struct sk_buff *skb)
 		skb->mac_header = skb->network_header;
 		__pskb_pull(skb, offset);
 		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
-		skb->pkt_type = PACKET_HOST;
 
 		if (((flags&GRE_CSUM) && csum) ||
 		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
@@ -557,7 +554,7 @@ static int ip6gre_rcv(struct sk_buff *skb)
 			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 		}
 
-		__skb_tunnel_rx(skb, tunnel->dev);
+		__skb_tunnel_rx(skb, tunnel->dev, tunnel->net);
 
 		skb_reset_network_header(skb);
 
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index ecbcdbd4bc4f..55999d923f26 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -802,14 +802,12 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol,
 			rcu_read_unlock();
 			goto discard;
 		}
-		secpath_reset(skb);
 		skb->mac_header = skb->network_header;
 		skb_reset_network_header(skb);
 		skb->protocol = htons(protocol);
-		skb->pkt_type = PACKET_HOST;
 		memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
 
-		__skb_tunnel_rx(skb, t->dev);
+		__skb_tunnel_rx(skb, t->dev, t->net);
 
 		err = dscp_ecn_decapsulate(t, ipv6h, skb);
 		if (unlikely(err)) {
@@ -829,9 +827,6 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol,
 		tstats->rx_packets++;
 		tstats->rx_bytes += skb->len;
 
-		if (!net_eq(t->net, dev_net(t->dev)))
-			skb_scrub_packet(skb, true);
-
 		netif_rx(skb);
 
 		rcu_read_unlock();
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index a60a84ef04f7..f365310bfcca 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -672,9 +672,8 @@ static int pim6_rcv(struct sk_buff *skb)
 	skb_reset_network_header(skb);
 	skb->protocol = htons(ETH_P_IPV6);
 	skb->ip_summed = CHECKSUM_NONE;
-	skb->pkt_type = PACKET_HOST;
 
-	skb_tunnel_rx(skb, reg_dev);
+	skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev));
 
 	netif_rx(skb);
 
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 82b425b9b7d2..19abcc9d6a1a 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -581,12 +581,10 @@ static int ipip6_rcv(struct sk_buff *skb)
 		    tunnel->parms.iph.protocol != 0)
 			goto out;
 
-		secpath_reset(skb);
 		skb->mac_header = skb->network_header;
 		skb_reset_network_header(skb);
 		IPCB(skb)->flags = 0;
 		skb->protocol = htons(ETH_P_IPV6);
-		skb->pkt_type = PACKET_HOST;
 
 		if (tunnel->dev->priv_flags & IFF_ISATAP) {
 			if (!isatap_chksrc(skb, iph, tunnel)) {
@@ -603,7 +601,7 @@ static int ipip6_rcv(struct sk_buff *skb)
 			}
 		}
 
-		__skb_tunnel_rx(skb, tunnel->dev);
+		__skb_tunnel_rx(skb, tunnel->dev, tunnel->net);
 
 		err = IP_ECN_decapsulate(iph, skb);
 		if (unlikely(err)) {
@@ -621,8 +619,6 @@ static int ipip6_rcv(struct sk_buff *skb)
 		tstats->rx_packets++;
 		tstats->rx_bytes += skb->len;
 
-		if (!net_eq(tunnel->net, dev_net(tunnel->dev)))
-			skb_scrub_packet(skb, true);
 		netif_rx(skb);
 
 		return 0;
-- 
cgit v1.2.3


From 82476b316084e6826a9dd339d1dad892a598af9a Mon Sep 17 00:00:00 2001
From: Veaceslav Falico <vfalico@redhat.com>
Date: Mon, 2 Sep 2013 16:26:51 +0200
Subject: net: correctly interlink lower/upper devices

Currently we're linking upper devices to lower ones, which results in
upside-down relationship: upper devices seeing lower devices via its upper
lists.

Fix this by correctly linking lower devices to the upper ones.

CC: "David S. Miller" <davem@davemloft.net>
CC: Eric Dumazet <edumazet@google.com>
CC: Jiri Pirko <jiri@resnulli.us>
CC: Alexander Duyck <alexander.h.duyck@intel.com>
CC: Cong Wang <amwang@redhat.com>
Signed-off-by: Veaceslav Falico <vfalico@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 07684e880a5d..5c713f2239cc 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4679,8 +4679,8 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 	 * versa, and don't forget the devices itself. All of these
 	 * links are non-neighbours.
 	 */
-	list_for_each_entry(i, &upper_dev->upper_dev_list, list) {
-		list_for_each_entry(j, &dev->lower_dev_list, list) {
+	list_for_each_entry(i, &dev->lower_dev_list, list) {
+		list_for_each_entry(j, &upper_dev->upper_dev_list, list) {
 			ret = __netdev_adjacent_dev_link(i->dev, j->dev);
 			if (ret)
 				goto rollback_mesh;
-- 
cgit v1.2.3


From 80aa4e10963cc7d9b5dd5b2568ce28d74af20bf9 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Tue, 3 Sep 2013 12:03:40 +0300
Subject: x25: add a sanity check parsing X.25 facilities

This was found with a manual audit and I don't have a reproducer.  We
limit ->calling_len and ->called_len when we get them from
copy_from_user() in x25_ioctl() so when they come from skb->data then
we should cap them there as well.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/x25/x25_facilities.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/x25/x25_facilities.c b/net/x25/x25_facilities.c
index 66c638730c7a..b8253250d723 100644
--- a/net/x25/x25_facilities.c
+++ b/net/x25/x25_facilities.c
@@ -156,6 +156,8 @@ int x25_parse_facilities(struct sk_buff *skb, struct x25_facilities *facilities,
 			case X25_FAC_CALLING_AE:
 				if (p[1] > X25_MAX_DTE_FACIL_LEN || p[1] <= 1)
 					return -1;
+				if (p[2] > X25_MAX_AE_LEN)
+					return -1;
 				dte_facs->calling_len = p[2];
 				memcpy(dte_facs->calling_ae, &p[3], p[1] - 1);
 				*vc_fac_mask |= X25_MASK_CALLING_AE;
@@ -163,6 +165,8 @@ int x25_parse_facilities(struct sk_buff *skb, struct x25_facilities *facilities,
 			case X25_FAC_CALLED_AE:
 				if (p[1] > X25_MAX_DTE_FACIL_LEN || p[1] <= 1)
 					return -1;
+				if (p[2] > X25_MAX_AE_LEN)
+					return -1;
 				dte_facs->called_len = p[2];
 				memcpy(dte_facs->called_ae, &p[3], p[1] - 1);
 				*vc_fac_mask |= X25_MASK_CALLED_AE;
-- 
cgit v1.2.3


From cc8c6c1b21c9b1e0f1e89428c07c7c52435cba0c Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Tue, 3 Sep 2013 18:24:02 +0200
Subject: net: tcp_probe: adapt tbuf size for recent changes

With recent changes in tcp_probe module (e.g. f925d0a62d ("net: tcp_probe:
add IPv6 support")) we also need to take into account that tbuf needs to
be updated as format string will be further expanded. tbuf sits on the stack
in tcpprobe_read() function that is invoked when user space reads procfs
file /proc/net/tcpprobe, hence not fast path as in jtcp_rcv_established().
Having a size similarly as in sctp_probe module of 256 bytes is fully
sufficient for that, we need theoretical maximum of 252 bytes otherwise we
could get truncated.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_probe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 622a4377b397..1f6aa543b64e 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -218,7 +218,7 @@ static ssize_t tcpprobe_read(struct file *file, char __user *buf,
 		return -EINVAL;
 
 	while (cnt < len) {
-		char tbuf[164];
+		char tbuf[256];
 		int width;
 
 		/* Wait for data in buffer */
-- 
cgit v1.2.3


From c995ae2259ee36caf48bbfacf40111998dacd4af Mon Sep 17 00:00:00 2001
From: Vijay Subramanian <subramanian.vijay@gmail.com>
Date: Tue, 3 Sep 2013 12:23:22 -0700
Subject: tcp: Change return value of tcp_rcv_established()

tcp_rcv_established() returns only one value namely 0. We change the return
value to void (as suggested by David Miller).

After commit 0c24604b (tcp: implement RFC 5961 4.2), we no longer send RSTs in
response to SYNs. We can remove the check and processing on the return value of
tcp_rcv_established().

We also fix jtcp_rcv_established() in tcp_probe.c to match that of
tcp_rcv_established().

Signed-off-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h    |  4 ++--
 net/ipv4/tcp_input.c | 13 ++++++-------
 net/ipv4/tcp_ipv4.c  |  5 +----
 net/ipv4/tcp_probe.c |  5 ++---
 net/ipv6/tcp_ipv6.c  |  3 +--
 5 files changed, 12 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6a6a88db462d..b1aa324c5e65 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -371,8 +371,8 @@ extern void tcp_delack_timer_handler(struct sock *sk);
 extern int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg);
 extern int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 				 const struct tcphdr *th, unsigned int len);
-extern int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
-			       const struct tcphdr *th, unsigned int len);
+extern void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
+				const struct tcphdr *th, unsigned int len);
 extern void tcp_rcv_space_adjust(struct sock *sk);
 extern void tcp_cleanup_rbuf(struct sock *sk, int copied);
 extern int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1a84fffe6993..93d7e9de4143 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5049,8 +5049,8 @@ discard:
  *	the rest is checked inline. Fast processing is turned on in
  *	tcp_data_queue when everything is OK.
  */
-int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
-			const struct tcphdr *th, unsigned int len)
+void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
+			 const struct tcphdr *th, unsigned int len)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -5127,7 +5127,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 				tcp_ack(sk, skb, 0);
 				__kfree_skb(skb);
 				tcp_data_snd_check(sk);
-				return 0;
+				return;
 			} else { /* Header too small */
 				TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
 				goto discard;
@@ -5220,7 +5220,7 @@ no_ack:
 			if (eaten)
 				kfree_skb_partial(skb, fragstolen);
 			sk->sk_data_ready(sk, 0);
-			return 0;
+			return;
 		}
 	}
 
@@ -5236,7 +5236,7 @@ slow_path:
 	 */
 
 	if (!tcp_validate_incoming(sk, skb, th, 1))
-		return 0;
+		return;
 
 step5:
 	if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
@@ -5252,7 +5252,7 @@ step5:
 
 	tcp_data_snd_check(sk);
 	tcp_ack_snd_check(sk);
-	return 0;
+	return;
 
 csum_error:
 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
@@ -5260,7 +5260,6 @@ csum_error:
 
 discard:
 	__kfree_skb(skb);
-	return 0;
 }
 EXPORT_SYMBOL(tcp_rcv_established);
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 09d45d718973..b14266bb91eb 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1799,10 +1799,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 				sk->sk_rx_dst = NULL;
 			}
 		}
-		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
-			rsk = sk;
-			goto reset;
-		}
+		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
 		return 0;
 	}
 
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 1f6aa543b64e..611beab38a00 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -122,8 +122,8 @@ static inline int tcp_probe_avail(void)
  * Hook inserted to be called before each receive packet.
  * Note: arguments must match tcp_rcv_established()!
  */
-static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
-				const struct tcphdr *th, unsigned int len)
+static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
+				 const struct tcphdr *th, unsigned int len)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_sock *inet = inet_sk(sk);
@@ -172,7 +172,6 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 	}
 
 	jprobe_return();
-	return 0;
 }
 
 static struct jprobe tcp_jprobe = {
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 5bcfadf09e95..9acdcedf9a14 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1360,8 +1360,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 			}
 		}
 
-		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len))
-			goto reset;
+		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
 		if (opt_skb)
 			goto ipv6_pktoptions;
 		return 0;
-- 
cgit v1.2.3


From 775ada6d9f4c9dc440f5aeca00354eb87f6e0696 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 28 Aug 2013 15:14:38 +0200
Subject: netfilter: more strict TCP flag matching in SYNPROXY

Its seems Patrick missed to incoorporate some of my requested changes
during review v2 of SYNPROXY netfilter module.

Which were, to avoid SYN+ACK packets to enter the path, meant for the
ACK packet from the client (from the 3WHS).

Further there were a bug in ip6t_SYNPROXY.c, for matching SYN packets
that didn't exclude the ACK flag.

Go a step further with SYN packet/flag matching by excluding flags
ACK+FIN+RST, in both IPv4 and IPv6 modules.

The intented usage of SYNPROXY is as follows:
(gracefully describing usage in commit)

 iptables -t raw -A PREROUTING -i eth0 -p tcp --dport 80 --syn -j NOTRACK
 iptables -A INPUT -i eth0 -p tcp --dport 80 -m state UNTRACKED,INVALID \
         -j SYNPROXY --sack-perm --timestamp --mss 1480 --wscale 7 --ecn

 echo 0 > /proc/sys/net/netfilter/nf_conntrack_tcp_loose

This does filter SYN flags early, for packets in the UNTRACKED state,
but packets in the INVALID state with other TCP flags could still
reach the module, thus this stricter flag matching is still needed.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/ipt_SYNPROXY.c  | 4 ++--
 net/ipv6/netfilter/ip6t_SYNPROXY.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index 94371db6aecc..90e489eb1c0a 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -269,7 +269,7 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 
 	synproxy_parse_options(skb, par->thoff, th, &opts);
 
-	if (th->syn && !th->ack) {
+	if (th->syn && !(th->ack || th->fin || th->rst)) {
 		/* Initial SYN from client */
 		this_cpu_inc(snet->stats->syn_received);
 
@@ -285,7 +285,7 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 					  XT_SYNPROXY_OPT_ECN);
 
 		synproxy_send_client_synack(skb, th, &opts);
-	} else if (th->ack && !(th->fin || th->rst))
+	} else if (th->ack && !(th->fin || th->rst || th->syn))
 		/* ACK from client */
 		synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq));
 
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index 4270a9b145e5..a5af0bfef126 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -284,7 +284,7 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 
 	synproxy_parse_options(skb, par->thoff, th, &opts);
 
-	if (th->syn) {
+	if (th->syn && !(th->ack || th->fin || th->rst)) {
 		/* Initial SYN from client */
 		this_cpu_inc(snet->stats->syn_received);
 
@@ -300,7 +300,7 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 					  XT_SYNPROXY_OPT_ECN);
 
 		synproxy_send_client_synack(skb, th, &opts);
-	} else if (th->ack && !(th->fin || th->rst))
+	} else if (th->ack && !(th->fin || th->rst || th->syn))
 		/* ACK from client */
 		synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq));
 
-- 
cgit v1.2.3


From f4de4c89d89df5ead42de9fea895f5b8155270da Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Thu, 29 Aug 2013 10:32:09 +0200
Subject: netfilter: synproxy_core: fix warning in __nf_ct_ext_add_length()

With CONFIG_NETFILTER_DEBUG we get the following warning during SYNPROXY init:

[   80.558906] WARNING: CPU: 1 PID: 4833 at net/netfilter/nf_conntrack_extend.c:80 __nf_ct_ext_add_length+0x217/0x220 [nf_conntrack]()

The reason is that the conntrack template is set to confirmed before adding
the extension and it is invalid to add extensions to already confirmed
conntracks. Fix by adding the extensions before setting the conntrack to
confirmed.

Reported-by: Jesper Dangaard Brouer <jesper.brouer@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_synproxy_core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index d23dc791aca7..6fd967c6278c 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -356,12 +356,12 @@ static int __net_init synproxy_net_init(struct net *net)
 		goto err1;
 	}
 
-	__set_bit(IPS_TEMPLATE_BIT, &ct->status);
-	__set_bit(IPS_CONFIRMED_BIT, &ct->status);
 	if (!nfct_seqadj_ext_add(ct))
 		goto err2;
 	if (!nfct_synproxy_ext_add(ct))
 		goto err2;
+	__set_bit(IPS_TEMPLATE_BIT, &ct->status);
+	__set_bit(IPS_CONFIRMED_BIT, &ct->status);
 
 	snet->tmpl = ct;
 
-- 
cgit v1.2.3


From 7cc9eb6ef78d0dcb97d543ea19966486e98afa0b Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 29 Aug 2013 12:18:46 +0200
Subject: netfilter: SYNPROXY: let unrelated packets continue

Packets reaching SYNPROXY were default dropped, as they were most
likely invalid (given the recommended state matching).  This
patch, changes SYNPROXY target to let packets, not consumed,
continue being processed by the stack.

This will be more in line other target modules. As it will allow
more flexible configurations of handling, logging or matching on
packets in INVALID states.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/ipt_SYNPROXY.c  | 8 ++++++--
 net/ipv6/netfilter/ip6t_SYNPROXY.c | 8 ++++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index 90e489eb1c0a..67e17dcda65e 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -285,11 +285,15 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 					  XT_SYNPROXY_OPT_ECN);
 
 		synproxy_send_client_synack(skb, th, &opts);
-	} else if (th->ack && !(th->fin || th->rst || th->syn))
+		return NF_DROP;
+
+	} else if (th->ack && !(th->fin || th->rst || th->syn)) {
 		/* ACK from client */
 		synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq));
+		return NF_DROP;
+	}
 
-	return NF_DROP;
+	return XT_CONTINUE;
 }
 
 static unsigned int ipv4_synproxy_hook(unsigned int hooknum,
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index a5af0bfef126..19cfea8dbcaa 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -300,11 +300,15 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 					  XT_SYNPROXY_OPT_ECN);
 
 		synproxy_send_client_synack(skb, th, &opts);
-	} else if (th->ack && !(th->fin || th->rst || th->syn))
+		return NF_DROP;
+
+	} else if (th->ack && !(th->fin || th->rst || th->syn)) {
 		/* ACK from client */
 		synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq));
+		return NF_DROP;
+	}
 
-	return NF_DROP;
+	return XT_CONTINUE;
 }
 
 static unsigned int ipv6_synproxy_hook(unsigned int hooknum,
-- 
cgit v1.2.3


From 1205e1fa615805c9efa97303b552cf445965752a Mon Sep 17 00:00:00 2001
From: Phil Oester <kernel@linuxace.com>
Date: Sun, 1 Sep 2013 08:32:21 -0700
Subject: netfilter: xt_TCPMSS: correct return value in tcpmss_mangle_packet

In commit b396966c4 (netfilter: xt_TCPMSS: Fix missing fragmentation handling),
I attempted to add safe fragment handling to xt_TCPMSS.  However, Andy Padavan
of Project N56U correctly points out that returning XT_CONTINUE in this
function does not work.  The callers (tcpmss_tg[46]) expect to receive a value
of 0 in order to return XT_CONTINUE.

Signed-off-by: Phil Oester <kernel@linuxace.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_TCPMSS.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index 6113cc7efffc..cd24290f3b2f 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -60,7 +60,7 @@ tcpmss_mangle_packet(struct sk_buff *skb,
 
 	/* This is a fragment, no TCP header is available */
 	if (par->fragoff != 0)
-		return XT_CONTINUE;
+		return 0;
 
 	if (!skb_make_writable(skb, skb->len))
 		return -1;
-- 
cgit v1.2.3


From c08751c851b78514f6ec5f77f7cbebaac63d15c0 Mon Sep 17 00:00:00 2001
From: Alexander Sverdlin <alexander.sverdlin@nsn.com>
Date: Mon, 2 Sep 2013 15:58:25 +0200
Subject: net: sctp: Fix data chunk fragmentation for MTU values which are not
 multiple of 4

net: sctp: Fix data chunk fragmentation for MTU values which are not multiple of 4

Initially the problem was observed with ipsec, but later it became clear that
SCTP data chunk fragmentation algorithm has problems with MTU values which are
not multiple of 4. Test program was used which just transmits 2000 bytes long
packets to other host. tcpdump was used to observe re-fragmentation in IP layer
after SCTP already fragmented data chunks.

With MTU 1500:
12:54:34.082904 IP (tos 0x2,ECT(0), ttl 64, id 0, offset 0, flags [DF], proto SCTP (132), length 1500)
    10.151.38.153.39303 > 10.151.24.91.54321: sctp (1) [DATA] (B) [TSN: 2366088589] [SID: 0] [SSEQ 1] [PPID 0x0]
12:54:34.082933 IP (tos 0x2,ECT(0), ttl 64, id 0, offset 0, flags [DF], proto SCTP (132), length 596)
    10.151.38.153.39303 > 10.151.24.91.54321: sctp (1) [DATA] (E) [TSN: 2366088590] [SID: 0] [SSEQ 1] [PPID 0x0]
12:54:34.090576 IP (tos 0x2,ECT(0), ttl 63, id 0, offset 0, flags [DF], proto SCTP (132), length 48)
    10.151.24.91.54321 > 10.151.38.153.39303: sctp (1) [SACK] [cum ack 2366088590] [a_rwnd 79920] [#gap acks 0] [#dup tsns 0]

With MTU 1499:
13:02:49.955220 IP (tos 0x2,ECT(0), ttl 64, id 48215, offset 0, flags [+], proto SCTP (132), length 1492)
    10.151.38.153.39084 > 10.151.24.91.54321: sctp[|sctp]
13:02:49.955249 IP (tos 0x2,ECT(0), ttl 64, id 48215, offset 1472, flags [none], proto SCTP (132), length 28)
    10.151.38.153 > 10.151.24.91: ip-proto-132
13:02:49.955262 IP (tos 0x2,ECT(0), ttl 64, id 0, offset 0, flags [DF], proto SCTP (132), length 600)
    10.151.38.153.39084 > 10.151.24.91.54321: sctp (1) [DATA] (E) [TSN: 404355346] [SID: 0] [SSEQ 1] [PPID 0x0]
13:02:49.956770 IP (tos 0x2,ECT(0), ttl 63, id 0, offset 0, flags [DF], proto SCTP (132), length 48)
    10.151.24.91.54321 > 10.151.38.153.39084: sctp (1) [SACK] [cum ack 404355346] [a_rwnd 79920] [#gap acks 0] [#dup tsns 0]

Here problem in data portion limit calculation leads to re-fragmentation in IP,
which is sub-optimal. The problem is max_data initial value, which doesn't take
into account the fact, that data chunk must be padded to 4-bytes boundary.
It's enough to correct max_data, because all later adjustments are correctly
aligned to 4-bytes boundary.

After the fix is applied, everything is fragmented correctly for uneven MTUs:
15:16:27.083881 IP (tos 0x2,ECT(0), ttl 64, id 0, offset 0, flags [DF], proto SCTP (132), length 1496)
    10.151.38.153.53417 > 10.151.24.91.54321: sctp (1) [DATA] (B) [TSN: 3077098183] [SID: 0] [SSEQ 1] [PPID 0x0]
15:16:27.083907 IP (tos 0x2,ECT(0), ttl 64, id 0, offset 0, flags [DF], proto SCTP (132), length 600)
    10.151.38.153.53417 > 10.151.24.91.54321: sctp (1) [DATA] (E) [TSN: 3077098184] [SID: 0] [SSEQ 1] [PPID 0x0]
15:16:27.085640 IP (tos 0x2,ECT(0), ttl 63, id 0, offset 0, flags [DF], proto SCTP (132), length 48)
    10.151.24.91.54321 > 10.151.38.153.53417: sctp (1) [SACK] [cum ack 3077098184] [a_rwnd 79920] [#gap acks 0] [#dup tsns 0]

The bug was there for years already, but
 - is a performance issue, the packets are still transmitted
 - doesn't show up with default MTU 1500, but possibly with ipsec (MTU 1438)

Signed-off-by: Alexander Sverdlin <alexander.sverdlin@nsn.com>
Acked-by: Vlad Yasevich <vyasevich@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/chunk.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index bd0bdd0ba8f1..7bd5ed4a8657 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -195,9 +195,9 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 	/* This is the biggest possible DATA chunk that can fit into
 	 * the packet
 	 */
-	max_data = asoc->pathmtu -
+	max_data = (asoc->pathmtu -
 		sctp_sk(asoc->base.sk)->pf->af->net_header_len -
-		sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk);
+		sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk)) & ~3;
 
 	max = asoc->frag_point;
 	/* If the the peer requested that we authenticate DATA chunks
-- 
cgit v1.2.3


From 639739b5e609a5074839bb22fc061b37baa06269 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Tue, 3 Sep 2013 02:13:31 +0200
Subject: ipv6: fix null pointer dereference in __ip6addrlbl_add

Commit b67bfe0d42cac56c512dd5da4b1b347a23f4b70a ("hlist: drop
the node parameter from iterators") changed the behavior of
hlist_for_each_entry_safe to leave the p argument NULL.

Fix this up by tracking the last argument.

Reported-by: Michele Baldessari <michele@acksyn.org>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Tested-by: Michele Baldessari <michele@acksyn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrlabel.c | 48 +++++++++++++++++++++++-------------------------
 1 file changed, 23 insertions(+), 25 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index f083a583a05c..b30ad3741b46 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -251,38 +251,36 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net,
 /* add a label */
 static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace)
 {
+	struct hlist_node *n;
+	struct ip6addrlbl_entry *last = NULL, *p = NULL;
 	int ret = 0;
 
-	ADDRLABEL(KERN_DEBUG "%s(newp=%p, replace=%d)\n",
-			__func__,
-			newp, replace);
+	ADDRLABEL(KERN_DEBUG "%s(newp=%p, replace=%d)\n", __func__, newp,
+		  replace);
 
-	if (hlist_empty(&ip6addrlbl_table.head)) {
-		hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head);
-	} else {
-		struct hlist_node *n;
-		struct ip6addrlbl_entry *p = NULL;
-		hlist_for_each_entry_safe(p, n,
-					  &ip6addrlbl_table.head, list) {
-			if (p->prefixlen == newp->prefixlen &&
-			    net_eq(ip6addrlbl_net(p), ip6addrlbl_net(newp)) &&
-			    p->ifindex == newp->ifindex &&
-			    ipv6_addr_equal(&p->prefix, &newp->prefix)) {
-				if (!replace) {
-					ret = -EEXIST;
-					goto out;
-				}
-				hlist_replace_rcu(&p->list, &newp->list);
-				ip6addrlbl_put(p);
-				goto out;
-			} else if ((p->prefixlen == newp->prefixlen && !p->ifindex) ||
-				   (p->prefixlen < newp->prefixlen)) {
-				hlist_add_before_rcu(&newp->list, &p->list);
+	hlist_for_each_entry_safe(p, n,	&ip6addrlbl_table.head, list) {
+		if (p->prefixlen == newp->prefixlen &&
+		    net_eq(ip6addrlbl_net(p), ip6addrlbl_net(newp)) &&
+		    p->ifindex == newp->ifindex &&
+		    ipv6_addr_equal(&p->prefix, &newp->prefix)) {
+			if (!replace) {
+				ret = -EEXIST;
 				goto out;
 			}
+			hlist_replace_rcu(&p->list, &newp->list);
+			ip6addrlbl_put(p);
+			goto out;
+		} else if ((p->prefixlen == newp->prefixlen && !p->ifindex) ||
+			   (p->prefixlen < newp->prefixlen)) {
+			hlist_add_before_rcu(&newp->list, &p->list);
+			goto out;
 		}
-		hlist_add_after_rcu(&p->list, &newp->list);
+		last = p;
 	}
+	if (last)
+		hlist_add_after_rcu(&last->list, &newp->list);
+	else
+		hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head);
 out:
 	if (!ret)
 		ip6addrlbl_table.seq++;
-- 
cgit v1.2.3


From 25a6e6b84fba601eff7c28d30da8ad7cfbef0d43 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 3 Sep 2013 13:37:01 +0200
Subject: ipv6: Don't depend on per socket memory for neighbour discovery
 messages

Allocating skbs when sending out neighbour discovery messages
currently uses sock_alloc_send_skb() based on a per net namespace
socket and thus share a socket wmem buffer space.

If a netdevice is temporarily unable to transmit due to carrier
loss or for other reasons, the queued up ndisc messages will cosnume
all of the wmem space and will thus prevent from any more skbs to
be allocated even for netdevices that are able to transmit packets.

The number of neighbour discovery messages sent is very limited,
use of alloc_skb() bypasses the socket wmem buffer size enforcement
while the manual call to skb_set_owner_w() maintains the socket
reference needed for the IPv6 output path.

This patch has orginally been posted by Eric Dumazet in a modified
form.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Stephen Warren <swarren@wwwdotorg.org>
Cc: Fabio Estevam <festevam@gmail.com>
Tested-by: Fabio Estevam <fabio.estevam@freescale.com>
Tested-by: Stephen Warren <swarren@nvidia.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ndisc.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 04d31c2fbef1..78141fa46cf8 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -372,14 +372,11 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev,
 	int tlen = dev->needed_tailroom;
 	struct sock *sk = dev_net(dev)->ipv6.ndisc_sk;
 	struct sk_buff *skb;
-	int err;
 
-	skb = sock_alloc_send_skb(sk,
-				  hlen + sizeof(struct ipv6hdr) + len + tlen,
-				  1, &err);
+	skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + len + tlen, GFP_ATOMIC);
 	if (!skb) {
-		ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb, err=%d\n",
-			  __func__, err);
+		ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb\n",
+			  __func__);
 		return NULL;
 	}
 
@@ -389,6 +386,11 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev,
 	skb_reserve(skb, hlen + sizeof(struct ipv6hdr));
 	skb_reset_transport_header(skb);
 
+	/* Manually assign socket ownership as we avoid calling
+	 * sock_alloc_send_pskb() to bypass wmem buffer limits
+	 */
+	skb_set_owner_w(skb, sk);
+
 	return skb;
 }
 
-- 
cgit v1.2.3


From 52f20e655d9f6f7f937a1cdacf219d9df3ab6166 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Tue, 3 Sep 2013 14:14:35 -0700
Subject: tcp: better comments for RTO initiallization

Commit 1b7fdd2ab585("tcp: do not use cached RTT for RTT estimation")
removes important comments on how RTO is initialized and updated.
Hopefully this patch puts those information back.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_metrics.c | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 273ed735cca2..4a22f3e715df 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -481,13 +481,27 @@ void tcp_init_metrics(struct sock *sk)
 	crtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
 	rcu_read_unlock();
 reset:
+	/* The initial RTT measurement from the SYN/SYN-ACK is not ideal
+	 * to seed the RTO for later data packets because SYN packets are
+	 * small. Use the per-dst cached values to seed the RTO but keep
+	 * the RTT estimator variables intact (e.g., srtt, mdev, rttvar).
+	 * Later the RTO will be updated immediately upon obtaining the first
+	 * data RTT sample (tcp_rtt_estimator()). Hence the cached RTT only
+	 * influences the first RTO but not later RTT estimation.
+	 *
+	 * But if RTT is not available from the SYN (due to retransmits or
+	 * syn cookies) or the cache, force a conservative 3secs timeout.
+	 *
+	 * A bit of theory. RTT is time passed after "normal" sized packet
+	 * is sent until it is ACKed. In normal circumstances sending small
+	 * packets force peer to delay ACKs and calculation is correct too.
+	 * The algorithm is adaptive and, provided we follow specs, it
+	 * NEVER underestimate RTT. BUT! If peer tries to make some clever
+	 * tricks sort of "quick acks" for time long enough to decrease RTT
+	 * to low value, and then abruptly stops to do it and starts to delay
+	 * ACKs, wait for troubles.
+	 */
 	if (crtt > tp->srtt) {
-		/* Initial RTT (tp->srtt) from SYN usually don't measure
-		 * serialization delay on low BW links well so RTO may be
-		 * under-estimated. Stay conservative and seed RTO with
-		 * the RTTs from past data exchanges, using the same seeding
-		 * formula in tcp_rtt_estimator().
-		 */
 		inet_csk(sk)->icsk_rto = crtt + max(crtt >> 2, tcp_rto_min(sk));
 	} else if (tp->srtt == 0) {
 		/* RFC6298: 5.7 We've failed to get a valid RTT sample from
-- 
cgit v1.2.3


From 3a1c756590633c0e86df606e5c618c190926a0df Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Tue, 3 Sep 2013 19:29:12 +0200
Subject: net: ipv6: tcp: fix potential use after free in tcp_v6_do_rcv

In tcp_v6_do_rcv() code, when processing pkt options, we soley work
on our skb clone opt_skb that we've created earlier before entering
tcp_rcv_established() on our way. However, only in condition ...

  if (np->rxopt.bits.rxtclass)
    np->rcv_tclass = ipv6_get_dsfield(ipv6_hdr(skb));

... we work on skb itself. As we extract every other information out
of opt_skb in ipv6_pktoptions path, this seems wrong, since skb can
already be released by tcp_rcv_established() earlier on. When we try
to access it in ipv6_hdr(), we will dereference freed skb.

[ Bug added by commit 4c507d2897bd9b ("net: implement IP_RECVTOS for
  IP_PKTOPTIONS") ]

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/tcp_ipv6.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 6e1649d58533..eeb4cb0c57c1 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1427,7 +1427,7 @@ ipv6_pktoptions:
 		if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim)
 			np->mcast_hops = ipv6_hdr(opt_skb)->hop_limit;
 		if (np->rxopt.bits.rxtclass)
-			np->rcv_tclass = ipv6_get_dsfield(ipv6_hdr(skb));
+			np->rcv_tclass = ipv6_get_dsfield(ipv6_hdr(opt_skb));
 		if (ipv6_opt_accepted(sk, opt_skb)) {
 			skb_set_owner_r(opt_skb, sk);
 			opt_skb = xchg(&np->pktoptions, opt_skb);
-- 
cgit v1.2.3


From 89225d1ce6af3916bf32aecbe9d83f571a098588 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Wed, 4 Sep 2013 00:19:37 +0200
Subject: net: ipv6: mld: fix v1/v2 switchback timeout to rfc3810, 9.12.

i) RFC3810, 9.2. Query Interval [QI] says:

   The Query Interval variable denotes the interval between General
   Queries sent by the Querier. Default value: 125 seconds. [...]

ii) RFC3810, 9.3. Query Response Interval [QRI] says:

  The Maximum Response Delay used to calculate the Maximum Response
  Code inserted into the periodic General Queries. Default value:
  10000 (10 seconds) [...] The number of seconds represented by the
  [Query Response Interval] must be less than the [Query Interval].

iii) RFC3810, 9.12. Older Version Querier Present Timeout [OVQPT] says:

  The Older Version Querier Present Timeout is the time-out for
  transitioning a host back to MLDv2 Host Compatibility Mode. When an
  MLDv1 query is received, MLDv2 hosts set their Older Version Querier
  Present Timer to [Older Version Querier Present Timeout].

  This value MUST be ([Robustness Variable] times (the [Query Interval]
  in the last Query received)) plus ([Query Response Interval]).

Hence, on *default* the timeout results in:

  [RV] = 2, [QI] = 125sec, [QRI] = 10sec
  [OVQPT] = [RV] * [QI] + [QRI] = 260sec

Having that said, we currently calculate [OVQPT] (here given as 'switchback'
variable) as ...

  switchback = (idev->mc_qrv + 1) * max_delay

RFC3810, 9.12. says "the [Query Interval] in the last Query received". In
section "9.14. Configuring timers", it is said:

  This section is meant to provide advice to network administrators on
  how to tune these settings to their network. Ambitious router
  implementations might tune these settings dynamically based upon
  changing characteristics of the network. [...]

iv) RFC38010, 9.14.2. Query Interval:

  The overall level of periodic MLD traffic is inversely proportional
  to the Query Interval. A longer Query Interval results in a lower
  overall level of MLD traffic. The value of the Query Interval MUST
  be equal to or greater than the Maximum Response Delay used to
  calculate the Maximum Response Code inserted in General Query
  messages.

I assume that was why switchback is calculated as is (3 * max_delay), although
this setting seems to be meant for routers only to configure their [QI]
interval for non-default intervals. So usage here like this is clearly wrong.

Concluding, the current behaviour in IPv6's multicast code is not conform
to the RFC as switch back is calculated wrongly. That is, it has a too small
value, so MLDv2 hosts switch back again to MLDv2 way too early, i.e. ~30secs
instead of ~260secs on default.

Hence, introduce necessary helper functions and fix this up properly as it
should be.

Introduced in 06da92283 ("[IPV6]: Add MLDv2 support."). Credits to Hannes
Frederic Sowa who also had a hand in this as well. Also thanks to Hangbin Liu
who did initial testing.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Cc: David Stevens <dlstevens@us.ibm.com>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/if_inet6.h |   9 +++-
 include/net/mld.h      |  27 +++++++++++-
 net/ipv6/mcast.c       | 116 ++++++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 143 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index 736b5fb95474..02ef7727bb55 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -171,12 +171,17 @@ struct inet6_dev {
 	struct ifmcaddr6	*mc_list;
 	struct ifmcaddr6	*mc_tomb;
 	spinlock_t		mc_lock;
-	unsigned char		mc_qrv;
+
+	unsigned char		mc_qrv;		/* Query Robustness Variable */
 	unsigned char		mc_gq_running;
 	unsigned char		mc_ifc_count;
 	unsigned char		mc_dad_count;
-	unsigned long		mc_v1_seen;
+
+	unsigned long		mc_v1_seen;	/* Max time we stay in MLDv1 mode */
+	unsigned long		mc_qi;		/* Query Interval */
+	unsigned long		mc_qri;		/* Query Response Interval */
 	unsigned long		mc_maxdelay;
+
 	struct timer_list	mc_gq_timer;	/* general query timer */
 	struct timer_list	mc_ifc_timer;	/* interface change timer */
 	struct timer_list	mc_dad_timer;	/* dad complete mc timer */
diff --git a/include/net/mld.h b/include/net/mld.h
index 467143cd4e2f..2b5421f6a7c2 100644
--- a/include/net/mld.h
+++ b/include/net/mld.h
@@ -63,7 +63,7 @@ struct mld2_query {
 #define mld2q_mrc		mld2q_hdr.icmp6_maxdelay
 #define mld2q_resv1		mld2q_hdr.icmp6_dataun.un_data16[1]
 
-/* Max Response Code */
+/* Max Response Code, TODO: transform this to use the below */
 #define MLDV2_MASK(value, nb) ((nb)>=32 ? (value) : ((1<<(nb))-1) & (value))
 #define MLDV2_EXP(thresh, nbmant, nbexp, value) \
 	((value) < (thresh) ? (value) : \
@@ -72,4 +72,29 @@ struct mld2_query {
 
 #define MLDV2_MRC(value) MLDV2_EXP(0x8000, 12, 3, value)
 
+/* RFC3810, 5.1.3. Maximum Response Code:
+ *
+ * If Maximum Response Code >= 32768, Maximum Response Code represents a
+ * floating-point value as follows:
+ *
+ *  0 1 2 3 4 5 6 7 8 9 A B C D E F
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |1| exp |          mant         |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+#define MLDV2_MRC_EXP(value)	(((value) >> 12) & 0x0007)
+#define MLDV2_MRC_MAN(value)	((value) & 0x0fff)
+
+/* RFC3810, 5.1.9. QQIC (Querier's Query Interval Code):
+ *
+ * If QQIC >= 128, QQIC represents a floating-point value as follows:
+ *
+ *  0 1 2 3 4 5 6 7
+ * +-+-+-+-+-+-+-+-+
+ * |1| exp | mant  |
+ * +-+-+-+-+-+-+-+-+
+ */
+#define MLDV2_QQIC_EXP(value)	(((value) >> 4) & 0x07)
+#define MLDV2_QQIC_MAN(value)	((value) & 0x0f)
+
 #endif
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 98ead2b1a669..8992ff262789 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -108,6 +108,10 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml,
 			    struct inet6_dev *idev);
 
 #define MLD_QRV_DEFAULT		2
+/* RFC3810, 9.2. Query Interval */
+#define MLD_QI_DEFAULT		(125 * HZ)
+/* RFC3810, 9.3. Query Response Interval */
+#define MLD_QRI_DEFAULT		(10 * HZ)
 
 /* RFC3810, 8.1 Query Version Distinctions */
 #define MLD_V1_QUERY_LEN	24
@@ -1112,6 +1116,93 @@ static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs,
 	return true;
 }
 
+static void mld_set_v1_mode(struct inet6_dev *idev)
+{
+	/* RFC3810, relevant sections:
+	 *  - 9.1. Robustness Variable
+	 *  - 9.2. Query Interval
+	 *  - 9.3. Query Response Interval
+	 *  - 9.12. Older Version Querier Present Timeout
+	 */
+	unsigned long switchback;
+
+	switchback = (idev->mc_qrv * idev->mc_qi) + idev->mc_qri;
+
+	idev->mc_v1_seen = jiffies + switchback;
+}
+
+static void mld_update_qrv(struct inet6_dev *idev,
+			   const struct mld2_query *mlh2)
+{
+	/* RFC3810, relevant sections:
+	 *  - 5.1.8. QRV (Querier's Robustness Variable)
+	 *  - 9.1. Robustness Variable
+	 */
+
+	/* The value of the Robustness Variable MUST NOT be zero,
+	 * and SHOULD NOT be one. Catch this here if we ever run
+	 * into such a case in future.
+	 */
+	WARN_ON(idev->mc_qrv == 0);
+
+	if (mlh2->mld2q_qrv > 0)
+		idev->mc_qrv = mlh2->mld2q_qrv;
+
+	if (unlikely(idev->mc_qrv < 2)) {
+		net_warn_ratelimited("IPv6: MLD: clamping QRV from %u to %u!\n",
+				     idev->mc_qrv, MLD_QRV_DEFAULT);
+		idev->mc_qrv = MLD_QRV_DEFAULT;
+	}
+}
+
+static void mld_update_qi(struct inet6_dev *idev,
+			  const struct mld2_query *mlh2)
+{
+	/* RFC3810, relevant sections:
+	 *  - 5.1.9. QQIC (Querier's Query Interval Code)
+	 *  - 9.2. Query Interval
+	 *  - 9.12. Older Version Querier Present Timeout
+	 *    (the [Query Interval] in the last Query received)
+	 */
+	unsigned long mc_qqi;
+
+	if (mlh2->mld2q_qqic < 128) {
+		mc_qqi = mlh2->mld2q_qqic;
+	} else {
+		unsigned long mc_man, mc_exp;
+
+		mc_exp = MLDV2_QQIC_EXP(mlh2->mld2q_qqic);
+		mc_man = MLDV2_QQIC_MAN(mlh2->mld2q_qqic);
+
+		mc_qqi = (mc_man | 0x10) << (mc_exp + 3);
+	}
+
+	idev->mc_qi = mc_qqi * HZ;
+}
+
+static void mld_update_qri(struct inet6_dev *idev,
+			   const struct mld2_query *mlh2)
+{
+	/* RFC3810, relevant sections:
+	 *  - 5.1.3. Maximum Response Code
+	 *  - 9.3. Query Response Interval
+	 */
+	unsigned long mc_qri, mc_mrc = ntohs(mlh2->mld2q_mrc);
+
+	if (mc_mrc < 32768) {
+		mc_qri = mc_mrc;
+	} else {
+		unsigned long mc_man, mc_exp;
+
+		mc_exp = MLDV2_MRC_EXP(mc_mrc);
+		mc_man = MLDV2_MRC_MAN(mc_mrc);
+
+		mc_qri = (mc_man | 0x1000) << (mc_exp + 3);
+	}
+
+	idev->mc_qri = msecs_to_jiffies(mc_qri);
+}
+
 /* called with rcu_read_lock() */
 int igmp6_event_query(struct sk_buff *skb)
 {
@@ -1150,12 +1241,15 @@ int igmp6_event_query(struct sk_buff *skb)
 		return -EINVAL;
 
 	if (len == MLD_V1_QUERY_LEN) {
-		int switchback;
 		/* MLDv1 router present */
-
 		max_delay = msecs_to_jiffies(ntohs(mld->mld_maxdelay));
-		switchback = (idev->mc_qrv + 1) * max_delay;
-		idev->mc_v1_seen = jiffies + switchback;
+
+		mld_set_v1_mode(idev);
+
+		/* cancel MLDv2 report timer */
+		idev->mc_gq_running = 0;
+		if (del_timer(&idev->mc_gq_timer))
+			__in6_dev_put(idev);
 
 		/* cancel the interface change timer */
 		idev->mc_ifc_count = 0;
@@ -1166,6 +1260,10 @@ int igmp6_event_query(struct sk_buff *skb)
 	} else if (len >= MLD_V2_QUERY_LEN_MIN) {
 		int srcs_offset = sizeof(struct mld2_query) -
 				  sizeof(struct icmp6hdr);
+
+		/* hosts need to stay in MLDv1 mode, discard MLDv2 queries */
+		if (MLD_V1_SEEN(idev))
+			return 0;
 		if (!pskb_may_pull(skb, srcs_offset))
 			return -EINVAL;
 
@@ -1175,8 +1273,10 @@ int igmp6_event_query(struct sk_buff *skb)
 
 		idev->mc_maxdelay = max_delay;
 
-		if (mlh2->mld2q_qrv)
-			idev->mc_qrv = mlh2->mld2q_qrv;
+		mld_update_qrv(idev, mlh2);
+		mld_update_qi(idev, mlh2);
+		mld_update_qri(idev, mlh2);
+
 		if (group_type == IPV6_ADDR_ANY) { /* general query */
 			if (mlh2->mld2q_nsrcs)
 				return -EINVAL; /* no sources allowed */
@@ -2337,7 +2437,11 @@ void ipv6_mc_init_dev(struct inet6_dev *idev)
 			(unsigned long)idev);
 	setup_timer(&idev->mc_dad_timer, mld_dad_timer_expire,
 		    (unsigned long)idev);
+
 	idev->mc_qrv = MLD_QRV_DEFAULT;
+	idev->mc_qi = MLD_QI_DEFAULT;
+	idev->mc_qri = MLD_QRI_DEFAULT;
+
 	idev->mc_maxdelay = unsolicited_report_interval(idev);
 	idev->mc_v1_seen = 0;
 	write_unlock_bh(&idev->lock);
-- 
cgit v1.2.3


From 6c567b78c8a7da26e5e6f5bd458882ad9967233a Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Wed, 4 Sep 2013 00:19:38 +0200
Subject: net: ipv6: mld: clean up MLD_V1_SEEN macro

Replace the macro with a function to make it more readable. GCC will
eventually decide whether to inline this or not (also, that's not
fast-path anyway).

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/mcast.c | 34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 8992ff262789..f86e26b7296c 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -95,6 +95,7 @@ static void mld_ifc_event(struct inet6_dev *idev);
 static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc);
 static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *addr);
 static void mld_clear_delrec(struct inet6_dev *idev);
+static bool mld_in_v1_mode(const struct inet6_dev *idev);
 static int sf_setstate(struct ifmcaddr6 *pmc);
 static void sf_markstate(struct ifmcaddr6 *pmc);
 static void ip6_mc_clear_src(struct ifmcaddr6 *pmc);
@@ -117,11 +118,6 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml,
 #define MLD_V1_QUERY_LEN	24
 #define MLD_V2_QUERY_LEN_MIN	28
 
-#define MLD_V1_SEEN(idev) (dev_net((idev)->dev)->ipv6.devconf_all->force_mld_version == 1 || \
-		(idev)->cnf.force_mld_version == 1 || \
-		((idev)->mc_v1_seen && \
-		time_before(jiffies, (idev)->mc_v1_seen)))
-
 #define IPV6_MLD_MAX_MSF	64
 
 int sysctl_mld_max_msf __read_mostly = IPV6_MLD_MAX_MSF;
@@ -139,7 +135,7 @@ static int unsolicited_report_interval(struct inet6_dev *idev)
 {
 	int iv;
 
-	if (MLD_V1_SEEN(idev))
+	if (mld_in_v1_mode(idev))
 		iv = idev->cnf.mldv1_unsolicited_report_interval;
 	else
 		iv = idev->cnf.mldv2_unsolicited_report_interval;
@@ -695,7 +691,7 @@ static void igmp6_group_added(struct ifmcaddr6 *mc)
 	if (!(dev->flags & IFF_UP) || (mc->mca_flags & MAF_NOREPORT))
 		return;
 
-	if (MLD_V1_SEEN(mc->idev)) {
+	if (mld_in_v1_mode(mc->idev)) {
 		igmp6_join_group(mc);
 		return;
 	}
@@ -1116,6 +1112,18 @@ static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs,
 	return true;
 }
 
+static bool mld_in_v1_mode(const struct inet6_dev *idev)
+{
+	if (dev_net(idev->dev)->ipv6.devconf_all->force_mld_version == 1)
+		return true;
+	if (idev->cnf.force_mld_version == 1)
+		return true;
+	if (idev->mc_v1_seen && time_before(jiffies, idev->mc_v1_seen))
+		return true;
+
+	return false;
+}
+
 static void mld_set_v1_mode(struct inet6_dev *idev)
 {
 	/* RFC3810, relevant sections:
@@ -1262,7 +1270,7 @@ int igmp6_event_query(struct sk_buff *skb)
 				  sizeof(struct icmp6hdr);
 
 		/* hosts need to stay in MLDv1 mode, discard MLDv2 queries */
-		if (MLD_V1_SEEN(idev))
+		if (mld_in_v1_mode(idev))
 			return 0;
 		if (!pskb_may_pull(skb, srcs_offset))
 			return -EINVAL;
@@ -1942,7 +1950,7 @@ err_out:
 
 static void mld_resend_report(struct inet6_dev *idev)
 {
-	if (MLD_V1_SEEN(idev)) {
+	if (mld_in_v1_mode(idev)) {
 		struct ifmcaddr6 *mcaddr;
 		read_lock_bh(&idev->lock);
 		for (mcaddr = idev->mc_list; mcaddr; mcaddr = mcaddr->next) {
@@ -2006,7 +2014,7 @@ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode,
 		else
 			pmc->mca_sources = psf->sf_next;
 		if (psf->sf_oldin && !(pmc->mca_flags & MAF_NOREPORT) &&
-		    !MLD_V1_SEEN(idev)) {
+		    !mld_in_v1_mode(idev)) {
 			psf->sf_crcount = idev->mc_qrv;
 			psf->sf_next = pmc->mca_tomb;
 			pmc->mca_tomb = psf;
@@ -2306,7 +2314,7 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml,
 
 static void igmp6_leave_group(struct ifmcaddr6 *ma)
 {
-	if (MLD_V1_SEEN(ma->idev)) {
+	if (mld_in_v1_mode(ma->idev)) {
 		if (ma->mca_flags & MAF_LAST_REPORTER)
 			igmp6_send(&ma->mca_addr, ma->idev->dev,
 				ICMPV6_MGM_REDUCTION);
@@ -2340,7 +2348,7 @@ static void mld_ifc_timer_expire(unsigned long data)
 
 static void mld_ifc_event(struct inet6_dev *idev)
 {
-	if (MLD_V1_SEEN(idev))
+	if (mld_in_v1_mode(idev))
 		return;
 	idev->mc_ifc_count = idev->mc_qrv;
 	mld_ifc_start_timer(idev, 1);
@@ -2351,7 +2359,7 @@ static void igmp6_timer_handler(unsigned long data)
 {
 	struct ifmcaddr6 *ma = (struct ifmcaddr6 *) data;
 
-	if (MLD_V1_SEEN(ma->idev))
+	if (mld_in_v1_mode(ma->idev))
 		igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT);
 	else
 		mld_send_report(ma->idev, ma);
-- 
cgit v1.2.3


From e3f5b17047dec4acd8957dad053e70d87f18d97e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Wed, 4 Sep 2013 00:19:39 +0200
Subject: net: ipv6: mld: get rid of MLDV2_MRC and simplify calculation

Get rid of MLDV2_MRC and use our new macros for mantisse and
exponent to calculate Maximum Response Delay out of the Maximum
Response Code.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/mld.h         | 28 +++++++++++++++++++---------
 net/bridge/br_multicast.c |  3 ++-
 net/ipv6/mcast.c          | 18 ++----------------
 3 files changed, 23 insertions(+), 26 deletions(-)

(limited to 'net')

diff --git a/include/net/mld.h b/include/net/mld.h
index 2b5421f6a7c2..faa1d161bf24 100644
--- a/include/net/mld.h
+++ b/include/net/mld.h
@@ -63,15 +63,6 @@ struct mld2_query {
 #define mld2q_mrc		mld2q_hdr.icmp6_maxdelay
 #define mld2q_resv1		mld2q_hdr.icmp6_dataun.un_data16[1]
 
-/* Max Response Code, TODO: transform this to use the below */
-#define MLDV2_MASK(value, nb) ((nb)>=32 ? (value) : ((1<<(nb))-1) & (value))
-#define MLDV2_EXP(thresh, nbmant, nbexp, value) \
-	((value) < (thresh) ? (value) : \
-	((MLDV2_MASK(value, nbmant) | (1<<(nbmant))) << \
-	(MLDV2_MASK((value) >> (nbmant), nbexp) + (nbexp))))
-
-#define MLDV2_MRC(value) MLDV2_EXP(0x8000, 12, 3, value)
-
 /* RFC3810, 5.1.3. Maximum Response Code:
  *
  * If Maximum Response Code >= 32768, Maximum Response Code represents a
@@ -97,4 +88,23 @@ struct mld2_query {
 #define MLDV2_QQIC_EXP(value)	(((value) >> 4) & 0x07)
 #define MLDV2_QQIC_MAN(value)	((value) & 0x0f)
 
+static inline unsigned long mldv2_mrc(const struct mld2_query *mlh2)
+{
+	/* RFC3810, 5.1.3. Maximum Response Code */
+	unsigned long ret, mc_mrc = ntohs(mlh2->mld2q_mrc);
+
+	if (mc_mrc < 32768) {
+		ret = mc_mrc;
+	} else {
+		unsigned long mc_man, mc_exp;
+
+		mc_exp = MLDV2_MRC_EXP(mc_mrc);
+		mc_man = MLDV2_MRC_MAN(mc_mrc);
+
+		ret = (mc_man | 0x1000) << (mc_exp + 3);
+	}
+
+	return ret;
+}
+
 #endif
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 08e576ada0b2..4accd0de6e8e 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1203,7 +1203,8 @@ static int br_ip6_multicast_query(struct net_bridge *br,
 		mld2q = (struct mld2_query *)icmp6_hdr(skb);
 		if (!mld2q->mld2q_nsrcs)
 			group = &mld2q->mld2q_mca;
-		max_delay = mld2q->mld2q_mrc ? MLDV2_MRC(ntohs(mld2q->mld2q_mrc)) : 1;
+
+		max_delay = max(msecs_to_jiffies(mldv2_mrc(mld2q)), 1UL);
 	}
 
 	br_multicast_query_received(br, port, !ipv6_addr_any(&ip6h->saddr),
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index f86e26b7296c..005b22fd6bcb 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1195,20 +1195,7 @@ static void mld_update_qri(struct inet6_dev *idev,
 	 *  - 5.1.3. Maximum Response Code
 	 *  - 9.3. Query Response Interval
 	 */
-	unsigned long mc_qri, mc_mrc = ntohs(mlh2->mld2q_mrc);
-
-	if (mc_mrc < 32768) {
-		mc_qri = mc_mrc;
-	} else {
-		unsigned long mc_man, mc_exp;
-
-		mc_exp = MLDV2_MRC_EXP(mc_mrc);
-		mc_man = MLDV2_MRC_MAN(mc_mrc);
-
-		mc_qri = (mc_man | 0x1000) << (mc_exp + 3);
-	}
-
-	idev->mc_qri = msecs_to_jiffies(mc_qri);
+	idev->mc_qri = msecs_to_jiffies(mldv2_mrc(mlh2));
 }
 
 /* called with rcu_read_lock() */
@@ -1277,8 +1264,7 @@ int igmp6_event_query(struct sk_buff *skb)
 
 		mlh2 = (struct mld2_query *)skb_transport_header(skb);
 
-		max_delay = max(msecs_to_jiffies(MLDV2_MRC(ntohs(mlh2->mld2q_mrc))), 1UL);
-
+		max_delay = max(msecs_to_jiffies(mldv2_mrc(mlh2)), 1UL);
 		idev->mc_maxdelay = max_delay;
 
 		mld_update_qrv(idev, mlh2);
-- 
cgit v1.2.3


From 58c0ecfd8d9871cfa35bcdbf3e7b3ee9ca62ea67 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Wed, 4 Sep 2013 00:19:40 +0200
Subject: net: ipv6: mld: implement RFC3810 MLDv2 mode only

RFC3810, 10. Security Considerations says under subsection 10.1.
Query Message:

  A forged Version 1 Query message will put MLDv2 listeners on that
  link in MLDv1 Host Compatibility Mode. This scenario can be avoided
  by providing MLDv2 hosts with a configuration option to ignore
  Version 1 messages completely.

Hence, implement a MLDv2-only mode that will ignore MLDv1 traffic:

  echo 2 > /proc/sys/net/ipv6/conf/ethX/force_mld_version  or
  echo 2 > /proc/sys/net/ipv6/conf/all/force_mld_version

Note that <all> device has a higher precedence as it was previously
also the case in the macro MLD_V1_SEEN() that would "short-circuit"
if condition on <all> case.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/mcast.c | 34 ++++++++++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 005b22fd6bcb..c916568d838a 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1112,11 +1112,34 @@ static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs,
 	return true;
 }
 
+static int mld_force_mld_version(const struct inet6_dev *idev)
+{
+	/* Normally, both are 0 here. If enforcement to a particular is
+	 * being used, individual device enforcement will have a lower
+	 * precedence over 'all' device (.../conf/all/force_mld_version).
+	 */
+
+	if (dev_net(idev->dev)->ipv6.devconf_all->force_mld_version != 0)
+		return dev_net(idev->dev)->ipv6.devconf_all->force_mld_version;
+	else
+		return idev->cnf.force_mld_version;
+}
+
+static bool mld_in_v2_mode_only(const struct inet6_dev *idev)
+{
+	return mld_force_mld_version(idev) == 2;
+}
+
+static bool mld_in_v1_mode_only(const struct inet6_dev *idev)
+{
+	return mld_force_mld_version(idev) == 1;
+}
+
 static bool mld_in_v1_mode(const struct inet6_dev *idev)
 {
-	if (dev_net(idev->dev)->ipv6.devconf_all->force_mld_version == 1)
-		return true;
-	if (idev->cnf.force_mld_version == 1)
+	if (mld_in_v2_mode_only(idev))
+		return false;
+	if (mld_in_v1_mode_only(idev))
 		return true;
 	if (idev->mc_v1_seen && time_before(jiffies, idev->mc_v1_seen))
 		return true;
@@ -1223,7 +1246,6 @@ int igmp6_event_query(struct sk_buff *skb)
 		return -EINVAL;
 
 	idev = __in6_dev_get(skb->dev);
-
 	if (idev == NULL)
 		return 0;
 
@@ -1236,6 +1258,10 @@ int igmp6_event_query(struct sk_buff *skb)
 		return -EINVAL;
 
 	if (len == MLD_V1_QUERY_LEN) {
+		/* Ignore v1 queries */
+		if (mld_in_v2_mode_only(idev))
+			return 0;
+
 		/* MLDv1 router present */
 		max_delay = msecs_to_jiffies(ntohs(mld->mld_maxdelay));
 
-- 
cgit v1.2.3


From cc7f7ab758f66a9110c8e737c3de3e9f5fc209b5 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Wed, 4 Sep 2013 00:19:41 +0200
Subject: net: ipv6: mld: similarly to MLDv2 have min max_delay of 1

Similarly as we do in MLDv2 queries, set a forged MLDv1 query with
0 ms mld_maxdelay to minimum timer shot time of 1 jiffies. This is
eventually done in igmp6_group_queried() anyway, so we can simplify
a check there.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/mcast.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index c916568d838a..beb76b7989a7 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1040,12 +1040,9 @@ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime)
 		delay = ma->mca_timer.expires - jiffies;
 	}
 
-	if (delay >= resptime) {
-		if (resptime)
-			delay = net_random() % resptime;
-		else
-			delay = 1;
-	}
+	if (delay >= resptime)
+		delay = net_random() % resptime;
+
 	ma->mca_timer.expires = jiffies + delay;
 	if (!mod_timer(&ma->mca_timer, jiffies + delay))
 		atomic_inc(&ma->mca_refcnt);
@@ -1258,12 +1255,15 @@ int igmp6_event_query(struct sk_buff *skb)
 		return -EINVAL;
 
 	if (len == MLD_V1_QUERY_LEN) {
+		unsigned long mldv1_md;
+
 		/* Ignore v1 queries */
 		if (mld_in_v2_mode_only(idev))
 			return 0;
 
 		/* MLDv1 router present */
-		max_delay = msecs_to_jiffies(ntohs(mld->mld_maxdelay));
+		mldv1_md = ntohs(mld->mld_maxdelay);
+		max_delay = max(msecs_to_jiffies(mldv1_md), 1UL);
 
 		mld_set_v1_mode(idev);
 
-- 
cgit v1.2.3


From 2b7c121f82b47ba5efac1ef47355376d1f8d6980 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Wed, 4 Sep 2013 00:19:42 +0200
Subject: net: ipv6: mld: refactor query processing into v1/v2 functions

Make igmp6_event_query() a bit easier to read by refactoring code
parts into mld_process_v1() and mld_process_v2().

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/mcast.c | 89 +++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 56 insertions(+), 33 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index beb76b7989a7..04399cb71e14 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1218,6 +1218,55 @@ static void mld_update_qri(struct inet6_dev *idev,
 	idev->mc_qri = msecs_to_jiffies(mldv2_mrc(mlh2));
 }
 
+static int mld_process_v1(struct inet6_dev *idev, struct mld_msg *mld,
+			  unsigned long *max_delay)
+{
+	unsigned long mldv1_md;
+
+	/* Ignore v1 queries */
+	if (mld_in_v2_mode_only(idev))
+		return -EINVAL;
+
+	/* MLDv1 router present */
+	mldv1_md = ntohs(mld->mld_maxdelay);
+	*max_delay = max(msecs_to_jiffies(mldv1_md), 1UL);
+
+	mld_set_v1_mode(idev);
+
+	/* cancel MLDv2 report timer */
+	idev->mc_gq_running = 0;
+	if (del_timer(&idev->mc_gq_timer))
+		__in6_dev_put(idev);
+
+	/* cancel the interface change timer */
+	idev->mc_ifc_count = 0;
+	if (del_timer(&idev->mc_ifc_timer))
+		__in6_dev_put(idev);
+
+	/* clear deleted report items */
+	mld_clear_delrec(idev);
+
+	return 0;
+}
+
+static int mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld,
+			  unsigned long *max_delay)
+{
+	/* hosts need to stay in MLDv1 mode, discard MLDv2 queries */
+	if (mld_in_v1_mode(idev))
+		return -EINVAL;
+
+	*max_delay = max(msecs_to_jiffies(mldv2_mrc(mld)), 1UL);
+
+	mld_update_qrv(idev, mld);
+	mld_update_qi(idev, mld);
+	mld_update_qri(idev, mld);
+
+	idev->mc_maxdelay = *max_delay;
+
+	return 0;
+}
+
 /* called with rcu_read_lock() */
 int igmp6_event_query(struct sk_buff *skb)
 {
@@ -1229,7 +1278,7 @@ int igmp6_event_query(struct sk_buff *skb)
 	struct mld_msg *mld;
 	int group_type;
 	int mark = 0;
-	int len;
+	int len, err;
 
 	if (!pskb_may_pull(skb, sizeof(struct in6_addr)))
 		return -EINVAL;
@@ -1255,47 +1304,21 @@ int igmp6_event_query(struct sk_buff *skb)
 		return -EINVAL;
 
 	if (len == MLD_V1_QUERY_LEN) {
-		unsigned long mldv1_md;
-
-		/* Ignore v1 queries */
-		if (mld_in_v2_mode_only(idev))
-			return 0;
-
-		/* MLDv1 router present */
-		mldv1_md = ntohs(mld->mld_maxdelay);
-		max_delay = max(msecs_to_jiffies(mldv1_md), 1UL);
-
-		mld_set_v1_mode(idev);
-
-		/* cancel MLDv2 report timer */
-		idev->mc_gq_running = 0;
-		if (del_timer(&idev->mc_gq_timer))
-			__in6_dev_put(idev);
-
-		/* cancel the interface change timer */
-		idev->mc_ifc_count = 0;
-		if (del_timer(&idev->mc_ifc_timer))
-			__in6_dev_put(idev);
-		/* clear deleted report items */
-		mld_clear_delrec(idev);
+		err = mld_process_v1(idev, mld, &max_delay);
+		if (err < 0)
+			return err;
 	} else if (len >= MLD_V2_QUERY_LEN_MIN) {
 		int srcs_offset = sizeof(struct mld2_query) -
 				  sizeof(struct icmp6hdr);
 
-		/* hosts need to stay in MLDv1 mode, discard MLDv2 queries */
-		if (mld_in_v1_mode(idev))
-			return 0;
 		if (!pskb_may_pull(skb, srcs_offset))
 			return -EINVAL;
 
 		mlh2 = (struct mld2_query *)skb_transport_header(skb);
 
-		max_delay = max(msecs_to_jiffies(mldv2_mrc(mlh2)), 1UL);
-		idev->mc_maxdelay = max_delay;
-
-		mld_update_qrv(idev, mlh2);
-		mld_update_qi(idev, mlh2);
-		mld_update_qri(idev, mlh2);
+		err = mld_process_v2(idev, mlh2, &max_delay);
+		if (err < 0)
+			return err;
 
 		if (group_type == IPV6_ADDR_ANY) { /* general query */
 			if (mlh2->mld2q_nsrcs)
-- 
cgit v1.2.3


From b4af8def5c083e5424c66051fe400444a01c4644 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Wed, 4 Sep 2013 00:19:43 +0200
Subject: net: ipv6: mld: introduce mld_{gq, ifc, dad}_stop_timer functions

We already have mld_{gq,ifc,dad}_start_timer() functions, so introduce
mld_{gq,ifc,dad}_stop_timer() functions to reduce code size and make it
more readable.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/mcast.c | 41 +++++++++++++++++++++++++----------------
 1 file changed, 25 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 04399cb71e14..096cd67b737c 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1006,6 +1006,13 @@ static void mld_gq_start_timer(struct inet6_dev *idev)
 		in6_dev_hold(idev);
 }
 
+static void mld_gq_stop_timer(struct inet6_dev *idev)
+{
+	idev->mc_gq_running = 0;
+	if (del_timer(&idev->mc_gq_timer))
+		__in6_dev_put(idev);
+}
+
 static void mld_ifc_start_timer(struct inet6_dev *idev, unsigned long delay)
 {
 	unsigned long tv = net_random() % delay;
@@ -1014,6 +1021,13 @@ static void mld_ifc_start_timer(struct inet6_dev *idev, unsigned long delay)
 		in6_dev_hold(idev);
 }
 
+static void mld_ifc_stop_timer(struct inet6_dev *idev)
+{
+	idev->mc_ifc_count = 0;
+	if (del_timer(&idev->mc_ifc_timer))
+		__in6_dev_put(idev);
+}
+
 static void mld_dad_start_timer(struct inet6_dev *idev, unsigned long delay)
 {
 	unsigned long tv = net_random() % delay;
@@ -1022,6 +1036,12 @@ static void mld_dad_start_timer(struct inet6_dev *idev, unsigned long delay)
 		in6_dev_hold(idev);
 }
 
+static void mld_dad_stop_timer(struct inet6_dev *idev)
+{
+	if (del_timer(&idev->mc_dad_timer))
+		__in6_dev_put(idev);
+}
+
 /*
  *	IGMP handling (alias multicast ICMPv6 messages)
  */
@@ -1234,15 +1254,9 @@ static int mld_process_v1(struct inet6_dev *idev, struct mld_msg *mld,
 	mld_set_v1_mode(idev);
 
 	/* cancel MLDv2 report timer */
-	idev->mc_gq_running = 0;
-	if (del_timer(&idev->mc_gq_timer))
-		__in6_dev_put(idev);
-
+	mld_gq_stop_timer(idev);
 	/* cancel the interface change timer */
-	idev->mc_ifc_count = 0;
-	if (del_timer(&idev->mc_ifc_timer))
-		__in6_dev_put(idev);
-
+	mld_ifc_stop_timer(idev);
 	/* clear deleted report items */
 	mld_clear_delrec(idev);
 
@@ -2434,14 +2448,9 @@ void ipv6_mc_down(struct inet6_dev *idev)
 	/* Withdraw multicast list */
 
 	read_lock_bh(&idev->lock);
-	idev->mc_ifc_count = 0;
-	if (del_timer(&idev->mc_ifc_timer))
-		__in6_dev_put(idev);
-	idev->mc_gq_running = 0;
-	if (del_timer(&idev->mc_gq_timer))
-		__in6_dev_put(idev);
-	if (del_timer(&idev->mc_dad_timer))
-		__in6_dev_put(idev);
+	mld_ifc_stop_timer(idev);
+	mld_gq_stop_timer(idev);
+	mld_dad_stop_timer(idev);
 
 	for (i = idev->mc_list; i; i=i->next)
 		igmp6_group_dropped(i);
-- 
cgit v1.2.3


From 8fad9c39f31f9ed7bf3526c43a4537b2fcf1a5d5 Mon Sep 17 00:00:00 2001
From: Linus Lüssing <linus.luessing@web.de>
Date: Wed, 4 Sep 2013 02:13:38 +0200
Subject: bridge: prevent flooding IPv6 packets that do not have a listener
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently if there is no listener for a certain group then IPv6 packets
for that group are flooded on all ports, even though there might be no
host and router interested in it on a port.

With this commit they are only forwarded to ports with a multicast
router.

Just like commit bd4265fe36 ("bridge: Only flood unregistered groups
to routers") did for IPv4, let's do the same for IPv6 with the same
reasoning.

Signed-off-by: Linus Lüssing <linus.luessing@web.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_multicast.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 4accd0de6e8e..5388955b2a3c 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1491,8 +1491,14 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
 	 *  - MLD has always Router Alert hop-by-hop option
 	 *  - But we do not support jumbrograms.
 	 */
-	if (ip6h->version != 6 ||
-	    ip6h->nexthdr != IPPROTO_HOPOPTS ||
+	if (ip6h->version != 6)
+		return 0;
+
+	/* Prevent flooding this packet if there is no listener present */
+	if (ipv6_is_transient_multicast(&ip6h->daddr))
+		BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
+
+	if (ip6h->nexthdr != IPPROTO_HOPOPTS ||
 	    ip6h->payload_len == 0)
 		return 0;
 
-- 
cgit v1.2.3


From 3c3769e63301fd92fcaf51870c371583dd0282ce Mon Sep 17 00:00:00 2001
From: Linus Lüssing <linus.luessing@web.de>
Date: Wed, 4 Sep 2013 02:13:39 +0200
Subject: bridge: apply multicast snooping to IPv6 link-local, too
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The multicast snooping code should have matured enough to be safely
applicable to IPv6 link-local multicast addresses (excluding the
link-local all nodes address, ff02::1), too.

Signed-off-by: Linus Lüssing <linus.luessing@web.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_mdb.c       |  3 ++-
 net/bridge/br_multicast.c |  7 ++++---
 net/bridge/br_private.h   | 10 ----------
 3 files changed, 6 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index e4d5cd43b7fb..de818d95c476 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -9,6 +9,7 @@
 #include <net/netlink.h>
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ipv6.h>
+#include <net/addrconf.h>
 #endif
 
 #include "br_private.h"
@@ -254,7 +255,7 @@ static bool is_valid_mdb_entry(struct br_mdb_entry *entry)
 			return false;
 #if IS_ENABLED(CONFIG_IPV6)
 	} else if (entry->addr.proto == htons(ETH_P_IPV6)) {
-		if (!ipv6_is_transient_multicast(&entry->addr.u.ip6))
+		if (ipv6_addr_is_ll_all_nodes(&entry->addr.u.ip6))
 			return false;
 #endif
 	} else
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 5388955b2a3c..23531471f16a 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -29,6 +29,7 @@
 #include <net/ipv6.h>
 #include <net/mld.h>
 #include <net/ip6_checksum.h>
+#include <net/addrconf.h>
 #endif
 
 #include "br_private.h"
@@ -723,7 +724,7 @@ static int br_ip6_multicast_add_group(struct net_bridge *br,
 {
 	struct br_ip br_group;
 
-	if (!ipv6_is_transient_multicast(group))
+	if (ipv6_addr_is_ll_all_nodes(group))
 		return 0;
 
 	br_group.u.ip6 = *group;
@@ -1354,7 +1355,7 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br,
 {
 	struct br_ip br_group;
 
-	if (!ipv6_is_transient_multicast(group))
+	if (ipv6_addr_is_ll_all_nodes(group))
 		return;
 
 	br_group.u.ip6 = *group;
@@ -1495,7 +1496,7 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
 		return 0;
 
 	/* Prevent flooding this packet if there is no listener present */
-	if (ipv6_is_transient_multicast(&ip6h->daddr))
+	if (!ipv6_addr_is_ll_all_nodes(&ip6h->daddr))
 		BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
 
 	if (ip6h->nexthdr != IPPROTO_HOPOPTS ||
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index d41283c57952..89b2949be02f 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -476,16 +476,6 @@ extern void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
 #define mlock_dereference(X, br) \
 	rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock))
 
-#if IS_ENABLED(CONFIG_IPV6)
-#include <net/addrconf.h>
-static inline int ipv6_is_transient_multicast(const struct in6_addr *addr)
-{
-	if (ipv6_addr_is_multicast(addr) && IPV6_ADDR_MC_FLAG_TRANSIENT(addr))
-		return 1;
-	return 0;
-}
-#endif
-
 static inline bool br_multicast_is_router(struct net_bridge *br)
 {
 	return br->multicast_router == 2 ||
-- 
cgit v1.2.3


From b55b76b22144ab97cefcb3862bab61f088adf411 Mon Sep 17 00:00:00 2001
From: Duan Jiong <duanj.fnst@cn.fujitsu.com>
Date: Wed, 4 Sep 2013 19:44:21 +0800
Subject: ipv6:introduce function to find route for redirect

RFC 4861 says that the IP source address of the Redirect is the
same as the current first-hop router for the specified ICMP
Destination Address, so the gateway should be taken into
consideration when we find the route for redirect.

There was once a check in commit
a6279458c534d01ccc39498aba61c93083ee0372 ("NDISC: Search over
all possible rules on receipt of redirect.") and the check
went away in commit b94f1c0904da9b8bf031667afc48080ba7c3e8c9
("ipv6: Use icmpv6_notify() to propagate redirect, instead of
rt6_redirect()").

The bug is only "exploitable" on layer-2 because the source
address of the redirect is checked to be a valid link-local
address but it makes spoofing a lot easier in the same L2
domain nonetheless.

Thanks very much for Hannes's help.

Signed-off-by: Duan Jiong <duanj.fnst@cn.fujitsu.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ah6.c     |  2 +-
 net/ipv6/esp6.c    |  2 +-
 net/ipv6/icmp.c    |  2 +-
 net/ipv6/ipcomp6.c |  2 +-
 net/ipv6/ndisc.c   |  3 +-
 net/ipv6/route.c   | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++----
 6 files changed, 81 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index bb02e176cb70..73784c3d4642 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -628,7 +628,7 @@ static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		return;
 
 	if (type == NDISC_REDIRECT)
-		ip6_redirect(skb, net, 0, 0);
+		ip6_redirect(skb, net, skb->dev->ifindex, 0);
 	else
 		ip6_update_pmtu(skb, net, info, 0, 0);
 	xfrm_state_put(x);
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index aeac0dc3635d..d3618a78fcac 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -447,7 +447,7 @@ static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		return;
 
 	if (type == NDISC_REDIRECT)
-		ip6_redirect(skb, net, 0, 0);
+		ip6_redirect(skb, net, skb->dev->ifindex, 0);
 	else
 		ip6_update_pmtu(skb, net, info, 0, 0);
 	xfrm_state_put(x);
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 7cfc8d284870..73681c227453 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -92,7 +92,7 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	if (type == ICMPV6_PKT_TOOBIG)
 		ip6_update_pmtu(skb, net, info, 0, 0);
 	else if (type == NDISC_REDIRECT)
-		ip6_redirect(skb, net, 0, 0);
+		ip6_redirect(skb, net, skb->dev->ifindex, 0);
 
 	if (!(type & ICMPV6_INFOMSG_MASK))
 		if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST)
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 7af5aee75d98..5636a912074a 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -76,7 +76,7 @@ static void ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		return;
 
 	if (type == NDISC_REDIRECT)
-		ip6_redirect(skb, net, 0, 0);
+		ip6_redirect(skb, net, skb->dev->ifindex, 0);
 	else
 		ip6_update_pmtu(skb, net, info, 0, 0);
 	xfrm_state_put(x);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 22210650596f..c4bc7a35cd56 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1367,7 +1367,8 @@ static void ndisc_redirect_rcv(struct sk_buff *skb)
 		return;
 
 	if (!ndopts.nd_opts_rh) {
-		ip6_redirect_no_header(skb, dev_net(skb->dev), 0, 0);
+		ip6_redirect_no_header(skb, dev_net(skb->dev),
+					skb->dev->ifindex, 0);
 		return;
 	}
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index b770085ae36d..c979dd96d82a 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1156,6 +1156,77 @@ void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
 }
 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
 
+/* Handle redirects */
+struct ip6rd_flowi {
+	struct flowi6 fl6;
+	struct in6_addr gateway;
+};
+
+static struct rt6_info *__ip6_route_redirect(struct net *net,
+					     struct fib6_table *table,
+					     struct flowi6 *fl6,
+					     int flags)
+{
+	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
+	struct rt6_info *rt;
+	struct fib6_node *fn;
+
+	/* Get the "current" route for this destination and
+	 * check if the redirect has come from approriate router.
+	 *
+	 * RFC 4861 specifies that redirects should only be
+	 * accepted if they come from the nexthop to the target.
+	 * Due to the way the routes are chosen, this notion
+	 * is a bit fuzzy and one might need to check all possible
+	 * routes.
+	 */
+
+	read_lock_bh(&table->tb6_lock);
+	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+restart:
+	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
+		if (rt6_check_expired(rt))
+			continue;
+		if (rt->dst.error)
+			break;
+		if (!(rt->rt6i_flags & RTF_GATEWAY))
+			continue;
+		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
+			continue;
+		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
+			continue;
+		break;
+	}
+
+	if (!rt)
+		rt = net->ipv6.ip6_null_entry;
+	else if (rt->dst.error) {
+		rt = net->ipv6.ip6_null_entry;
+		goto out;
+	}
+	BACKTRACK(net, &fl6->saddr);
+out:
+	dst_hold(&rt->dst);
+
+	read_unlock_bh(&table->tb6_lock);
+
+	return rt;
+};
+
+static struct dst_entry *ip6_route_redirect(struct net *net,
+					const struct flowi6 *fl6,
+					const struct in6_addr *gateway)
+{
+	int flags = RT6_LOOKUP_F_HAS_SADDR;
+	struct ip6rd_flowi rdfl;
+
+	rdfl.fl6 = *fl6;
+	rdfl.gateway = *gateway;
+
+	return fib6_rule_lookup(net, &rdfl.fl6,
+				flags, __ip6_route_redirect);
+}
+
 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
 {
 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
@@ -1170,9 +1241,8 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
 	fl6.saddr = iph->saddr;
 	fl6.flowlabel = ip6_flowinfo(iph);
 
-	dst = ip6_route_output(net, NULL, &fl6);
-	if (!dst->error)
-		rt6_do_redirect(dst, NULL, skb);
+	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
+	rt6_do_redirect(dst, NULL, skb);
 	dst_release(dst);
 }
 EXPORT_SYMBOL_GPL(ip6_redirect);
@@ -1192,9 +1262,8 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
 	fl6.daddr = msg->dest;
 	fl6.saddr = iph->daddr;
 
-	dst = ip6_route_output(net, NULL, &fl6);
-	if (!dst->error)
-		rt6_do_redirect(dst, NULL, skb);
+	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
+	rt6_do_redirect(dst, NULL, skb);
 	dst_release(dst);
 }
 
-- 
cgit v1.2.3


From 0c1db731bfcf3a9fd6c58132134f8b0f423552f0 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Thu, 5 Sep 2013 00:11:19 -0400
Subject: caif: Add missing braces to multiline if in cfctrl_linkup_request

The indentation here implies this was meant to be a multi-line if.

Introduced several years back in commit c85c2951d4da1236e32f1858db418221e624aba5
("caif: Handle dev_queue_xmit errors.")

Signed-off-by: Dave Jones <davej@fedoraproject.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/caif/cfctrl.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c
index 2bd4b58f4372..0f455227da83 100644
--- a/net/caif/cfctrl.c
+++ b/net/caif/cfctrl.c
@@ -293,9 +293,10 @@ int cfctrl_linkup_request(struct cflayer *layer,
 
 		count = cfctrl_cancel_req(&cfctrl->serv.layer,
 						user_layer);
-		if (count != 1)
+		if (count != 1) {
 			pr_err("Could not remove request (%d)", count);
 			return -ENODEV;
+		}
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From e2e5c4c07caf810d7849658dca42f598b3938e21 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Thu, 5 Sep 2013 13:43:34 -0400
Subject: tcp: Add missing braces to do_tcp_setsockopt

Signed-off-by: Dave Jones <davej@fedoraproject.org>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b2f6c74861af..95544e4028c0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2454,10 +2454,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 	case TCP_THIN_DUPACK:
 		if (val < 0 || val > 1)
 			err = -EINVAL;
-		else
+		else {
 			tp->thin_dupack = val;
 			if (tp->thin_dupack)
 				tcp_disable_early_retrans(tp);
+		}
 		break;
 
 	case TCP_REPAIR:
-- 
cgit v1.2.3


From 1a5bbfc3d6b700178b75743a2ba1fd2e58a8f36f Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 5 Sep 2013 14:38:03 -0400
Subject: netfilter: Fix build errors with xt_socket.c

As reported by Randy Dunlap:

====================
when CONFIG_IPV6=m
and CONFIG_NETFILTER_XT_MATCH_SOCKET=y:

net/built-in.o: In function `socket_mt6_v1_v2':
xt_socket.c:(.text+0x51b55): undefined reference to `udp6_lib_lookup'
net/built-in.o: In function `socket_mt_init':
xt_socket.c:(.init.text+0x1ef8): undefined reference to `nf_defrag_ipv6_enable'
====================

Like several other modules under net/netfilter/ we have to
have a dependency "IPV6 disabled or set compatibly with this
module" clause.

Reported-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 62a171ab204f..6e839b6dff2b 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -1175,6 +1175,7 @@ config NETFILTER_XT_MATCH_SOCKET
 	depends on NETFILTER_XTABLES
 	depends on NETFILTER_ADVANCED
 	depends on !NF_CONNTRACK || NF_CONNTRACK
+	depends on (IPV6 || IPV6=n)
 	select NF_DEFRAG_IPV4
 	select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES
 	help
-- 
cgit v1.2.3


From 0d40f75bdab241868c0eb6f97aef9f8b3a66f7b3 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@nicira.com>
Date: Thu, 5 Sep 2013 12:17:05 -0700
Subject: openvswitch: Fix alignment of struct sw_flow_key.

sw_flow_key alignment was declared as " __aligned(__alignof__(long))".
However, this breaks on the m68k architecture where long is 32 bit in
size but 16 bit aligned by default. This aligns to the size of a long to
ensure that we can always do comparsions in full long-sized chunks. It
also adds an additional build check to catch any reduction in alignment.

CC: Andy Zhou <azhou@nicira.com>
Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/flow.c | 1 +
 net/openvswitch/flow.h | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index ad1aeeb4f373..fb36f8565161 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -1981,6 +1981,7 @@ nla_put_failure:
  * Returns zero if successful or a negative error code. */
 int ovs_flow_init(void)
 {
+	BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
 	BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
 
 	flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0,
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index b65f885ac3dc..212fbf7510c4 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -125,7 +125,7 @@ struct sw_flow_key {
 			} nd;
 		} ipv6;
 	};
-} __aligned(__alignof__(long));
+} __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */
 
 struct sw_flow {
 	struct rcu_head rcu;
-- 
cgit v1.2.3